In [67]:
import numpy as np
import pandas as pd
import math
from random import randint
import pymysql
pymysql.install_as_MySQLdb()
from datetime import datetime, timedelta

In [2]:
def create_db_instance():
    db = pymysql.connect(
        host="writter-db.chsokvgtm8us.us-east-2.rds.amazonaws.com",
        port=3306,
        user="root",
        passwd="12345678",
        db="writter"
    )
    
    return db
    
def close_db_instance(db):
    db.close()

In [76]:
def insert_recommendations(db, recommendation):
    cur = db.cursor()
    
    sql = "INSERT INTO `writter`.`recommendation` (`created`, `enabled`, `modified`,`rating`, `story_id`, `user_id`) VALUES (%s, %s, %s, %s, %s, %s)"
    vals = (datetime.now().isoformat(), 1, datetime.now().isoformat(), recommendation['rating'], recommendation['story_id'], recommendation['user_id'])
    cur.execute(sql, vals)
    
    db.commit()
    cur.close()

    return user

In [55]:
def get_users(db):
    cur = db.cursor()

    sql = 'SELECT id, email FROM '
    sql += 'writter.users'
    
    cur.execute(sql)
    
    users = []
    
    for row in cur.fetchall():
        user = {
            'id': row[0],
            'email': row[1],
        }
        users.append(user)
    cur.close()
    return users

In [3]:
def get_rating(db):
    cur = db.cursor()

    sql = 'SELECT rating, story_id, user_id FROM '
    sql += 'writter.ratings'
    
    cur.execute(sql)
    
    ratings = []
    
    for row in cur.fetchall():
        rating = {
            'user_id': row[2],
            'story_id': row[1],
            'rating': row[0]
        }
        ratings.append(rating)

    cur.close()

    return ratings

In [4]:
db = create_db_instance()

In [9]:
ratings = get_rating(db)

In [10]:
ratings_df = pd.DataFrame(ratings)
ratings_df.head()

Unnamed: 0,rating,story_id,user_id
0,4.0,42,40
1,4.0,42,46
2,2.0,42,30
3,5.0,42,12
4,1.0,42,23


### Euclidean Distance Score 
#### Uses Euclidean Distance Score to find similar users best on how close the users data points are

In [11]:
def sim_euc(prefs, p1, p2):
    shared = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            shared[item] = 1
    
    if len(shared) == 0: return 0
    
    sum_of_squares = ([pow(prefs[p1][item] - prefs[p2][item],2) for item in shared])
    
    return 1/(1+sqrt(sum_of_squares))

### Pearson Correlation Coefficient
#### Find the Pearson Correlation Coefficient to find the similarties between users

In [12]:
def sim_pearson(prefs, p1, p2):
    shared = {}
    for item in prefs[p1]:
        if item in prefs[p2]: shared[item] = 1
    
    n = len(shared)
    
    if n == 0: return 0
    
    sum1 = sum(prefs[p1][item] for item in shared)
    sum2 = sum(prefs[p2][item] for item in shared)
    sum1Sq = sum(pow(prefs[p1][item],2) for item in shared)
    sum2Sq = sum(pow(prefs[p2][item],2) for item in shared)
    
    pSum = sum([prefs[p1][item]*prefs[p2][item] for item in shared])
    
    num = pSum-((sum1*sum2)/n)
    den = math.sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    
    if den == 0: return 0

    score = num/den
    
    return score

#### Transforms a dictionary of users and the items they rated to a dictionary of items and users who rated them.

In [13]:
def transformDict(prefs):
    result = {}
    for p1 in prefs:
        for item in prefs[p1]:
            result.setdefault(item,{})
            
            result[item][p1] = prefs[p1][item]
    return result

#### Find the top matching users for each user based on their ratings.

In [14]:
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other) for other in prefs if other != person]

    scores.sort()
    scores.reverse()
    return scores[0:n]

### Collaborative Filter
#### Finds recommended items for each user besed on how they compare to other users who rated similar items

In [15]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    count = 0
    totals = {}
    simSums = {}
    for other in prefs:
        count += 1
        if count % 10000 == 0:
            print(count)
        if other == person: continue
        sim = similarity(prefs, person, other)
        
        if sim <= 0: continue
        for item in prefs[other]:
            
            if item not in prefs[person] or prefs[person][item] == 0:
                totals.setdefault(item,0)
                totals[item] += prefs[other][item]*sim
                
                simSums.setdefault(item,0)
                simSums[item] += sim
                
    rankings = [(total/simSums[item], item) for item, total in totals.items()]
    
    rankings.sort()
    rankings.reverse()
    return rankings

### Calculate Similar Items
#### Creates the item similarties data set for the item based filter.

In [16]:
def calculateSimilarItems(prefs, n=10):
    result = {}
    
    itemPrefs = transformDict(prefs)
    count = 0
    
    for item in itemPrefs:
        count += 1
        if count % 1000 == 0: print(count/len(itemPrefs))
        
        scores = topMatches(itemPrefs, item, n=n, similarity = sim_pearson)
        result[item] = scores
    
    return result

### Item Filter
#### Uses the item similarties data set to find similar items and recommends them to users based on their other items.

In [17]:
def getRecommendedItems(prefs, itemMatch, user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    
    for (item, rating) in userRatings.items():
        for (similarity, item2) in itemMatch[item]:
            
            if item2 in userRatings: continue
                
            scores.setdefault(item2,0)
            scores[item2] += similarity*rating
            
            totalSim.setdefault(item2,0)
            totalSim[item2] += similarity
            
    rankings = [(score/totalSim[item], item) for item, score in scores.items()]
    
    rankings.sort()
    rankings.reverse()
    return rankings

#### Train test split splits the pref dictions for testing

In [19]:
def train_test_split(sampleDict, split = 0.3):
    testDict = {}
    for key in sampleDict:
        splitSize = math.floor(len(sampleDict[key])*split)
        
        randomIndexes = []

        for num in range(0, splitSize):
            num = randint(0,len(sampleDict[key]))
            while num in randomIndexes:
                num = randint(0,len(sampleDict[key]))
            randomIndexes.append(randint(0,splitSize))

        testDict.setdefault(key,{})
        indexCount = 0
        delList = []
        for i in sampleDict[key]:
            if indexCount in randomIndexes:
                testDict[key][i] = sampleDict[key][i] 
                delList.append(i)
                
                randomIndexes.remove(indexCount)
            indexCount += 1
        for ind in delList:
            del sampleDict[key][ind]
    return sampleDict, testDict

#### get the rmse for one user

In [20]:
def recommend_accuracy(recommendation, test, user):
    sim = []
    
    for item in recommendation:
        if item[1] in test[user]:
            sim.append((item[1], item[0], test[user][item[1]]))
    #print(recommendation)
    #print(test[user])
    rmse = 0
    mse = 0
    for item in sim:
        mse += pow((item[1] - item[2]),2) 
        
    if(len(sim) > 0):
        mse = mse/len(sim)
        rmse = pow(mse, 0.5)
        return rmse
    else:
        return None

#### get the accuracies for all users in the dictionary

In [21]:
def get_all_accuracies(pref, test, itemMatrix):
    total_rmse = 0
    missedCount = 0
    rmseCount = 0
    for user in test:
        userRec = getRecommendedItems(pref, itemMatrix, user)
        acc = recommend_accuracy(userRec, testDict, 3)
        
        if acc is not None:
            rmseCount += 1
            total_rmse += acc
        else:
            missedCount += 1
            
    return total_rmse/rmseCount, missedCount/len(pref)

In [22]:
def ratingToDict(user, story, rating, dictionary):
    if user in dictionary:
        dictionary[user][story] = rating
    else:
        dictionary[user] = {}
        dictionary[user][story] = rating

In [25]:
ratings_df.shape[0]

5738

In [28]:
def convertRatingDF():
    ratingDict = {}
    sampleSize = ratings_df.shape[0]
    for index, row in ratings_df[ratings_df['rating'] != -1].iterrows():
        if index % 1000 == 0:
            print((index/sampleSize))
        if index >= sampleSize:
            break
        ratingToDict(row['user_id'], row['story_id'], row['rating'], ratingDict)
    return ratingDict

In [29]:
prefDict = convertRatingDF()

0.0
0.174276751481
0.348553502963
0.522830254444
0.697107005925
0.871383757407


In [31]:
userRankings = getRecommendations(prefDict, 42, similarity=sim_pearson)

In [33]:
userRecommendations = []
for i in range (0, 15):
    story_id = userRankings[i][1] 
    userRecommendations.append((userRankings[i][0], story_id))

In [34]:
for item in userRecommendations:
    print(item)

(5.0000000000000009, 225.0)
(5.0, 460.0)
(5.0, 412.0)
(5.0, 373.0)
(5.0, 314.0)
(5.0, 274.0)
(5.0, 269.0)
(5.0, 228.0)
(4.9999999999999991, 218.0)
(4.9167076050290497, 381.0)
(4.9057418680651868, 360.0)
(4.9034627868056022, 295.0)
(4.8767653015955075, 242.0)
(4.8180499650694024, 355.0)
(4.8143934004586013, 468.0)


#### Topmatches sample output for User 42

In [35]:
userMatches = topMatches(prefDict, 42, n=15, similarity=sim_pearson)

In [36]:
print(userMatches)

[(0.35019299698185757, 26.0), (0.29437524241930929, 47.0), (0.24867151568586818, 29.0), (0.20498001542269695, 6.0), (0.19834866242949786, 31.0), (0.18106392014901079, 15.0), (0.1725591925629156, 25.0), (0.13880244223291471, 4.0), (0.12354500527419783, 44.0), (0.087532660890563579, 1.0), (0.074873411205789997, 12.0), (0.07439835775211405, 10.0), (0.07341710238075197, 7.0), (0.052482746946024443, 17.0), (0.033741528043986238, 45.0)]


In [37]:
itemMatches = calculateSimilarItems(prefDict, 50)

#### Sample output for recommendation of User 5

In [51]:
user42 = getRecommendedItems(prefDict, itemMatches, 42)
userRecommendations = []
for i in range (0, 50):
    story_id = user42[i][1] 
    userRecommendations.append((user42[i][0], story_id))
    print(userRecommendations[i])

(5.0, 334.0)
(5.0, 332.0)
(5.0, 293.0)
(5.0, 220.0)
(5.0, 204.0)
(5.0, 184.0)
(5.0, 149.0)
(5.0, 144.0)
(5.0, 138.0)
(5.0, 110.0)
(5.0, 101.0)
(5.0, 97.0)
(5.0, 87.0)
(5.0, 58.0)
(4.666666666666667, 401.0)
(4.5, 464.0)
(4.5, 451.0)
(4.5, 373.0)
(4.3333333333333339, 445.0)
(4.3333333333333339, 444.0)
(4.25, 463.0)
(4.2000000000000002, 407.0)
(4.166666666666667, 493.0)
(4.0000000000000018, 330.0)
(4.0, 484.0)
(4.0, 482.0)
(4.0, 458.0)
(4.0, 443.0)
(4.0, 425.0)
(4.0, 409.0)
(4.0, 393.0)
(4.0, 355.0)
(4.0, 352.0)
(4.0, 287.0)
(4.0, 271.0)
(4.0, 242.0)
(4.0, 217.0)
(4.0, 216.0)
(4.0, 212.0)
(4.0, 200.0)
(4.0, 189.0)
(4.0, 162.0)
(4.0, 154.0)
(4.0, 145.0)
(4.0, 128.0)
(4.0, 113.0)
(4.0, 104.0)
(4.0, 98.0)
(4.0, 62.0)
(4.0, 60.0)


In [44]:
# prefDict, testDict = train_test_split(prefDict)

In [45]:
# acc, missed = get_all_accuracies(prefDict, testDict, itemMatches)

In [46]:
# print(acc)
# print(missed)

1.80858728699
0.08163265306122448


In [63]:
itemMatches = calculateSimilarItems(prefDict, 50)

In [56]:
all_users = get_users(db)

In [80]:
count = 0
for user in all_users:
    count += 1
    print(count/len(all_users))
    try:
        user_items = getRecommendedItems(prefDict, itemMatches, user['id'])
    except:
        continue
    try:
        for user_item in user_items[:25]:
            story_id = user_item[1] 

            recommendation = {
                'rating': float(user_item[0]),
                'story_id': int(story_id),
                'user_id': int(user['id'])
            }
            insert_recommendations(db, recommendation)
    except Exception as e:
        print(e)
        continue

0.019230769230769232
0.038461538461538464
0.057692307692307696
0.07692307692307693




0.09615384615384616
0.11538461538461539
0.1346153846153846
0.15384615384615385
0.17307692307692307
0.19230769230769232
0.21153846153846154
0.23076923076923078
0.25
0.2692307692307692
0.28846153846153844
0.3076923076923077
0.3269230769230769
0.34615384615384615
0.36538461538461536
0.38461538461538464
0.40384615384615385
0.4230769230769231
0.4423076923076923
0.46153846153846156
0.4807692307692308
0.5
0.5192307692307693
0.5384615384615384
0.5576923076923077
0.5769230769230769
0.5961538461538461
0.6153846153846154
0.6346153846153846
0.6538461538461539
0.6730769230769231
0.6923076923076923
0.7115384615384616
0.7307692307692307
0.75
0.7692307692307693
0.7884615384615384
0.8076923076923077
0.8269230769230769
0.8461538461538461
0.8653846153846154
0.8846153846153846
0.9038461538461539
0.9230769230769231
0.9423076923076923
0.9615384615384616
0.9807692307692307
1.0


In [None]:
close_db_instance(db)