In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from tqdm import tqdm 

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [4]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

# Read Prediction

## Setting up Data

In [199]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [200]:
allRatings[0]

('u67805239', 'b61372131', 4)

In [201]:
len(allRatings)

200000

In [202]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]

In [203]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set)
ratingDict = {}

for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    usersPerItem[b].add(u)
    itemsPerUser[u].add(b)
    ratingDict[(u,b)] = r

In [204]:
trainRatings = [r[2] for r in ratingsTrain]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)
globalAverage

3.6868052631578947

In [206]:
itemAverages = {}
userAverages = {}

for i in ratingsPerItem:
    rs = [r[1] for r in ratingsPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

for u in ratingsPerUser: 
    rs = [r[1] for r in ratingsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)

In [207]:
# Copied from baseline code
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (3 * totalRead)/4: break

In [208]:
# add neg entry for every user to validation set
userBooks = defaultdict(list)
userNoBooks = defaultdict(list)

for u,b,r in allRatings:  
    userBooks[u].append(b)

for u in userBooks: 
    for b in bookCount: 
        if b not in userBooks[u]: 
            userNoBooks[u].append(b)

In [209]:
# add neg entry for every user to validation set
ratingsToAdd = []
for u,b,_ in ratingsValid: 
    
    rand = random.randrange(len(userNoBooks[u]))
    book = userNoBooks[u][rand]
    userNoBooks[u].remove(book)
    
    ratingsToAdd.append((u,book,-1))
    userBooks[u].append(book)

In [210]:
ratingsValid.extend(ratingsToAdd)

## Finding good population and similarity thresholds w/ Jaccard

In [119]:
def Jaccard(s1, s2):
    numerator = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    return numerator / denom

In [57]:
## finding a good similarity threshold 
def sim_model1(threshold): 
    # stronger baseline (simmilarity based)
    predictions = []
    Y = []
    # get user,book combo 
    for u,b,r in ratingsValid: 
        # all books user has read from training 
        jaccardList = []
        if u in ratingsPerUser: 
            for pair in ratingsPerUser[u]:
                b2 = pair[0]
                if b == b2: continue 

                # get users who read the book
                readB = set(tup[0] for tup in ratingsPerItem[b])
                readB2 = set(tup[0] for tup in ratingsPerItem[b2])


                # compute similarity between the two books 
                sim = Jaccard(readB,readB2)
                jaccardList.append(sim)

            # make prediction 
            jaccardList.sort(reverse=True)
            #avg = sum(jaccardList) / len(jaccardList)
            #print(sum(jaccardList) / len(jaccardList))
            if jaccardList[0] > threshold: 
                predictions.append(1)
            else: 
                predictions.append(0)

            # populate Y (true read value)
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)
        else: 
            predictions.append(0)
            
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)
            
        
     # calculating accuracy 
    acc1 = [x == z for (x,z) in zip(Y,predictions)]
    acc1 = sum(acc1) / len(acc1)
    return acc1

In [58]:
possible_thresholds = np.arange(0.0,0.05,0.05)

acc3 = -1
threshold = -1 
for t in tqdm(possible_thresholds): 
    acc = sim_model1(t)
    
    if acc > acc3: 
        acc3 = acc
        threshold = t

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.71s/it]


In [59]:
print(acc3, threshold) # min similarity = 0.52915, t = 0.0
                        # avg similarity = 0.67675, t = 0.0
                        # max similarity = 0.67675, t = 0.0

0.6851 0.0


In [None]:
## find a good popularity threshold 

In [34]:
def pop_model1(threshold): 
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count / totalRead > threshold: break
        # Before: count > totalRead / 2
    
    # running prediction 
    predictions = []
    Y = []
    for u,b,r in ratingsValid: 
        if r == -1: 
            Y.append(0)
        else:
            Y.append(1)

        if b in return1: 
            predictions.append(1)
        else: 
            predictions.append(0)
    
    # calculating accuracy 
    acc1 = [x == z for (x,z) in zip(Y,predictions)]
    acc1 = sum(acc1) / len(acc1)
    return acc1
    

In [35]:
possible_thresholds = np.arange(0.0,1.0,0.05)

acc2 = -1
threshold = -1 
for t in possible_thresholds: 
    acc = pop_model1(t)
    
    if acc > acc2: 
        acc2 = acc
        threshold = t

In [36]:
print(acc2,threshold) ## accuracy = 0.75375, t = 0.75 

0.75375 0.75


In [30]:
## try running both pop and sim 
def run_model1(popT, simT): 
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count / totalRead > popT: break
        # Before: count > totalRead / 2
        
    # stronger baseline (simmilarity based)
    predictions = []
    Y = []
    # get user,book combo 
    for u,b,r in ratingsValid: 
        # all books user has read from training 
        jaccardList = []
        #alreadyRead = False
        if u in ratingsPerUser: 
            for pair in ratingsPerUser[u]:
                b2 = pair[0]
                if b == b2: 
                    #alreadyRead = True
                    #break
                    continue

                # get users who read the book
                readB = set(tup[0] for tup in ratingsPerItem[b])
                readB2 = set(tup[0] for tup in ratingsPerItem[b2])

                # compute similarity between the two books 
                sim = Jaccard(readB,readB2)
                jaccardList.append(sim)
     
            jaccardList.sort(reverse=True)
            avg = sum(jaccardList) / len(jaccardList)
                #print(sum(jaccardList) / len(jaccardList))
            if avg > simT and b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)

            # populate Y (true read value)
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)
        else: 
            if b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)
                
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)    
        
     # calculating accuracy 
    acc1 = [x == z for (x,z) in zip(Y,predictions)]
    acc1 = sum(acc1) / len(acc1)
    return acc1

In [61]:
possible_thresholds = np.arange(0.0,0.05,0.05)
acc4 = -1
threshold4 = -1 
for t in tqdm(possible_thresholds): 
    acc = run_model1(0.75, t)

    if acc > acc4: 
        acc4 = acc
        threshold4 = t
            
print(acc4, threshold4)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.40s/it]

0.7443 0.0





In [62]:
print(acc4, threshold4) # min similarity = 0.52905, t = 0.0
                        # avg similarity = 0.7444, t = 0.0
                        # max similarity = 0.7444, t = 0.0

0.7443 0.0


## Trying same pop + sim model, with Jaccard over users rather than items

In [31]:
# trying Jaccard based off items
## try running both pop and sim 
def run_model2(popT, simT): 
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count / totalRead > popT: break
        # Before: count > totalRead / 2
        
    # stronger baseline (simmilarity based)
    predictions = []
    Y = []
    # get user,book combo 
    for u,b,r in ratingsValid: 
        # all books user has read from training 
        jaccardList = []
        #alreadyRead = False
        if i in ratingsPerItem: 
            for pair in ratingsPerItem[i]:
                u2 = pair[0]
                if u == u2: 
                    #alreadyRead = True
                    #break
                    continue

                # get users who read the book
                readU = set(tup[0] for tup in ratingsPerUser[u])
                readU2 = set(tup[0] for tup in ratingsPerUser[u2])

                # compute similarity between the two books 
                sim = Jaccard(readU,readU2)
                jaccardList.append(sim)
       
            jaccardList.sort(reverse=True)
            avg = sum(jaccardList) / len(jaccardList)
                #print(sum(jaccardList) / len(jaccardList))
            if avg > simT and b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)

            # populate Y (true read value)
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)
        else: 
            if b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)
                
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)    
        
     # calculating accuracy 
    acc1 = [x == z for (x,z) in zip(Y,predictions)]
    acc1 = sum(acc1) / len(acc1)
    return acc1

In [32]:
possible_thresholds = np.arange(0.0,1.0,0.05)
acc6 = -1
threshold6 = -1 
for t in tqdm(possible_thresholds): 
    acc = run_model2(0.75, t)

    if acc > acc6: 
        acc6 = acc
        threshold6 = t

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:33<00:00,  1.65s/it]


In [33]:
print(acc6, threshold6) # min similarity = 0.52905, t = 0.0
                        # avg similarity = 0.6484, t = 0.0
                        # max similarity = 0.6484, t = 0.0

0.6484 0.0


## Running Predictions using above mentioned models

In [165]:
for u in ratingsPerUser:
    if len(ratingsPerUser[u]) == 0: 
        print(True)
        print(u)

In [175]:
## try running both pop and sim 
def run_prediction1(u,b,popT, simT): 
    # stronger baseline (simmilarity based) 
    # all books user has read from training 
    jaccardList = []
    #alreadyRead = False
    if u in ratingsPerUser: 
        for pair in ratingsPerUser[u]:
            b2 = pair[0]
            if b == b2: 
                #alreadyRead = True
                #break
                continue

            # get users who read the book
            readB = usersPerItem[b]
            readB2 = usersPerItem[b2]

            # compute similarity between the two books 
            sim = Jaccard(readB,readB2)
            jaccardList.append(sim)

            # make prediction 
            #if alreadyRead == True: 
                #predictions.append(1)
                #Y.append(1)
                #continue
                                   
        jaccardList.sort(reverse=True)
    
        #avg = sum(jaccardList) / len(jaccardList)
            #print(sum(jaccardList) / len(jaccardList))
        if jaccardList[0] > simT and b in return1: 
            return (jaccardList[0],1,u,b)
        else: 
            return (jaccardList[0],0,u,b)
    else: 
        if b in return1: 
            return (-1.0,1,u,b)
        else: 
            return (-1.0,0,u,b)

## avg and max performance the same 

In [23]:
## try running both pop and sim - Jaccard using users
def run_prediction2(u,b,popT, simT): 
    # stronger baseline (simmilarity based) 
    # all books user has read from training 
    jaccardList = []
    #alreadyRead = False
    if b in ratingsPerItem: 
        for pair in ratingsPerItem[b]:
            u2 = pair[0]
            if u == u2: 
                #alreadyRead = True
                #break
                continue

            # get books for each user 
            readU = usersPerItem[u]
            readU2 = usersPerItem[u2]

            # compute similarity between the two users
            sim = Jaccard(readU,readU2)
            jaccardList.append(sim)

            # make prediction 
            #if alreadyRead == True: 
                #predictions.append(1)
                #Y.append(1)
                #continue
                                   
        jaccardList.sort(reverse=True)
            #print(sum(jaccardList) / len(jaccardList))
        if jaccardList[0] > simT and b in return1: 
            return 1
        else: 
            return 0
    else: 
        if b in return1: 
            return 1
        else: 
            return 0


In [169]:
def run_prediction3(u,b,popT,simT): 
    # stronger baseline (simmilarity based) 
    # all books user has read from training 
    jaccardList = []
    #alreadyRead = False
    if u in ratingsPerUser: 
        for pair in ratingsPerUser[u]:
            b2 = pair[0]
            if b == b2: 
                #alreadyRead = True
                #break
                continue

            # get users who read the book
            readB = usersPerItem[b]
            readB2 = usersPerItem[b2]

            # compute similarity between the two books 
            sim = Jaccard(readB,readB2)
            jaccardList.append(sim)

            # make prediction 
            #if alreadyRead == True: 
                #predictions.append(1)
                #Y.append(1)
                #continue
                                   
        jaccardList.sort(reverse=True)
        print(u)
        #avg = sum(jaccardList) / len(jaccardList)
            #print(sum(jaccardList) / len(jaccardList))
        if jaccardList[0] > simT and b in return1: 
            return (jaccardList[0], 1, u,b) 
        else: 
            return (jaccardList[0], 0, u, b) 
    elif b in ratingsPerItem: 
        for pair in ratingsPerItem[b]:
            u2 = pair[0]
            if u == u2: 
                #alreadyRead = True
                #break
                continue

            # get books for each user 
            readU = itemsPerUser[u]
            readU2 = itemsPerUser[u2]

            # compute similarity between the two users
            sim = Jaccard(readU,readU2)
            jaccardList.append(sim)

            # make prediction 
            #if alreadyRead == True: 
                #predictions.append(1)
                #Y.append(1)
                #continue
                                   
        jaccardList.sort(reverse=True)
        
            #print(sum(jaccardList) / len(jaccardList))
        if jaccardList[0] > simT and b in return1: 
            return (jaccardList[0], 1, u,b) 
        else: 
            return (jaccardList[0], 0, u, b)
    else: 
        if b in return1: 
            return (jaccardList[0], 1, u,b)
        else: 
            return (jaccardList[0], 0, u, b)

### Attempting to leverage 50/50 ratio of data

In [172]:
#Y = []
predict1 = []
predict0 = []
for u,b,r in ratingsValid: 
    predict = run_prediction1(u,b,0.75,0.0)
    
    read = predict[1]
    
    if r == -1: 
        y = 0
    else: 
        y = 1
        
    res = (predict[0], predict[1], y)
    
    if read == 1: 
        predict1.append(res)
    else: 
        predict0.append(res)


predict1.sort(reverse=True)
predict0.sort(reverse=True) 

size = min(len(predict1), len(ratingsValid)/2)
predictions = predict1[:size]

if size < (len(ratingsValid) / 2):
    for r in predict0[0:]: 
        if size < (len(ratingsValid) / 2):
            size += 1 
            predictions.append((r[0], 1, r[2]))
        else: 
            predictions.append((r[0], 0, r[2]))
else: 
    for r in predict1[size:]:  
        predictions.append((r[0], 0, r[2]))
        
    for r in predict0[0:]: 
        predictions.append((r[0], 0, r[2]))
    
len(predictions)
count = 0
for p in predictions: 
    if p[1] == 1: 
        count += 1
count

ac = [y == z for (x,y,z) in predictions]
ac = sum(ac) / len(ac)
ac

u59070515
u05014036
u97467443
u89187024
u41880097
u43993833
u76634269
u58694848
u00634922
u45282198
u16245954
u06369864
u60891388
u97131649
u45707540
u52547627
u42941492
u03849754
u25115908
u07418091
u11722245
u08390130
u72899183
u73707745
u05471520
u72053275
u58760696
u68850164
u66245524
u81028015
u21127799
u99601091
u97273779
u40484633
u12774240
u40645893
u62456072
u34697182
u48810514
u41782494
u45265845
u21649959
u18560651
u26457338
u30077515
u11221297
u07018979
u63072182
u36151340
u14376490
u52376860
u53938779
u74126969
u10957225
u84394194
u37761260
u28345292
u34933245
u95892453
u56582166
u94861664
u12732384
u81747174
u24520689
u39315672
u37676096
u62850869
u25362272
u10044698
u14619939
u95584031
u04421730
u54068510
u83287692
u06363914
u41551166
u46829641
u75635802
u39379707
u26171985
u67899505
u30929591
u75144451
u23713024
u48667167
u52376860
u90521464
u03028767
u11701742
u64609686
u37868290
u58166327
u48368857
u61307887
u49798912
u53191000
u07966630
u19609478
u97703650
u56517868


IndexError: list index out of range

In [174]:
predictions = open("predictions_Read.csv", 'w')

predict1 = []
predict0 = []

length = 0
for l in open("pairs_Read.csv"):
    #print(l)
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    # (etc.)
    
    length += 1
    result = run_prediction1(u,b,0.75, 0.0)
    if result[1] == 1: 
        predict1.append(result)
    else: 
        predict0.append(result)

predict1.sort(reverse=True)
predict0.sort(reverse=True)

size = min(len(predict1), len(ratingsValid)/2)
predicts = predict1[:size]

if size < (len(ratingsValid) / 2):
    for r in predict0[0:]: 
        if size < (len(ratingsValid) / 2):
            size += 1 
            predicts.append((r[0], 1,r[2],r[3]))
        else: 
            predicts.append((r[0], 0,r[2],r[3]))
else: 
    for r in predict1[size:]:  
        predicts.append((r[0], 0,r[2],r[3]))
        
    for r in predict0[0:]: 
        predicts.append((r[0], 0,r[2],r[3]))
        
for p in predicts: 
    line = p[2] + "," + p[3] + "," + str(p[1]) + "\n"
    predictions.write(line)
predictions.close()


u37758667
u85626045
u64714864
u78647159
u43398119
u93156409
u85724496
u72905804
u61280144
u13480142
u17636373
u74364434
u43186229
u80022267
u15192312
u90311281
u42850585
u25080664
u36408241
u33360666
u90171128
u04133017
u01137563
u34446517
u00000539
u38043397
u12747816
u60899733
u34957828
u94306729
u77826754
u34462368
u96335901
u40217898
u74565740
u77768269
u96387203
u11612500
u31751774
u45600149
u50579932
u45534886
u58877217
u25425477
u81686813
u36634167
u55265086
u65751729
u94462479
u27514179
u60197618
u08139961
u13810664
u58900348
u66821791
u58770327
u29914565
u00371676
u00249079
u11609523
u41702260
u04644558
u15277345
u49589476
u74052290
u55339928
u45232796
u77647891
u98999041
u62980270
u67867575
u61831453
u77695392
u59864954
u19408896
u69473332
u89408914
u51578155
u04343293
u01880253
u58883116
u16773649
u94849255
u82870308
u05786215
u40818252
u31695007
u62874550
u33805215
u08458904
u62669458
u06064862
u34829814
u81632612
u20851233
u28625832
u78951421
u22411386
u80600374
u67561061


## Trying Pearson Correlation

In [82]:
# Trying pearson correlation 
def Pearson(i1, i2):
    # Between two items
    if i1 in itemAverages: 
        iBar1 = itemAverages[i1]
    else: 
        iBar1 = globalAverage
    
    if i2 in itemAverages: 
        iBar2 = itemAverages[i2]
    else: 
        iBar2 = globalAverage
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [91]:
def run_model3(popT, simT):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count / totalRead > popT: break
        # Before: count > totalRead / 2
        
    # stronger baseline (simmilarity based)
    predictions = []
    Y = []
    # get user,book combo 
    for u,b,r in ratingsValid: 
        # all books user has read from training 
        simList = []
        #alreadyRead = False
        if u in ratingsPerUser: 
            for pair in ratingsPerUser[u]:
                b2 = pair[0]
                if b == b2: 
                    #alreadyRead = True
                    #break
                    continue
    
               # get users who read the book
                readB = usersPerItem[b]
                readB2 = usersPerItem[b2]
                
                # compute similarity between the two books
                if b in itemAverages and b2 in itemAverages: 
                    sim = Pearson(b,b2)
                else: 
                    sim = Jaccard(readB, readB2)
               
                simList.append(sim)
      
            simList.sort(reverse=True)
            #avg = sum(simList) / len(simList)
               
            if simList[0] > simT and b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)

            # populate Y (true read value)
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)
        else: 
            if b in return1: 
                predictions.append(1)
            else: 
                predictions.append(0)
                
            if r == -1: 
                Y.append(0)
            else: 
                Y.append(1)    
        
     # calculating accuracy 
    acc1 = [x == z for (x,z) in zip(Y,predictions)]
    acc1 = sum(acc1) / len(acc1)
    return acc1

In [92]:
possible_thresholds = np.arange(0.0,1.0,0.05)
acc5 = -1
threshold5 = -1 
for t in tqdm(possible_thresholds): 
    acc = run_model3(0.75, t)

    if acc > acc5: 
        acc5 = acc
        threshold5 = t

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


In [93]:
print(acc5, threshold5) # min similarity = 0.50675, t = 0.0
                        # avg similarity = 0.6685, t = 0.0
                        # max similarity = 0.7254, t = 0.0

0.7254 0.0


## Trying feature vector + log regression

In [211]:
ratingsTrain[0]

('u67805239', 'b61372131', 4)

In [269]:
def feature(datum): 
    feat = [1]
    
    user = datum[0]
    item = datum[1]
    r = datum[2]
    
    # popularity of item 
    count = 0
    popT = 0.0
    for ic, i in mostPopular:
        count += ic
        if i == item: 
            popT = count / totalRead
            break 
    
    # MAX jaccard sim for item 
    maxIjaccard = 0.0
    if user in ratingsPerUser: 
        jaccardList = []
        for pair in ratingsPerUser[user]:
            b2 = pair[0]
            if item == b2: 
                #alreadyRead = True
                #break
                continue

            # get users who read the book
            readB = usersPerItem[item]
            readB2 = usersPerItem[b2]

            # compute similarity between the two books 
            sim = Jaccard(readB,readB2)
            jaccardList.append(sim)

        jaccardList.sort(reverse=True)
        if len(jaccardList) >= 1: 
            maxIjaccard = jaccardList[0]
        
    
    # Max jaccard sim for users 
    maxUjaccard = 0.0
    if item in ratingsPerItem:
        jaccardList = []
        for pair in ratingsPerItem[item]:
            u2 = pair[0]
            if user == u2: 
                #alreadyRead = True
                #break
                continue

            # get books for each user 
            readU = itemsPerUser[user]
            readU2 = itemsPerUser[u2]

            # compute similarity between the two users
            sim = Jaccard(readU,readU2)
            jaccardList.append(sim)
      
        jaccardList.sort(reverse=True)
        if len(jaccardList) >= 1: 
            maxUjaccard = jaccardList[0]
    
    #if popT != 0: 
        #popT = 1 / popT 
        
    return feat + [popT, maxIjaccard, maxUjaccard]
    

In [270]:
feature(ratingsTrain[0])

[1, 0.091275, 0.015355086372360844, 0.14814814814814814]

In [None]:
# train on validation set  

In [271]:
Xvalid = [feature(d) for d in tqdm(ratingsValid)]
Yvalid = [not(d[2] == -1) for d in ratingsValid]

100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [00:13<00:00, 1434.50it/s]


In [277]:
mod = linear_model.LogisticRegression(C=43.0, class_weight='balanced', verbose=True)
mod.fit(Xvalid,Yvalid)
predictions = mod.predict(Xvalid)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [278]:
predictions[:10]

array([False,  True, False,  True,  True,  True,  True, False,  True,
        True])

In [279]:
TP = sum([a == b and b == True for a,b in zip(predictions,Yvalid)])
TN = sum([a == b and b == False for a,b in zip(predictions,Yvalid)])
FP = sum([a != b and b == False for a,b in zip(predictions,Yvalid)])
FN = sum([a != b and b == True for a,b in zip(predictions,Yvalid)])
acc = [a == b for a,b in zip(predictions, Yvalid)]
acc = sum(acc) / len(acc)

BTP = TP / (TP + FN)
BTN = TN / (TN + FP)
BER = 1 - (BTP + BTN) / 2

print(TP,TN,FP,FN,BER, acc)

7058 8064 1936 2942 0.2439 0.7561


In [275]:
# looking for best c 
vals = np.arange(1.0, 50.0, 1.0)
maxAcc = 0.0 
t = 1.0 
for c in vals:
    mod = linear_model.LogisticRegression(C=c, class_weight='balanced')
    mod.fit(Xvalid,Yvalid)
    predictions = mod.predict(Xvalid)
    
    acc = [a == b for a,b in zip(predictions, Yvalid)]
    acc = sum(acc) / len(acc)
    
    if acc > maxAcc: 
        maxAcc = acc
        t = c

In [276]:
print(maxAcc, t)

0.7561 43.0


In [280]:
predictions = open("predictions_Read.csv", 'w')

for l in open("pairs_Read.csv"):
    #print(l)
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    # (etc.)
    
    length += 1
    feat = feature((u,b,-1000)) # rating doesnt matter 
    result = mod.predict([feat])
    
    pred = 0
    if result == True: 
        pred = 1

    line = u + "," + b + "," + str(pred) + "\n"
    predictions.write(line)

predictions.close()


###### IDEAS: 

1. use both, improve similarity 
    - rather than looking at the maximum similarity of a book, try avg similarity or avg similarity among 10% of total similarity entries or min 
2. try different similarity methods
3. try similarity based on Users, not items 
3. make feature vectors including 
    - popularity 
    - similarity 
    - length of word 

## Conclusions: 

1. Basic pop + sim using max(Jaccard) is very effective
2. not much difference between max(Jaccard) and avg(Jaccard) 
3. Jaccard sim over items > over users 
4. Feature vector has given best accuracy -> Winning Model

# Category Prediction 

In [5]:
import time
import nltk
from nltk.corpus import stopwords

In [6]:
nltk.download('stopwords')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [315]:
data = []
reviewsPerUser = defaultdict(list)

for d in readGz("train_Category.json.gz"):
    u = d['user_id']
    r = d['review_id']
    
    reviewsPerUser[u].append(d)

    data.append(d)

In [8]:
data[0]

{'user_id': 'u75242413',
 'review_id': 'r45843137',
 'rating': 4,
 'review_text': "a clever book with a deeply troubling premise and an intriguing protagonist. Thompson's clean, sparse prose style kept each page feeling light even as some rather heavy existential questions dropped upon them. I enjoyed it. \n and that cover design is boom-pow gorgeous.",
 'n_votes': 1,
 'genre': 'mystery_thriller_crime',
 'genreID': 3}

In [9]:
reviewTrain = data[:90000]
reviewValid = data[90000:]

In [10]:
reviewTrain[50000]

{'user_id': 'u88632301',
 'review_id': 'r75997435',
 'rating': 4,
 'review_text': 'Great book! Well written and very believeable. Enjoyed that it was set in a small town north of Albuquerque--brought back many memories. The language is realistic and not too over the top. Same theme but totally different setting than VA is used to.',
 'n_votes': 0,
 'genre': 'young_adult',
 'genreID': 4}

In [159]:
punctuation = set(string.punctuation)
punctuation.remove('!')
punctuation.remove('?')
punctuation

{'"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

In [360]:
# NEW CREATIVE VERSION 
stop = stopwords.words("english")
wordCount = defaultdict(int)

wordSetPerReview = defaultdict(set)
for d in reviewTrain: 
    u = d['user_id']
    r_id = d['review_id']
    
    for w in d['review_text'].split(): 
        r = ["".join([c for c in w.lower() if not c in punctuation])]
        
        # addressing ! and ? 
        if '!' in w: 
            r = r[0].split('!')
            r.append('!')
            
        if '?' in w: 
            r = r[0].split('?')
            r.append('?')
            
       
        for word in r:    
            if word in stop: 
                continue 
            wordSetPerReview[r_id].add(word)
            wordCount[word] += 1

In [290]:
# N-gram version 
stop = stopwords.words("english")
wordCount = defaultdict(int)
for d in reviewTrain: 
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws1 = []
    
    for w in ws: 
        
        if '!' in w: 
            w2 = w.split('!')
            w2[1] = '!'
        elif '?' in w: 
            w2 = w.split('?')
            w2[1] = '?' 
        else: 
            w2 = [w]
            
        for word in w2: 
            if word in stop: 
                continue 
            ws1.append(word)
               
    #ws1 = [w for w in ws if w not in stop]
    ws2 = [' '.join(x) for x in list(zip(ws1[:-1],ws1[1:]))]
    #ws3 = [' '.join(x) for x in list(zip(ws1[:-2],ws1[1:-1],ws1[2:]))]
    #ws4 = [' '.join(x) for x in list(zip(ws1[:-3],ws1[1:-2],ws1[2:-1],ws1[3:]))]
    #ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    for w in ws1 + ws2: #+ ws3: #+ ws4 + ws5:
        wordCount[w] += 1

SyntaxError: invalid syntax (2060060459.py, line 31)

In [286]:
wordCount

defaultdict(int,
            {'clever': 1310,
             'book': 130737,
             'deeply': 795,
             'troubling': 65,
             'premise': 1730,
             'intriguing': 1980,
             'protagonist': 2059,
             'thompsons': 34,
             'clean': 546,
             'sparse': 79,
             'prose': 1023,
             'style': 4300,
             'kept': 4669,
             'page': 4915,
             'feeling': 3479,
             'light': 2809,
             'even': 22453,
             'rather': 4448,
             'heavy': 862,
             'existential': 56,
             'questions': 2283,
             'dropped': 380,
             'upon': 1657,
             'enjoyed': 12882,
             'cover': 3115,
             'design': 225,
             'boompow': 1,
             'gorgeous': 790,
             'little': 19165,
             'much': 27500,
             'retconning': 7,
             'honest': 4520,
             'wolverines': 8,
             'past': 52

In [292]:
wordCount.pop('', None)
mostPopular = [(wordCount[w], w) for w in wordCount]
mostPopular.sort()
mostPopular.reverse()

In [293]:
len(wordCount)

267185

In [294]:
mostPopular[:10]

[(130737, 'book'),
 (62881, 'read'),
 (60204, 'story'),
 (56750, '!'),
 (55791, 'one'),
 (49981, 'like'),
 (42437, 'really'),
 (36526, 'characters'),
 (33829, 'series'),
 (33568, '?')]

In [184]:
# N gram feature 
def feature8N(datum): 
    feat = [0]*len(wordSet)
    
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws1 = []
    
    for w in ws: 
        if '!' in w: 
            w2 = w.split('!')
            w2[1] = '!'
        elif '?' in w: 
            w2 = w.split('?')
            w2[1] = '?' 
        else: 
            w2 = [w]
            
        for word in w2: 
            if word in stop: 
                continue 
            ws1.append(word)
            
    #ws1 = [w for w in ws if w not in stop]
    ws2 = [' '.join(x) for x in list(zip(ws1[:-1],ws1[1:]))]
    #ws3 = [' '.join(x) for x in list(zip(ws1[:-2],ws1[1:-1],ws1[2:]))]
    #ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
    #ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    
    for w in ws1 + ws2: #+ ws3: #+ ws4 + ws5:
        if w in wordSet:
            index = wordId[w]
            feat[index] += 1
            
    feat.append(1) #offset
    return feat
    

In [102]:
# old feature 
def feature8O(datum): 
    f = [0]*len(wordSet)
    
    for w in datum['review_text'].split(): 
        w = "".join([c for c in w.lower() if not c in punctuation])
        
        #if w in stop: 
            #continue 
        if w in wordSet: 
            index = wordId[w]
            f[index] += 1
    
    return f + [1]

In [269]:
def Jaccard(s1, s2):
    numerator = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    return numerator / denom

In [380]:
# creative feature 
def feature8C(datum): 
    f = [0]*len(wordSet)
    datumWordSet = set()
    
    for w in datum['review_text'].split(): 
        r = ["".join([c for c in w.lower() if not c in punctuation])]
        
        # addressing ! and ? 
        if '!' in w: 
            r = r[0].split('!')
            r.append('!')
        
        if '?' in w: 
            r = r[0].split('?')
            r.append('?')
            
        #if w in stop: 
            #continue 
        for word in r:  
            if word in wordSet: 
                index = wordId[word]
                f[index] += 1
                
                datumWordSet.add(word)
    
    # somehow leverage user history 
    avgRating = [0.0]*5
    u = datum['user_id']
    simPerGenre = [-1.0]*5 
    if u in reviewsPerUser: 
        # find avg rating user has given for each genre and check similarities of words in reviews, take max for each genre 
        reviews = reviewsPerUser[u]
        numReviews = [0]*5
        
        
        for rev in reviews: 
            if rev['review_id'] == datum['review_id']: 
                continue
            rating = rev['rating']
            ind = rev['genreID']
            
            # avg stuff 
            avgRating[ind] += rating 
            numReviews[ind] += 1
            
            # similarity stuff
            revWordSet = wordSetPerReview[rev['review_id']]
            
            if len(revWordSet) == 0 and len(datumWordSet) == 0: 
                sim = 0.0 
                #print(rev['review_id'], datum['review_id'])
            else: 
                sim = Jaccard(revWordSet, datumWordSet)
        
            simPerGenre[ind] = max(sim, simPerGenre[ind])

        # more avg stuff 
        for val in range(0,5): 
            if numReviews[val] != 0: 
                avgRating[val] = avgRating[val] / numReviews[val]
    
    
    return f + avgRating + simPerGenre + [1]

In [344]:
data[50]

{'user_id': 'u04382180',
 'review_id': 'r20880334',
 'rating': 3,
 'review_text': "Timmy Failure is quite the character, and just doesn't seem to catch what is right in front of him, which is not the best attribute considering he is a private detective. That's right, he runs a detective agency: Total Failure Inc. \n This book was cute, with complementary drawings throughout, a la Wimpy Kid. The book is imaginative and funny, which makes for a fun read. It was a good read, and I think it will appeal to young readers.",
 'n_votes': 0,
 'genre': 'children',
 'genreID': 0}

In [345]:
reviewsPerUser['u04382180']

[{'user_id': 'u04382180',
  'review_id': 'r20880334',
  'rating': 3,
  'review_text': "Timmy Failure is quite the character, and just doesn't seem to catch what is right in front of him, which is not the best attribute considering he is a private detective. That's right, he runs a detective agency: Total Failure Inc. \n This book was cute, with complementary drawings throughout, a la Wimpy Kid. The book is imaginative and funny, which makes for a fun read. It was a good read, and I think it will appeal to young readers.",
  'n_votes': 0,
  'genre': 'children',
  'genreID': 0},
 {'user_id': 'u04382180',
  'review_id': 'r44944766',
  'rating': 4,
  'review_text': 'Typical Demetri Martin humor. Most entertaining, some super funny, some lame. A good collection I thought.',
  'n_votes': 1,
  'genre': 'comics_graphic',
  'genreID': 1},
 {'user_id': 'u04382180',
  'review_id': 'r90097035',
  'rating': 4,
  'review_text': 'I really enjoyed this story. I liked the alternate perspectives and rea

In [348]:
feature8C(data[50])[-11:]

[0.0, 4.0, 0.0, 0.0, 4.0, -1.0, 0.047619047619047616, -1.0, -1.0, 0.0, 1]

In [264]:
data[0]

{'user_id': 'u75242413',
 'review_id': 'r45843137',
 'rating': 4,
 'review_text': "a clever book with a deeply troubling premise and an intriguing protagonist. Thompson's clean, sparse prose style kept each page feeling light even as some rather heavy existential questions dropped upon them. I enjoyed it. \n and that cover design is boom-pow gorgeous.",
 'n_votes': 1,
 'genre': 'mystery_thriller_crime',
 'genreID': 3}

In [370]:
size = np.arange(10000,20000,1000)

for val in [10000]: 
    start = time.perf_counter()
    
    words = [x[1] for x in mostPopular[:val]]
    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    X = [feature8C(x) for x in data]
    y = [x['genreID'] for x in data]

    Xtrain = X[:9*len(X)//10]
    ytrain = y[:9*len(y)//10]
    Xvalid = X[9*len(X)//10:]
    yvalid = y[9*len(y)//10:]
    
    mod = linear_model.LogisticRegression(C=1, verbose=True)
    mod.fit(Xtrain,ytrain)

    pred = mod.predict(Xvalid)
    correct = [(p == l) for (p,l) in zip(pred, yvalid)]
    acc7 = sum(correct) / len(correct)
    
    final = time.perf_counter() - start 
    
    acc7

r83256483 r71808850
r62951206 r53632646
r80290091 r53632646
r24696926 r53632646
r18817821 r53632646
r83256483 r68661570


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished


In [371]:
wordSetPerReview['r83256483']

set()

In [383]:
print(acc7, final / 60)

# dict size = 5000 
# no punctuation but keeping ! / ? = 0.7093 
# no punctuation, keeping ! / ?, no stopwords = 0.7189 
# no punctuation, = 0.7128 
# no punctuation, no stopwords = 0.7169

# n-grams 
    # no stopwords for 1 gram, up to 2 grams - 0.6852
    # up to 3 grams - 0.68 
    # no stopwords for 1 gram, up to 5 - 0.6779 
    
    # no stopwards for all grams, up to 2 grams - 0.7066
    # "                         ", up to 2, keeping ! / ? = 0.7049 

# dict size = 10000
    #-  n gram "                         ", up to 2, keeping ! / ? = 0.7163 (8 minutes to run)
    #-  n gram "                         ", up to 3, keeping ! / ? = 0.7127 (9 minutes to run)
    #-  no punctuation, keeping ! / ?, no stopwords = 0.7344 ()
    
# extra features 
    # no punctuation sol + avg rating of genre = 0.7345, 9.5 minutes 
    # no punc, avg rating, similarity = 0.7556 

0.7556 9.164235224999993


In [None]:
## Get predictions

In [381]:
test = []
reviewPerId = defaultdict(set)
for d in readGz("test_Category.json.gz"):
    test.append(d)
    revId = d['review_id']
    
    reviewPerId[revId] = d

In [382]:
predictions = open("predictions_Category.csv", 'w')
pos = 0

for l in open("pairs_Category.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    
    review = reviewPerId[b]
    res = mod.predict([feature8C(review)])
    
    line = u + "," + b + "," + str(res[0]) + "\n"
    predictions.write(line)
    # (etc.)
predictions.close()