In [4]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict, Counter
import scipy.optimize
import numpy
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
data = pd.read_csv('for_reviews_acc.csv').dropna()
data.head(5)

Unnamed: 0,user_id,item_id,playtime_forever,play_after_buy,review,fix_recommend,review_label
0,76561197970982479,22200,271,True,It's unique and worth a playthrough.,True,1
1,76561197970982479,1250,10006,True,Simple yet with great replayability. In my opi...,True,1
2,76561197970982479,43110,834,True,Great atmosphere. The gunplay can be a bit chu...,True,1
3,js41637,227300,551,True,For a simple (it's actually not all that simpl...,True,1
4,js41637,239030,349,True,Very fun little game to play when your bored o...,True,1


In [7]:
len(data)

44166

### Split Dataset

In [8]:
def splitData(X, y, p1, p2, balance=False): 
    X = np.array(X)
    y = np.array(y)

    if balance:
        total = len(y)
        n = round(len(y)*p1/2)

        idx = []
        while len(idx) < n:
            i = random.choice(range(total))
            if y[i] == 1:
                idx.append(i)
        X_test = X[idx]
        y_test = y[idx]
        X = np.delete(X, idx, 0)
        y = np.delete(y, idx, 0)
        
        idx = []
        total = len(y)
        while len(idx) < n:
            i = random.choice(range(total))
            if y[i] == 0:
                idx.append(i)
        X_test = np.concatenate((X_test,X[idx]), axis=0)
        y_test = np.concatenate((y_test,y[idx]), axis=0)
        X = np.delete(X, idx, 0)
        y = np.delete(y, idx, 0)

    else:
        sss = StratifiedShuffleSplit(n_splits=5, test_size=p1)
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
        X = X_train
        y = y_train

    sss = StratifiedShuffleSplit(n_splits=5, test_size=p2)
    
    for train_index, test_index in sss.split(X, y):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]

    print(len(y_train), len(y_valid), len(y_test))
    print(sum([len(y_train), len(y_valid), len(y_test)]))
    return X_train, y_train, X_valid, y_valid, X_test, y_test
    

In [9]:
labels = data['fix_recommend'].to_list()

In [10]:
data_train, y_train, data_valid, y_valid, data_test, y_test = splitData(data, labels, .1, .15, False)

33786 5963 4417
44166


In [11]:
data_train[0]

array(['Hosea1811', 203140, 2893, True, 'Very profesional', True, 1],
      dtype=object)

### Index:

0:  user_id	
1:  item_id	
2:  playtime_forever	
3:  play_after_buy	
4:  review	
5:  fix_recommend	
6:  review_label

In [12]:
def evaluate(y1,y2,detail=False):
    if not detail:
        return accuracy_score(y1,y2), f1_score(y1,y2)
    else:
        TN, FP, FN, TP = confusion_matrix(y1,y2).ravel()
        BER = (FP/(TN+FP) + FN/(FN+TP))/2
        PREC = TP/(TP+FP)
        RECALL = TP/(TP+FN)
        F1 = 2*(PREC*RECALL)/(PREC+RECALL)
        Acc = accuracy_score(y1,y2)
        return TN,FP,FN,TP,BER,PREC,RECALL,F1,Acc


### 1. Feature-based regression

In [13]:
review_emo_per_game = defaultdict(list)
review_emo_per_user = defaultdict(list)

for row in data_train:
    review_emo_per_user[row[0]].append(row[-1])
    review_emo_per_game[row[1]].append(row[-1])

In [14]:
review_avg_emo_per_game = defaultdict(int)
review_avg_emo_per_user = defaultdict(int)

for user in review_emo_per_user.keys():
    rates = review_emo_per_user[user]
    review_avg_emo_per_user[user] = sum(rates)/len(rates)
for item in review_emo_per_game.keys():
    rates = review_emo_per_game[item]
    review_avg_emo_per_game[item] = sum(rates)/len(rates)

In [15]:
review_avg_emo_all_user = sum(review_avg_emo_per_user.values()) / len(review_avg_emo_per_user)
review_avg_emo_all_game = sum(review_avg_emo_per_game.values()) / len(review_avg_emo_per_game)
review_avg_emo_all_user, review_avg_emo_all_game

(0.8464718433901915, 0.8156103873978334)

In [16]:
def cleanData(data):
    # review_label, review_avg_emo_per_user, review_avg_emo_per_game
    res = []
    for row in data:
        tmp = [row[-1]]
        if row[0] in review_avg_emo_per_user:
            tmp.append(review_avg_emo_per_user[row[0]])
        else:
            print(row[0],row[1],'user miss!')
            tmp.append(review_avg_emo_all_user)
        if row[1] in review_avg_emo_per_game:
            tmp.append(review_avg_emo_per_game[row[1]])
        else:
            print(row[0],row[1],'game miss!')
            tmp.append(review_avg_emo_all_game)
        res.append(tmp)
    return res

In [17]:
train_clean = cleanData(data_train)
valid_clean = cleanData(data_valid)
test_clean = cleanData(data_test)
train_clean[:5],valid_clean[:5],test_clean[:5]

waspish 10 user miss!
jamesleong 247730 user miss!
76561198067736111 241930 user miss!
Joneza 221100 user miss!
76561198074185082 268650 user miss!
76561198068905217 220260 user miss!
sweepa 252490 user miss!
76561198060825349 257730 user miss!
ZingChickenWings 730 user miss!
CorraNinetySix 303210 user miss!
76561198072728913 209160 user miss!
hibby 238320 user miss!
4321louis 312990 user miss!
76561198050243948 72850 user miss!
76561198054566872 356670 user miss!
76561198078389003 221910 user miss!
76561198057042886 218620 user miss!
76561198106782017 39000 user miss!
joshdcsbnian 49520 user miss!
blackviper8881 4000 user miss!
Pinguinbeta18 113200 user miss!
CharlZen 730 user miss!
76561198063808374 730 user miss!
OHDAYUMMAHNIGGA 304930 user miss!
76561198064923958 345650 user miss!
andybot1 268750 user miss!
76561198075559668 208090 user miss!
didlnwzaa 730 user miss!
Interittus 354850 user miss!
jesa5 550 user miss!
76561198099036584 208090 user miss!
theanyelpes 348340 game miss!


([[1, 1.0, 0.8043478260869565],
  [1, 1.0, 0.7996289424860853],
  [1, 0.8333333333333334, 1.0],
  [1, 0.75, 1.0],
  [1, 1.0, 0.7368421052631579]],
 [[1, 0.75, 0.8085106382978723],
  [1, 1.0, 0.8421052631578947],
  [1, 1.0, 1.0],
  [1, 0.6666666666666666, 0.8300970873786407],
  [1, 0.8464718433901915, 0.9444444444444444]],
 [[1, 0.8464718433901915, 0.75],
  [0, 1.0, 0.9090909090909091],
  [1, 0.75, 0.8298319327731093],
  [1, 0.4, 0.8156103873978334],
  [1, 0.8464718433901915, 0.7267080745341615]])

In [18]:
c_list = [1e-3,1e-2,1e-1,1,10]
res1_valid = []
for c in c_list:
    # 记得再考虑 balance！！！！
    reg = linear_model.LogisticRegression(C=c, random_state=0, class_weight='balanced').fit(train_clean, y_train)
    y1_valid = reg.predict(valid_clean)
    TN1_valid, FP1_valid, FN1_valid, TP1_valid, \
    BER1_valid,PREC1_valid,RECALL1_valid,F11_valid,acc1_valid = evaluate(y_valid,y1_valid,True)
    res1_valid.append((F11_valid,acc1_valid,BER1_valid,PREC1_valid,RECALL1_valid,c))


In [19]:
pd.DataFrame(res1_valid,columns = ['F1', 'acc', 'BER', 'Precision', 'Recall','c'])

Unnamed: 0,F1,acc,BER,Precision,Recall,c
0,0.868903,0.78048,0.352726,0.925736,0.818645,0.001
1,0.863884,0.773604,0.348032,0.927473,0.808454,0.01
2,0.855628,0.762536,0.339769,0.930583,0.791848,0.1
3,0.852523,0.758343,0.338176,0.931351,0.785997,1.0
4,0.852405,0.758175,0.33827,0.931335,0.785809,10.0


In [38]:
c = 1e-3
reg = linear_model.LogisticRegression(C=c, random_state=0, class_weight='balanced').fit(train_clean, y_train)
y1_test = reg.predict(test_clean)
TN1_test, FP1_test, FN1_test, TP1_test, \
BER1_test,PREC1_test,RECALL1_test,F11_test,acc1_test = evaluate(y_test,y1_test,True)
res1_test = [F11_test, acc1_test, BER1_test, PREC1_test, RECALL1_test,c]
pd.DataFrame([res1_test],columns = ['F1', 'acc', 'BER', 'Precision', 'Recall','c'])

Unnamed: 0,F1,acc,BER,Precision,Recall,c
0,0.874849,0.788771,0.357604,0.924058,0.830616,0.001


In [21]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(train_clean, y_train)
RandomForestClassifier(...)


In [22]:
y2_valid = clf.predict(valid_clean)
TN2_valid, FP2_valid, FN2_valid, TP2_valid, \
BER2_valid,PREC2_valid,RECALL2_valid,F12_valid,acc2_valid = evaluate(y2_valid,y_valid,True)
res2_valid = [F12_valid, acc2_valid, BER2_valid, PREC2_valid, RECALL2_valid]
pd.DataFrame([res2_valid],columns = ['F1', 'acc', 'BER', 'Precision', 'Recall'])

Unnamed: 0,F1,acc,BER,Precision,Recall
0,0.941041,0.888647,,1.0,0.888647


In [23]:
y2_test = clf.predict(test_clean)
TN2_test, FP2_test, FN2_test, TP2_test, \
BER2_test,PREC2_test,RECALL2_test,F12_test,acc2_test = evaluate(y2_test,y_test,True)
res2_test = [F12_test, acc2_test, BER2_test, PREC2_test, RECALL2_test]
pd.DataFrame([res2_test],columns = ['F1', 'acc', 'BER', 'Precision', 'Recall'])

Unnamed: 0,F1,acc,BER,Precision,Recall
0,0.941148,0.888839,,1.0,0.888839


### 2. Recommendation-based

In [25]:
idx = [0,1,5]
X_train = data_train[:,idx]
X_valid = data_valid[:,idx]
X_test = data_test[:,idx]

### beta+gamma

In [26]:
timePerUser = defaultdict(list)
timePerItem = defaultdict(list)
for u,i,t in X_train:
    timePerUser[u].append((i,t))
    timePerItem[i].append((u,t))

N = len(X_train)
nUsers = len(timePerUser)
nItems = len(timePerItem)
users = list(timePerUser.keys())
items = list(timePerItem.keys())

alpha = np.mean(y_train)
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [27]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

def prediction(user, item):
    global userGamma
    global itemGamma
    return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])

def unpack(theta):
    global K
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        userGamma[u] = theta[index:index+K]
        index += K
    for i in items:
        itemGamma[i] = theta[index:index+K]
        index += K

def cost(theta, labels, lamb):
    global K
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in X_train]
    cost = MSE(predictions, labels)
    # print("MSE = " + str(cost))
    for u in users:
        cost += lamb*userBiases[u]**2
        for k in range(K):
            cost += lamb*userGamma[u][k]**2
    for i in items:
        cost += lamb*itemBiases[i]**2
        for k in range(K):
            cost += lamb*itemGamma[i][k]**2
    return cost

def derivative(theta, labels, lamb):
    global K
    unpack(theta)
    N = len(X_train)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in timePerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in timePerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    for d in X_train:
        u,i = d[0], d[1]
        pred = prediction(u, i)
        diff = pred - int(d[2])
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)

In [28]:
def prepare(Train):
    item0Count = defaultdict(int)
    item1Count = defaultdict(int)
    total0 = 0
    total1 = 0
    item0PerUser = defaultdict(set)
    item1PerUser = defaultdict(set)
    userPerItem0 = defaultdict(set)
    userPerItem1 = defaultdict(set)
    item0Set = set()
    item1Set = set()

    for u,i,p in Train:    
        if int(p):
            item1Count[i] += 1
            total1 += 1
            item1Set.add(i)
            item1PerUser[u].add(i)
            userPerItem1[i].add(u)
        else:
            item0Count[i] += 1
            total0 += 1
            item0Set.add(i)
            item0PerUser[u].add(i)
            userPerItem0[i].add(u)
            
    mostPopular0 = [(item0Count[x], x) for x in item0Count]
    mostPopular1 = [(item1Count[x], x) for x in item1Count]
    mostPopular0.sort()
    mostPopular0.reverse()
    mostPopular1.sort()
    mostPopular1.reverse()

    return mostPopular0, mostPopular1, item0PerUser, item1PerUser, userPerItem0, userPerItem1

In [29]:
mostPopular0, mostPopular1, game0PerUser, game1PerUser, userPerGame0, userPerGame1 = prepare(X_train)

In [30]:
def Jaccard(s1, s2):
    return len(s1.intersection(s2))/len(s1.union(s2))
def getPred(X, thres1, output=False):
    yPred = []
    n = round(min([len(set(mostPopular0)), len(set(mostPopular1))])*thres1)
    intersection = set([x[1] for x in mostPopular0[:n]]) & set([x[1] for x in mostPopular1[:n]])
    popularSet0 = set([x[1] for x in mostPopular0[:n]]) - intersection
    popularSet1 = set([x[1] for x in mostPopular1[:n]]) - intersection

    for u,i,_ in X:

        if i in popularSet0:
            yPred.append(0)
            continue
        
        if i in popularSet1:
            yPred.append(1)
            continue
        
        maxSimilarity0 = 0
        gSet = set(userPerGame0[i])
        for g_prime in game0PerUser[u]:
            g_priSet = set(userPerGame0[g_prime])
            similarity = Jaccard(gSet, g_priSet)
            maxSimilarity0 = max(maxSimilarity0, similarity)
        
        maxSimilarity1 = 0
        gSet = set(userPerGame1[i])
        for g_prime in game1PerUser[u]:
            g_priSet = set(userPerGame1[g_prime])
            similarity = Jaccard(gSet, g_priSet)
            maxSimilarity1 = max(maxSimilarity1, similarity)
        if maxSimilarity0 > maxSimilarity1:
            yPred.append(0)
            continue
      
        yPred.append(1)

    return yPred

### Latent Factor

In [31]:
def LF_pred(X,y):
    y_pred = []
    for u,i,_ in X:
        try:
            pred = prediction(u, i)
        except:
            if i in timePerItem:
                rs = [d[1] for d in timePerItem[i]]
                pred = np.mean(rs)
            else:
                pred = alpha
        y_pred.append(1 if pred>0 else 0)
        
    acc, f1 = evaluate(y, y_pred)

    return acc, f1

In [None]:
K_list = [0,1,2,3]
lamb_list = [1e-5,1e-4,1e-3,1e-2,1e-1]
res = []
for K in K_list:
    for lamb in lamb_list:

        # K = 2
        # lamb = 1e-5
        iter = 50

        userGamma = {}
        itemGamma = {}

        for u in timePerUser:
            userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
        for i in timePerItem:
            itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

        scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                        [0.0]*(nUsers+nItems) + # Initialize beta
                                        [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                                    derivative, 
                                    args=(y_train, lamb), 
                                    maxiter=iter, 
                                    iprint=0)

        acc, f1 = LF_pred(X_valid,y_valid)
        
        print((K, lamb, acc, f1))

In [32]:
K = 2
lamb = 1e-4
iter = 50

userGamma = {}
itemGamma = {}

for u in timePerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
for i in timePerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                [0.0]*(nUsers+nItems) + # Initialize beta
                                [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                            derivative, 
                            args=(y_train, lamb), 
                            maxiter=iter, 
                            iprint=0)

acc, f1 = LF_pred(X_valid,y_valid)
acc, f1

(0.8878081502599363, 0.9405174713256869)

### Pop+Sim

In [33]:
thres_list = [0,.01,.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
for thres in thres_list:
    y_pred = []
    for u,i,_ in X_valid:
        pred = getPred([(u,i,0)], thres)[0]
        y_pred.append(1 if pred>0 else 0)
        
    acc, f1 = evaluate(y_valid, y_pred)

    print((acc, f1, thres))

(0.8836156297165856, 0.93807994289793, 0)
(0.8760690927385544, 0.933261085523345, 0.01)
(0.86047291631729, 0.9235575156192577, 0.05)
(0.8587959080999497, 0.9225105834713787, 0.1)
(0.8658393426127788, 0.9268471104608632, 0.2)
(0.8646654368606406, 0.9263618943334245, 0.3)
(0.8698641623343955, 0.9295058139534883, 0.4)
(0.873553580412544, 0.9316161799383276, 0.5)
(0.8774106993124267, 0.9338999909575911, 0.6)
(0.8757336910950864, 0.9330199764982374, 0.7)
(0.8762367935602885, 0.9333212865919769, 0.8)
(0.8755659902733524, 0.9329719963866306, 0.9)
(0.8757336910950864, 0.9330925507900677, 1)


### Latent Factor + (Pop+Sim)

In [34]:
y_pred = []
for u,i,r in X_valid:
    try:
        pred = prediction(u, i)
    except:
        pred = getPred([(u,i,0)], 0)[0]
    y_pred.append(1 if pred>0 else 0)
    
acc, f1 = evaluate(y_valid, y_pred)

acc, f1

(0.8886466543686065, 0.9410406677321967)

# Test

### Latent Factor

In [35]:
y_pred = []
for u,i,_ in X_test:
    try:
        pred = prediction(u,i)
    
    except:
        if i in timePerItem:
            rs = [d[1] for d in timePerItem[i]]
            pred = np.mean(rs)
        else:
            pred = alpha
    y_pred.append(1 if pred>0 else 0)
    
acc, f1 = evaluate(y_test, y_pred)
acc, f1

(0.8890649762282092, 0.9412329095706404)

### Pop+Sim

In [36]:
y_pred = []
for u,i,_ in X_test:
    pred = getPred([(u,i,0)], 0.1)[0]
    y_pred.append(1 if pred>0 else 0)
    
acc, f1 = evaluate(y_test, y_pred)
acc, f1

(0.8772922798279376, 0.9328043639970245)

### Latent Factor + (Pop+Sim)

In [37]:
y_pred = []
for u,i,_ in X_test:
    try:
        pred = prediction(u,i)
    except:
        pred = getPred([(u,i,0)], 0)[0]
    y_pred.append(1 if pred>0 else 0)
    
acc, f1 = evaluate(y_test, y_pred)
acc, f1

(0.8888385782205117, 0.9411482680091094)