In [2]:
import pandas as pd
import numpy as np
from scipy.special import softmax
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [127]:
def get_files():
    df_test = pd.read_csv("~/Downloads/test_kaggle.csv")
    df_train = pd.read_csv("~/Downloads/training.csv")
    df_feature = pd.read_csv("~/Downloads/item_feature.csv")
    return df_test, df_train, df_feature

In [4]:
def merge_df(df_left,df_right,column:str):
    df_merged = df_left.merge(df_right , how='left', on=column)
    return df_merged

In [130]:
df_test, df_train, df_feature = get_files()

In [6]:
df_full = merge_df(df_train,df_feature,'item_id')
df_original = df_full.copy()
df_full = df_full.drop_duplicates()
#add '1' to all the existing records
df_full['binary_ind'] = np.array([1]*len(df_full))

In [81]:
max(df_feature['item_id'])

39900

In [8]:
def cold_start(test_set,df_train):
    cold_user = list(set(test_set['user_id'].unique()) - set(df_train['user_id'].unique()))
    #cold_item = list(set(test_set['item_id'].unique()) - set(df_train['item_id'].unique()))
    cold_item = list(set(range(0,39900)) - set(df_train['item_id'].unique()))
    return cold_user, cold_item

In [9]:
def calculate_weight(df,col,softmax_ = None,cold_item = None):
    total = len(df)
    item_count = df.groupby([col]).size().values
    if softmax_ == None:
        if cold_item == None:
            item_weight = item_count/total
        else:
            item_count_cold = list(np.array(item_count)) + list([1]*len(cold_item))
            item_weight_ = np.array(item_count_cold)/(total+len(cold_item))
            cold_item_weight = np.random.dirichlet(np.ones(len(cold_item)))
            a = 0.8
            item_weight = list(item_weight_ * a) + list(cold_item_weight*(1-a))
    else:
        if cold_item == None:
            item_weight_reversed = 1/(item_count/total)
            item_weight = softmax(item_weight_reversed)
        else:
            item_count_cold = list(np.array(item_count) + 1) + list([1]*len(cold_item))
            item_weight_reversed_cold = 1/(np.array(item_count_cold)/(total+len(cold_item)))
            item_weight = softmax(item_weight_reversed_cold)
    return item_weight

In [10]:
def calculate_weight(df,col,softmax_ = None,cold_item = None):
    total = len(df)
    item_count = df.groupby([col]).size().values
    if softmax_ == None:
        if cold_item == None:
            item_weight = item_count/total
        else:
            item_count_cold = list(np.array(item_count)) + list([1]*len(cold_item))
            item_weight = np.array(item_count_cold)/(total+len(cold_item))
            #cold_item_weight = np.random.dirichlet(np.ones(len(cold_item)))
            #a = 0.2
            #item_weight = list(item_weight_ * a) + list(cold_item_weight*(1-a))
    else:
        if cold_item == None:
            item_weight_reversed = 1/(item_count/total)
            item_weight = softmax(item_weight_reversed)
        else:
            item_count_cold = list(np.array(item_count) + 3) + list([1]*len(cold_item))
            item_weight_reversed_cold = 1/(np.array(item_count_cold)/(total+len(cold_item)))
            item_weight = softmax(item_weight_reversed_cold)
    return item_weight

In [11]:
def generate_zeros(num_zeros,item,item_weight):
    item_zeros = random.choices(item,weights = item_weight,k = num_zeros)
    return item_zeros

In [12]:
cold_user, cold_item = cold_start(df_test, df_train)

- user_id

In [76]:
# Set number of zeros
n = 2
num_zeros = len(df_full)*n

In [14]:
users = list(df_full['user_id'].unique()) + list(cold_user)
user_weight = calculate_weight(df_full,'user_id',cold_item = cold_user)
user_zeros = generate_zeros(num_zeros,users,user_weight)

In [77]:
users = df_full['user_id'].unique()
user_weight = calculate_weight(df_full,'user_id')
user_zeros = generate_zeros(num_zeros,users,user_weight)

In [533]:
items = df_original['item_id'].unique()
item_weight = calculate_weight(df_original,'item_id')
item_zeros = generate_zeros(num_zeros,items,item_weight)

- item_id

In [79]:
items = list(df_original['item_id'].unique()) + list(cold_item)
item_weight = calculate_weight(df_original,'item_id',softmax_ = 1,cold_item = cold_item)
item_zeros = generate_zeros(num_zeros,items,item_weight)

In [80]:
print(set(cold_item)-set(item_zeros))

set()


- context_feature

In [82]:
contexts = df_full['context_feature_id'].unique()
context_weight = calculate_weight(df_original,'context_feature_id',softmax_ = 1)
context_zeros = generate_zeros(num_zeros,contexts,context_weight)

- item_feature

In [83]:
item_fea = df_full['item_feature_id'].unique()
item_fea_weight = calculate_weight(df_original,'item_feature_id',softmax_ = 1)
item_fea_zeros = generate_zeros(num_zeros,item_fea,item_fea_weight)

In [19]:
df_full_zeros = pd.DataFrame({"user_id":user_zeros,"item_id":item_zeros})
df_full_zeros['binary_ind'] = np.array([0]*len(df_full_zeros))

In [84]:
df_full_zeros = pd.DataFrame({"user_id":user_zeros,"item_id":item_zeros,"context_feature_id":context_zeros,"item_feature_id":item_fea_zeros})
df_full_zeros['binary_ind'] = np.array([0]*len(df_full_zeros))

In [161]:
df_new = df_full.append(df_full_zeros,ignore_index=True)

In [635]:
len(df_full)

892371

In [636]:
len(df_full_zeros)

2677113

In [637]:
len(df_new)

3569484

In [572]:
#check if there exists duplicate
df_new.duplicated(subset=['user_id', 'item_id']).sum()

42293

In [164]:
#drop duplicate
df_new = df_new.drop_duplicates(subset=['user_id', 'item_id','context_feature_id','item_feature_id'],ignore_index=True)

In [165]:
df_new

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id,binary_ind
0,0,28366,2,7,1
1,0,16109,2,7,1
2,0,11500,3,7,1
3,0,20750,2,7,1
4,0,8759,2,7,1
...,...,...,...,...,...
2676592,155457,12580,2,47,0
2676593,29286,8635,2,186,0
2676594,117141,17020,2,91,0
2676595,89799,39313,2,168,0


In [586]:
#drop duplicate
df_new = df_new.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
#df_new = df_new.drop_duplicates()

In [166]:
len(df_new[df_new['binary_ind']==1])/len(df_new[df_new['binary_ind']==0])

0.500144600515854

In [577]:
df_new = df_full.append(df_new_[df_new_['binary_ind']==0])

In [90]:
len(df_full)

892371

In [91]:
len(df_new)

2676597

In [92]:
df_new

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id,binary_ind
0,0,28366,2,7,1
1,0,16109,2,7,1
2,0,11500,3,7,1
3,0,20750,2,7,1
4,0,8759,2,7,1
...,...,...,...,...,...
2677108,155457,12580,2,47,0
2677109,29286,8635,2,186,0
2677110,117141,17020,2,91,0
2677111,89799,39313,2,168,0


In [155]:
df_new

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id,binary_ind
0,0,28366,2,7,1
1,0,16109,2,7,1
2,0,11500,3,7,1
3,0,20750,2,7,1
4,0,8759,2,7,1
...,...,...,...,...,...
2677108,155457,12580,2,47,0
2677109,29286,8635,2,186,0
2677110,117141,17020,2,91,0
2677111,89799,39313,2,168,0


In [167]:
df_new.to_feather("~/Downloads/df_new.fea")

## Encoding

In [93]:
train, val = train_test_split(df_new, test_size=0.2, random_state=42)
def encoding(train,val):
    train_user_ids = np.sort(np.unique(train.user_id.values))
    userid2idx = {o:i for i,o in enumerate(train_user_ids)}
    train["user_id"] = train["user_id"].apply(lambda x: userid2idx[x])
    val["user_id"] = val["user_id"].apply(lambda x: userid2idx.get(x, -1))
    val = val[val["user_id"] >= 0].copy()
    train_item_ids = np.sort(np.unique(train.item_id.values))
    itemid2idx = {o:i for i,o in enumerate(train_item_ids)}
    train["item_id"] = train["item_id"].apply(lambda x: itemid2idx[x])
    val["item_id"] = val["item_id"].apply(lambda x: itemid2idx.get(x, -1))
    val = val[val["item_id"] >= 0].copy()
    return train, val

In [176]:
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    dict_values = list(name2idx.values())
    #mean = int(sum(dict_values)/len(dict_values))
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [400]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]: #,'context_feature_id','item_feature_id']:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
        
    return df

In [401]:
# encoding the train and validation data
train, val = train_test_split(df_new, test_size=0.2, random_state=42)
df_train = encode_data(train)
df_val = encode_data(val, train)

In [99]:
df_train

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id,binary_ind
297603,0,0,0,0,1
2027756,1,1,0,1,0
1560729,2,2,0,2,0
1872583,3,3,0,3,0
2544090,4,4,0,4,0
...,...,...,...,...,...
110268,63104,13079,0,114,1
1692832,25616,3559,0,6,0
2356667,133735,2985,0,2,0
2229367,129431,919,0,19,0


In [378]:
df_new = pd.read_feather('~/Downloads/fan.fea')

In [379]:
df_new

Unnamed: 0,user_id,item_id,binary_ind
0,0,28366,1.0
1,0,16109,1.0
2,0,11500,1.0
3,0,20750,1.0
4,0,8759,1.0
...,...,...,...
4714136,200151,6838,0.0
4714137,200151,24909,0.0
4714138,200151,15868,0.0
4714139,200151,33035,0.0


## Model

In [392]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    users = torch.LongTensor(df_train.user_id.values) # .cuda()
    items = torch.LongTensor(df_train.item_id.values) #.cuda()
    #context = torch.LongTensor(df_train.context_feature_id.values)
    #item_fea = torch.LongTensor(df_train.item_feature_id.values)
    for i in range(epochs): 
        #print(ratings)
        model.train()
        ratings = torch.FloatTensor(df_train.binary_ind.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        #print(y_hat)
        loss = F.binary_cross_entropy(torch.sigmoid(y_hat), ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    test_loss(model, unsqueeze)

In [393]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.user_id.values) #.cuda()
    items = torch.LongTensor(df_val.item_id.values) #.cuda()
    #context = torch.LongTensor(df_val.context_feature_id.values)
    #item_fea = torch.LongTensor(df_val.item_feature_id.values)
    ratings = torch.FloatTensor(df_val.binary_ind.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.binary_cross_entropy(torch.sigmoid(y_hat), ratings)
    print("loss %.3f " % loss.item())

In [233]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items,num_item_fea, num_context_fea,emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_fea_emb = nn.Embedding(num_item_fea, emb_size)
        self.context_fea_emb = nn.Embedding(num_context_fea, emb_size)
        
#         self.user_emb.weight.data.uniform_(0, 0.05)  # .cuda()
#         self.item_emb.weight.data.uniform_(0, 0.05)
#         self.item_fea_emb.weight.data.uniform_(0, 0.05)
#         self.context_fea_emb.weight.data.uniform_(0, 0.05)
        
        
        
        self.lin1 = nn.Linear(emb_size*4, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
        
        
    def forward(self, u, v,a,b):
        
        U = self.user_emb(u)
        V = self.item_emb(v)
        A = self.item_fea_emb(a)
        B = self.context_fea_emb(b)
        
        x = F.relu(torch.cat([U, V,A,B], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
#         U = []
#         mean = torch.mean(model.user_emb.weight,0).detach().numpy()
#         for item in u.detach().numpy():
#             if item == -1:
#                 U.append([mean])
#                 #print(len(mean))
#             else:
#                 U.append(model.user_emb(torch.LongTensor([item])).detach().numpy())
#                 #print(len(model.user_emb(torch.LongTensor([item])).detach().numpy()[0]))
#         U = torch.tensor(U).squeeze(1)
        
#         V = []
#         mean = torch.mean(model.item_emb.weight,0).detach().numpy()
#         for item in v.detach().numpy():
#             if item == -1:
#                 V.append([mean])
#                 #print(len(mean))
#             else:
#                 V.append(model.item_emb(torch.LongTensor([item])).detach().numpy())
#                 #print(len(model.user_emb(torch.LongTensor([item])).detach().numpy()[0]))
#         V = torch.tensor(V).squeeze(1)
        
#         A = self.item_fea_emb(a)
        
#         B = []
#         mean = torch.mean(model.context_fea_emb.weight,0).detach().numpy()
#         for item in b.detach().numpy():
#             if item == -1:
#                 B.append([mean])
#                 #print(len(mean))
#             else:
#                 B.append(model.context_fea_emb(torch.LongTensor([item])).detach().numpy())
#                 #print(len(model.user_emb(torch.LongTensor([item])).detach().numpy()[0]))
#         B = torch.tensor(B).squeeze(1)
        
#         x = F.relu(torch.cat([U, V,A,B], dim=1))
#         x = self.drop1(x)
#         x = F.relu(self.lin1(x))
#         x = self.lin2(x)
        return x

In [382]:
val

Unnamed: 0,user_id,item_id,context_feature_id,item_feature_id,binary_ind
1240356,15724,22802,2,32,0
1473099,37831,24407,2,124,0
169654,38037,2989,2,2,1
987783,198170,18080,2,12,0
772495,173157,20726,2,95,1
...,...,...,...,...,...
2160984,108254,26897,2,179,0
2386461,147552,33545,2,32,0
1069176,23202,11026,2,169,0
890953,199817,1898,1,148,1


In [221]:
u = torch.LongTensor([0,1,2,-1])

In [226]:
torch.LongTensor(df_train['user_id'].values)

tensor([     0,      1,      2,  ..., 133735, 129431, 126226])

In [227]:
model.user_emb(torch.LongTensor(df_train['user_id'].values))

tensor([[-0.6605, -1.1674,  0.2038,  ...,  0.7557,  0.0798,  0.3406],
        [-1.1995,  0.4573,  0.4943,  ...,  0.2469,  0.9524, -1.2122],
        [-0.2465, -1.7332,  0.1873,  ..., -0.9301,  0.4131,  1.2513],
        ...,
        [-0.8346, -0.2193,  0.5832,  ...,  0.9770, -0.4874, -0.6197],
        [-1.3966, -0.2259, -1.2421,  ..., -0.3621, -0.3839, -0.1239],
        [-0.2818, -0.7919, -0.3283,  ...,  0.9242, -0.0924,  0.4773]],
       grad_fn=<EmbeddingBackward0>)

In [229]:
model.user_emb(torch.LongTensor([0]))

tensor([[-0.6605, -1.1674,  0.2038,  0.8967,  1.3795, -0.7050, -0.1980,  0.3467,
          0.9825,  1.2013, -0.9593, -1.1596, -0.7831, -0.5456, -0.8601, -0.3027,
          1.3354, -0.7540,  0.7131, -1.6415, -0.2090, -0.6988,  0.5370,  1.9874,
         -0.7472,  0.2855,  0.9081,  1.1746,  0.1904,  1.3448,  1.5099, -0.0380,
         -0.7210,  1.3514, -0.2194, -0.1066,  0.0865,  0.1316,  0.1276, -0.9654,
          0.2661,  2.4915,  0.5309, -0.8298,  0.6366, -0.0697,  0.1135,  0.7557,
          0.0798,  0.3406]], grad_fn=<EmbeddingBackward0>)

In [286]:
Emb = model.user_emb.weight.detach().numpy()
Emb_list = list(Emb)
Emb_list.append(np.mean(Emb,axis = 0))
Emb_array = torch.tensor(np.array(Emb_list))
Emb_array

tensor([[ 7.0636e-03,  9.5519e-01, -6.8614e-02,  ...,  4.9666e-02,
          5.3381e-02, -6.5271e-01],
        [-9.4355e-02,  3.5173e-01, -1.2058e-01,  ...,  1.5775e-01,
         -2.4216e-02, -4.7389e-01],
        [-9.3705e-02,  1.3516e-01,  5.0344e-01,  ...,  4.0221e-03,
         -5.6150e-02,  1.0941e-01],
        ...,
        [ 3.5648e-02, -1.2551e-01, -8.9256e-02,  ...,  2.6451e-01,
         -9.0519e-01,  8.6243e-02],
        [ 1.0231e-01, -1.0975e-01, -7.7309e-03,  ..., -9.9597e-02,
          2.4119e-01, -1.3241e-01],
        [-9.3046e-04,  7.2712e-04,  1.3854e-03,  ...,  5.7795e-04,
         -6.9521e-04, -9.8729e-04]])

In [292]:
model.user_emb(torch.LongTensor([0])).detach().numpy()

array([[ 0.00706361,  0.9551858 , -0.06861369,  0.1324953 ,  0.09495371,
        -0.06217994,  0.13216399, -0.11910537,  0.07230283, -0.02031904,
        -0.09317045, -0.13005337, -0.12970406,  0.16936032, -0.0345384 ,
         0.01394959,  0.03822183, -0.32911372,  0.08000132, -0.0549654 ,
         0.07476093, -0.12578201,  1.2152717 ,  0.03787008, -0.0752846 ,
        -0.06954133,  0.0322592 ,  0.09666593,  0.04684403,  0.0394599 ,
         0.00941906,  0.20466527,  0.03166532,  0.06737769,  0.0037127 ,
        -0.13162939, -0.11337183, -0.13519347, -0.19189337, -0.047182  ,
         0.12564524, -0.1426532 , -0.39268652,  0.09779441, -0.12819865,
         0.06210697, -1.1760359 ,  0.04966637,  0.05338109, -0.6527113 ]],
      dtype=float32)

In [375]:
u = torch.LongTensor([0,-1,0,2]).unsqueeze(1)
u = u.detach().numpy()

In [368]:
u = np.array([[-1],[0],[2],[0]])

In [369]:
np.where(u==[0],1,u)

array([[-1],
       [ 1],
       [ 2],
       [ 1]])

In [370]:
# U = []
mean = np.array(torch.mean(model.user_emb.weight,0).detach().numpy())
print(mean)
# for item in u.detach().numpy():
#     if item == -1:
#         U.append([mean])
#         #print(len(mean))
#     else:
#         U.append(model.user_emb(torch.LongTensor([item])).detach().numpy())
#         #print(len(model.user_emb(torch.LongTensor([item])).detach().numpy()[0]))
# U = torch.tensor(U).squeeze(1)

U = np.where(u==-1,mean,model.user_emb(torch.LongTensor(abs(u))).squeeze(1).detach().numpy())
U

[-9.3046715e-04  7.2711747e-04  1.3853624e-03  3.6512476e-05
  8.4932608e-04  1.1896073e-03  7.6148892e-04 -9.9453947e-04
 -3.1706100e-04  1.2763403e-03  8.6601742e-04  3.1000955e-04
 -1.0469344e-03 -1.4636462e-03 -1.2040888e-03  9.5022196e-04
  1.2626518e-03  6.2008703e-04 -3.8345918e-04  8.0275151e-04
 -5.4261647e-04 -4.7348769e-04 -1.4779484e-04 -3.2133271e-04
 -8.8907615e-04 -3.3931027e-04  2.2127004e-03  5.3652585e-04
  6.1696745e-04  7.8334118e-04  3.3010598e-04 -1.8651549e-03
 -4.4089442e-04 -1.2917304e-03  2.1513492e-04 -1.2569207e-04
 -1.3493723e-03 -6.9286430e-04 -1.2462626e-03  9.0885494e-04
 -3.9829451e-04  1.5128667e-04  2.4075997e-03 -1.2329570e-04
  6.4461835e-04 -1.4554444e-03  2.6778123e-04  5.7795149e-04
 -6.9519709e-04 -9.8729820e-04]


array([[-9.30467155e-04,  7.27117469e-04,  1.38536235e-03,
         3.65124761e-05,  8.49326083e-04,  1.18960731e-03,
         7.61488918e-04, -9.94539470e-04, -3.17061000e-04,
         1.27634034e-03,  8.66017421e-04,  3.10009549e-04,
        -1.04693440e-03, -1.46364619e-03, -1.20408880e-03,
         9.50221962e-04,  1.26265176e-03,  6.20087027e-04,
        -3.83459177e-04,  8.02751514e-04, -5.42616472e-04,
        -4.73487686e-04, -1.47794839e-04, -3.21332715e-04,
        -8.89076153e-04, -3.39310267e-04,  2.21270043e-03,
         5.36525855e-04,  6.16967445e-04,  7.83341180e-04,
         3.30105977e-04, -1.86515495e-03, -4.40894422e-04,
        -1.29173044e-03,  2.15134918e-04, -1.25692066e-04,
        -1.34937232e-03, -6.92864298e-04, -1.24626257e-03,
         9.08854941e-04, -3.98294505e-04,  1.51286673e-04,
         2.40759971e-03, -1.23295700e-04,  6.44618354e-04,
        -1.45544438e-03,  2.67781230e-04,  5.77951490e-04,
        -6.95197086e-04, -9.87298205e-04],
       [ 7.06

In [337]:
np.sum([-1])

-1

In [231]:
U = []
mean = torch.mean(model.user_emb.weight,0).detach().numpy()
for item in u.detach().numpy():
    if item == -1:
        U.append([mean])
        #print(len(mean))
    else:
        print(torch.LongTensor(item))
        U.append(model.user_emb(torch.LongTensor([item])).detach().numpy())
        #print(len(model.user_emb(torch.LongTensor([item])).detach().numpy()[0]))
torch.tensor(U).squeeze(1)

tensor([], dtype=torch.int64)
tensor([4294967296])
tensor([3, 0])


tensor([[-6.6046e-01, -1.1674e+00,  2.0375e-01,  8.9669e-01,  1.3795e+00,
         -7.0502e-01, -1.9801e-01,  3.4674e-01,  9.8246e-01,  1.2013e+00,
         -9.5926e-01, -1.1596e+00, -7.8308e-01, -5.4561e-01, -8.6012e-01,
         -3.0275e-01,  1.3354e+00, -7.5404e-01,  7.1305e-01, -1.6415e+00,
         -2.0901e-01, -6.9880e-01,  5.3696e-01,  1.9874e+00, -7.4722e-01,
          2.8548e-01,  9.0810e-01,  1.1746e+00,  1.9041e-01,  1.3448e+00,
          1.5099e+00, -3.7982e-02, -7.2103e-01,  1.3514e+00, -2.1938e-01,
         -1.0662e-01,  8.6543e-02,  1.3161e-01,  1.2757e-01, -9.6535e-01,
          2.6611e-01,  2.4915e+00,  5.3086e-01, -8.2981e-01,  6.3665e-01,
         -6.9730e-02,  1.1347e-01,  7.5574e-01,  7.9842e-02,  3.4062e-01],
        [-1.1995e+00,  4.5728e-01,  4.9427e-01,  1.5647e+00, -1.5613e+00,
         -1.1082e+00,  1.5047e+00, -1.2588e+00,  1.4112e+00,  8.0507e-01,
          2.4235e+00,  2.5541e+00, -6.5966e-01,  1.1346e+00,  1.6362e+00,
          6.9272e-01,  9.9824e-01,  3

In [172]:
a = torch.LongTensor([1,2,3]).detach().numpy()
1 in a

True

In [234]:
model = MF(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model,df_train,df_val, epochs=15, lr=0.05, wd=1e-6) 

NameError: name 'MF' is not defined

In [402]:
num_users = len(df_train.user_id.unique())
num_items = len(df_train.item_id.unique())
#num_item_fea = len(df_train.item_feature_id.unique())
#num_context_fea = len(df_train.context_feature_id.unique())

In [236]:
torch.cuda.is_available()

False

In [403]:
model = CollabFNet(num_users, num_items,emb_size=50) #.cuda()
train_epocs(model, epochs=20, lr=0.05, wd=1e-6, unsqueeze=True) 

0.8552946448326111
0.5629643201828003
0.4775201678276062
0.5141378045082092
0.500086784362793
0.45948779582977295
0.4339931309223175
0.42933255434036255
0.42655718326568604
0.4119907319545746
0.38687828183174133
0.35917001962661743
0.3358326256275177
0.31833502650260925
0.30361220240592957
0.28796330094337463
0.2700236141681671
0.25012272596359253
0.2297922670841217
0.21022118628025055
loss 0.192 


In [594]:
model = CollabFNet(num_users, num_items, emb_size=100)#.cuda()
train_epocs(model, epochs=20, lr=0.05, wd=1e-6, unsqueeze=True) 

0.671840250492096
0.6205050349235535
0.5567648410797119
0.5490132570266724
0.553680956363678
0.5453387498855591
0.5256995558738708
0.5014596581459045
0.48050758242607117
0.4636409878730774
0.44171759486198425
0.40827977657318115
0.3666671812534332
0.3241528570652008
0.2852347195148468
0.2533109784126282
0.22931331396102905
0.21186359226703644
0.19952809810638428
0.1905786544084549
loss 0.183 


In [513]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=15, lr=0.005, wd=1e-6, unsqueeze=True) 

0.6950045824050903
0.6925733685493469
0.6912747025489807
0.6897596716880798
0.6883002519607544
0.68658447265625
0.6846222281455994
0.6820521354675293
0.6785567402839661
0.6741604804992676
0.6695421934127808
0.6648101806640625
0.6587973833084106
0.6528801321983337
0.6456193327903748
loss 0.635 


In [514]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=15, lr=0.1, wd=1e-6, unsqueeze=True) 

0.6953813433647156
2.4838826656341553
0.674903392791748
0.6953098177909851
0.6969735622406006
0.6984870433807373
0.699672281742096
0.7004452347755432
0.7007884979248047
0.7007297277450562
0.7003265023231506
0.6996550559997559
0.6987984776496887
0.6978409886360168
0.6968597173690796
loss 0.696 


In [515]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=25, lr=0.05, wd=1e-6, unsqueeze=True) 

0.6956307888031006
0.8264777660369873
0.6910773515701294
0.6932404637336731
0.6934601068496704
0.6937193274497986
0.6939666271209717
0.6941694021224976
0.6943103671073914
0.6943824291229248
0.6943871378898621
0.6943321228027344
0.6942289471626282
0.6940910220146179
0.6939330101013184
0.6937686204910278
0.6936101317405701
0.6934672594070435
0.6933475136756897
0.6932550072669983
0.6931911706924438
0.6931547522544861
0.6931428909301758
0.6931506395339966
0.6931721568107605
loss 0.693 


In [516]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=25, lr=0.01, wd=1e-6, unsqueeze=True) 

0.7006629109382629
0.7214236855506897
0.6913665533065796
0.6937902569770813
0.6965742707252502
0.6969781517982483
0.6957098841667175
0.692736804485321
0.6885350346565247
0.6846222877502441
0.6819278001785278
0.6803528070449829
0.6789910197257996
0.6767967343330383
0.6729539036750793
0.6673202514648438
0.6604852676391602
0.653156042098999
0.6459986567497253
0.6387197971343994
0.6300086379051208
0.6196205019950867
0.608133852481842
0.5958212614059448
0.5821130275726318
loss 0.563 


In [517]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=25, lr=0.1, wd=1e-6, unsqueeze=True) 

0.6940291523933411
1.0885980129241943
1.4890618324279785
1.1031248569488525
0.7476983070373535
0.6603927612304688
0.6910495162010193
0.69443678855896
0.6936636567115784
0.6932409405708313
0.6931678652763367
0.693359375
0.6937271356582642
0.6941896080970764
0.6946820020675659
0.6951497793197632
0.6955531239509583
0.6958610415458679
0.6960332989692688
0.6959249377250671
0.6951722502708435
0.6923417448997498
0.6804497241973877
0.6440902948379517
0.5929275155067444
loss 0.558 


In [518]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()
train_epocs(model, epochs=25, lr=0.001, wd=1e-6, unsqueeze=True) 

0.6933286190032959
0.6926804780960083
0.6921373009681702
0.6915972828865051
0.691122829914093
0.6906277537345886
0.690036416053772
0.6894194483757019
0.6887574791908264
0.6880629658699036
0.6872885227203369
0.6864635944366455
0.685598611831665
0.6847065687179565
0.6837458610534668
0.6827284693717957
0.6816368103027344
0.6805155873298645
0.6793432235717773
0.6781958937644958
0.6768791675567627
0.6756259799003601
0.6742296814918518
0.6727902889251709
0.671342670917511
loss 0.669 


In [131]:
df_test = df_test.merge(df_feature, how='left', on='item_id')

In [132]:
df_test

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,4,16835,2,142
1,1,4,22590,3,142
2,2,4,1978,1,142
3,3,4,28916,1,148
4,4,4,14427,2,63
...,...,...,...,...,...
381380,381380,200151,1702,1,139
381381,381381,200151,21632,1,130
381382,381382,200151,30477,1,130
381383,381383,200151,30477,1,130


In [133]:
#df_train = encode_data(df_new)
df_val = encode_data(df_test, train)

In [134]:
df_val

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,84785,9715,0,15
1,1,84785,12694,2,15
2,2,84785,2883,1,15
3,3,84785,35203,1,22
4,4,84785,16321,0,12
...,...,...,...,...,...
381380,381380,84785,5144,1,20
381381,381381,84785,966,1,54
381382,381382,84785,2334,1,54
381383,381383,84785,2334,1,54


In [500]:
def train_epocs_final(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(df_train.user_id.values) # .cuda()
        items = torch.LongTensor(df_train.item_id.values) #.cuda()
        ratings = torch.FloatTensor(df_train.binary_ind.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(torch.sigmoid(y_hat), ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 

In [501]:
num_users = len(df_train.user_id.unique())
num_items = len(df_train.item_id.unique())

In [410]:
model.eval()
users = torch.LongTensor(df_val.user_id.values)  # .cuda()
items = torch.LongTensor(df_val.item_id.values)  # .cuda()
rating = torch.sigmoid(model(users,items)).detach().numpy()

In [411]:
rating.mean()

0.38027912

In [409]:
df_val = df_test
df_val = encode_data(df_val,train)

In [407]:
df_val = df_test

In [405]:
rating.mean()

0.11063517

In [97]:
num_users = len(train.user_id.unique())
num_items = len(train.item_id.unique())
num_item_features = len(train.item_feature_id.unique())

In [None]:
train,test = encoding(df_new,df_test)

In [138]:
model.eval()
users = torch.LongTensor(df_val.user_id.values) #.cuda()
items = torch.LongTensor(df_val.item_id.values) #.cuda()
context = torch.LongTensor(df_val.context_feature_id.values)
item_fea = torch.LongTensor(df_val.item_feature_id.values)
rating = torch.sigmoid(model(users,items,item_fea,context)).detach().numpy()

In [195]:
model.eval()
users = torch.LongTensor(test.user_id.values)  # .cuda()
items = torch.LongTensor(test.item_id.values)  # .cuda()
rating = torch.sigmoid(model(users,items)).detach().numpy()

In [139]:
rating.mean()

0.9614493

In [193]:
df_val

Unnamed: 0,id,user_id,item_id,context_feature_id
30,30,7,1330,1
31,31,7,8139,1
32,32,7,10370,1
33,33,7,21772,1
34,34,7,1330,1
...,...,...,...,...
381380,381380,169492,2041,1
381381,381381,169492,265,1
381382,381382,169492,3131,1
381383,381383,169492,3131,1


In [412]:
test_copy = pd.read_csv("~/Downloads/test_kaggle.csv")
df_val['rating'] = rating
subm_df = df_val.merge(test_copy,on = 'id',how='right')
subm_df["rating"].fillna(np.mean(rating), inplace = True)

In [413]:
subm_df["rating"].mean()

0.38023993372917175

In [198]:
np.mean(subm_df["rating"])

0.6252628564834595

In [415]:
subm_df = subm_df[['id','rating']]

In [295]:
subm_df.to_csv('~/Downloads/ai_.csv',index=False)

In [4]:
df_fan = pd.read_csv('~/Downloads/test_50.csv')

In [169]:
rating_updated = (df_fan['rating'].values + subm_df['rating'].values)/2
subm_df['rating'] = rating_updated


In [416]:
subm_df.to_csv('~/Downloads/md.csv',index=False)

In [414]:
subm_df.groupby(['rating']).size()

rating
0.001454        1
0.001470        1
0.001530        1
0.001536        1
0.001536        1
            ...  
0.526051        1
0.526052        1
0.526052        1
0.526052        1
0.526056    24965
Length: 322665, dtype: int64

## Try Add RF

In [315]:
enc = preprocessing.OneHotEncoder(handle_unknown = 'ignore')

In [330]:
df_rf = df_new[['item_id','context_feature_id','item_feature_id']]
df_rf

Unnamed: 0,item_id,context_feature_id,item_feature_id
0,28366,2,7
1,16109,2,7
2,11500,3,7
3,20750,2,7
4,8759,2,7
...,...,...,...
892366,11393,2,47
892367,627,2,180
892368,29876,2,180
892369,14709,2,6


In [331]:
enc.fit(df_rf)

OneHotEncoder(handle_unknown='ignore')

In [None]:
onehotlabels = enc.transform(df_rf).toarray()
onehotlabels.shape

In [320]:
Y_train = df_new['binary_ind']

In [321]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1746609, n_features=187,n_informative=2, n_redundant=0,random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(onehotlabels, Y_train)

RandomForestClassifier(max_depth=5, random_state=0)

In [322]:
test_copy = pd.read_csv("~/Downloads/test_kaggle.csv")

In [323]:
test = merge_df(test_copy,df_feature,'item_id')
test

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,4,16835,2,142
1,1,4,22590,3,142
2,2,4,1978,1,142
3,3,4,28916,1,148
4,4,4,14427,2,63
...,...,...,...,...,...
381380,381380,200151,1702,1,139
381381,381381,200151,21632,1,130
381382,381382,200151,30477,1,130
381383,381383,200151,30477,1,130


In [324]:
X_test = enc.transform(test[['context_feature_id','item_feature_id']])

In [325]:
rating_ = clf.predict_proba(X_test)
rating_ = [i[1] for i in rating_]

In [326]:
np.array(rating_).mean()

0.6285880908934081

In [327]:
last_df['rating'] = rating_

In [3]:
last_df = pd.read_csv('~/Downloads/wuuhuu.csv')

In [5]:
rating_updated = (df_fan['rating'].values + last_df['rating'].values)/2
last_df['rating'] = rating_updated

In [7]:
last_df.to_csv('~/Downloads/jiayou.csv',index=False)

In [9]:
last_df['rating'].mean()

0.5993375468072648

In [10]:
rating_updated.mean()

0.5993375468072624

In [11]:
df_fan['rating'].mean()

0.5700870027211178

In [298]:
last_df

Unnamed: 0,id,rating
0,0,0.385454
1,1,0.563619
2,2,0.552722
3,3,0.537018
4,4,0.347506
...,...,...
381380,381380,0.565243
381381,381381,0.481717
381382,381382,0.481717
381383,381383,0.481717


In [58]:
df_bu = pd.read_csv('~/Downloads/gn.csv')

In [61]:
wuhu = df_bu['rating'].values

In [65]:
import math

In [75]:
-math.log((1 /0.567774) - 1)

0.2727748526210699

In [70]:
0.5677805

array([0.5677805, 0.5677805, 0.5677805, ..., 0.5677805, 0.5677805,
       0.5677805])

In [72]:
df_bu.groupby('rating').size()

rating
0.002564         1
0.002609         1
0.002634         1
0.002651         1
0.002654         1
             ...  
0.567772         1
0.567774         1
0.567775         1
0.567775         1
0.567781    359614
Length: 19617, dtype: int64