# Connecting to Kaggle

In [None]:
!pip install -q kaggle

In [2]:
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns {filename: file bytes}, and echoing that
# dict writes the contents of kaggle.json (the API key) into the saved output.
uploaded = files.upload()

Saving kaggle.json to kaggle (2).json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [3]:
# Create the ~/.kaggle directory that will hold the API token (-p: no error if it exists).
!mkdir -p ~/.kaggle

In [4]:
# Copy the API token from the working directory into ~/.kaggle.
# NOTE(review): when kaggle.json already exists, the upload is saved under a
# different name (e.g. "kaggle (2).json"), so this may copy an older token — confirm.
!cp kaggle.json ~/.kaggle/

In [5]:
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [6]:
# Download the netflix-inc/netflix-prize-data archive with the Kaggle CLI.
# An up-to-date local copy is skipped unless --force is passed.
!kaggle datasets download -d netflix-inc/netflix-prize-data

netflix-prize-data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
# List the working directory to check that the archive was downloaded.
!ls

 combined_data_1.txt  'kaggle (1).json'   netflix-prize-data.zip   sample_data
 combined_data_2.txt  'kaggle (2).json'   probe.txt
 combined_data_3.txt   kaggle.json	  qualifying.txt
 combined_data_4.txt   movie_titles.csv   README


In [13]:
!unzip netflix-prize-data.zip

Archive:  netflix-prize-data.zip
replace README? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


# Import

In [9]:
!pip install scikit-surprise
import os
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split



# Reading Dataset

In [14]:
def readFile(file_path, rows=100000):
    """Parse a Netflix Prize ``combined_data_*.txt`` file into a DataFrame.

    The file interleaves movie header lines of the form ``"<movie_id>:"`` with
    rating lines of the form ``"<customer_id>,<rating>,<date>"``; every rating
    line belongs to the most recent header above it.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.
    rows : int or None, default 100000
        Maximum number of *lines* to read from the start of the file. Header
        (and blank) lines count toward this limit, so the returned frame has
        fewer than ``rows`` records. ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Cust_Id`` (str), ``Movie_Id`` (int), ``Rating`` (str) and
        ``Date`` (str), one row per rating line, in file order.

    Raises
    ------
    ValueError
        If a rating line appears before any movie header, or a line does not
        have the expected shape.
    """
    data_dict = {'Cust_Id': [], 'Movie_Id': [], 'Rating': [], 'Date': []}
    movieId = None  # id taken from the most recent "<movie_id>:" header

    # `with` closes the handle even when a malformed line raises mid-loop.
    with open(file_path, "r") as f:
        for count, raw_line in enumerate(f, start=1):
            if rows is not None and count > rows:
                break

            line = raw_line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the 3-way unpack.
                continue

            if line.endswith(':'):
                # Drop only the trailing ':'; the line is already stripped, so
                # a header without a trailing newline (e.g. the last line of
                # the file) parses correctly too.
                movieId = int(line[:-1])
            else:
                if movieId is None:
                    raise ValueError(
                        "rating line %d appears before any movie header" % count
                    )
                customerID, rating, date = line.split(',')
                data_dict['Cust_Id'].append(customerID)
                data_dict['Movie_Id'].append(movieId)
                data_dict['Rating'].append(rating)
                data_dict['Date'].append(date)

    return pd.DataFrame(data_dict)

In [15]:
# Read the first 100000 lines of each of the four rating files and convert the
# Rating column (parsed as text) to float, one frame per file.
df1, df2, df3, df4 = [
    readFile(part_path, rows=100000).astype({'Rating': float})
    for part_path in (
        'combined_data_1.txt',
        'combined_data_2.txt',
        'combined_data_3.txt',
        'combined_data_4.txt',
    )
]

In [16]:
# Stack the four partial frames into one table with a fresh 0..n-1 index.
# pd.concat replaces the chained DataFrame.append calls: append was removed in
# pandas 2.0, and each call re-copied everything accumulated so far.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.head(10)

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3.0,2005-09-06
1,822109,1,5.0,2005-05-13
2,885013,1,4.0,2005-10-19
3,30878,1,4.0,2005-12-26
4,823519,1,3.0,2004-05-03
5,893988,1,3.0,2005-11-17
6,124105,1,4.0,2004-08-05
7,1248029,1,3.0,2004-04-22
8,1842128,1,4.0,2004-05-09
9,2238063,1,3.0,2005-05-11


# Credibility

In [17]:
# Keep only ratings that involve both one of the most active users and one of
# the most-rated movies, so the model is trained on the densest part of the data.
TOP_N_USERS = 10000
TOP_N_MOVIES = 2000

# Number of ratings per user / per movie, largest first, truncated to the top N.
user_counts = df.groupby('Cust_Id')['Rating'].count()
top_users = user_counts.sort_values(ascending=False)[:TOP_N_USERS]

movie_counts = df.groupby('Movie_Id')['Rating'].count()
top_movies = movie_counts.sort_values(ascending=False)[:TOP_N_MOVIES]

# Inner joins act as the filter. The count columns get distinct names; reusing
# one suffix for both joins would leave two columns with the same label.
lite_rating_df = (
    df
    .join(top_users.rename('User_Rating_Count'), on='Cust_Id', how='inner')
    .join(top_movies.rename('Movie_Rating_Count'), on='Movie_Id', how='inner')
)

# SVD + evaluation

In [18]:
# Fixed seed for both the train/test split and the SVD model, so the reported
# RMSE/MAE are reproducible across Restart & Run All.
SEED = 42

# Build a Surprise dataset from the (user, item, rating) columns and hold out
# 10% of the ratings for evaluation.
reader = Reader()
data = Dataset.load_from_df(lite_rating_df[['Cust_Id', 'Movie_Id', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=.1, random_state=SEED)
svd = SVD(random_state=SEED)
svd.fit(trainset)

# Score the held-out ratings; rmse() and mae() print their result.
predictions = svd.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.9641
MAE:  0.7671


0.767102308313322

# Recommending

In [19]:
# List the working directory to confirm movie_titles.csv is present before loading it.
!ls

 combined_data_1.txt  'kaggle (1).json'   netflix-prize-data.zip   sample_data
 combined_data_2.txt  'kaggle (2).json'   probe.txt
 combined_data_3.txt   kaggle.json	  qualifying.txt
 combined_data_4.txt   movie_titles.csv   README


In [20]:
# Load the movie catalogue. The file has no header row, so the three column
# names are supplied here; it is decoded as ISO-8859-1.
# NOTE(review): a title containing an unquoted comma would produce more than
# three fields on its row — confirm how the installed pandas handles that.
df_title = pd.read_csv(
    'movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
)
df_title.head(10)

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
6,7,1992.0,8 Man
7,8,2004.0,What the #$*! Do We Know!?
8,9,1991.0,Class of Nuke 'Em High 2
9,10,2001.0,Fighter


In [21]:
# Score every catalogue title for one user and show the ten highest estimates.
#
# The id is a string on purpose: readFile() stores Cust_Id as str, and the
# model looks raw ids up by equality with what it was trained on. An int id
# would never match a known user, so the ranking would not depend on the user.
USER_ID = '3078'

titles = df_title.copy()
titles['Estimate_Score'] = titles['Movie_Id'].apply(lambda x: svd.predict(USER_ID, x).est)
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)
titles.head(10)

Unnamed: 0,Movie_Id,Year,Name,Estimate_Score
12,13,2003.0,Lord of the Rings: The Return of the King: Ext...,4.281787
9235,9236,1998.0,South Park: Season 2,4.184548
4508,4509,1977.0,Little House on the Prairie: Season 4,3.989838
4520,4521,2002.0,Wire in the Blood: Justice Painted Blind,3.987855
13379,13380,1949.0,Stray Dog,3.882851
4505,4506,1961.0,Breakfast at Tiffany's,3.870972
13377,13378,1940.0,His Girl Friday,3.827609
13376,13377,1963.0,Winter Light,3.82029
24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,3.813174
13373,13374,1933.0,Dinner at Eight,3.783715
