In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from numpy import set_printoptions
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from scipy.stats import norm
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

In [3]:
ratings = pd.read_csv('/content/drive/MyDrive/Recommender/dataset/ml-1m/ratings.csv')
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies = pd.read_csv('/content/drive/MyDrive/Recommender/dataset/ml-1m/movies.csv')
print(movies.shape)
movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movie_dataset = movies[['movieId','title']]
movie_dataset.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [6]:
users = pd.read_csv('/content/drive/MyDrive/Recommender/dataset/ml-1m/users.csv')
print(users.shape)
users.head()

(3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
merged_dataset = pd.merge(ratings, movie_dataset, how='inner', on='movieId')
merged_dataset.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


In [8]:
# displaying the datatypes
display(merged_dataset.dtypes)

# converting 'Weight' from float to int
merged_dataset['rating'] = merged_dataset['rating'].astype(int)

# displaying the datatypes
display(merged_dataset.dtypes)


userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
dtype: object

userId        int64
movieId       int64
rating        int64
timestamp     int64
title        object
dtype: object

In [9]:
merged_dataset.shape

(100836, 5)

# ایجاد یک مجموعه داده نهایی تصفیه شده با شناسه کاربری منحصر به فرد، ترکیب نام فیلم و رتبه بندی آنها:

In [10]:
refined_dataset = merged_dataset.groupby(by=['userId','title'], as_index=False).agg({"rating":"mean"})

refined_dataset.head()

Unnamed: 0,userId,title,rating
0,1,"13th Warrior, The (1999)",4.0
1,1,20 Dates (1998),4.0
2,1,"Abyss, The (1989)",4.0
3,1,"Adventures of Robin Hood, The (1938)",5.0
4,1,Alice in Wonderland (1951),5.0


In [12]:
from sklearn.preprocessing import LabelEncoder
user_enc = LabelEncoder()
refined_dataset['user'] = user_enc.fit_transform(refined_dataset['userId'].values)
n_users = refined_dataset['user'].nunique()

In [14]:
item_enc = LabelEncoder()
refined_dataset['movie'] = item_enc.fit_transform(refined_dataset['title'].values)
n_movies = refined_dataset['movie'].nunique()

In [16]:
refined_dataset['rating'] = refined_dataset['rating'].values.astype(np.float32)
min_rating = min(refined_dataset['rating'])
max_rating = max(refined_dataset['rating'])
n_users, n_movies, min_rating, max_rating

(610, 9719, 0.0, 5.0)

In [17]:
refined_dataset.head()

Unnamed: 0,userId,title,rating,user,movie
0,1,"13th Warrior, The (1999)",4.0,0,48
1,1,20 Dates (1998),4.0,0,66
2,1,"Abyss, The (1989)",4.0,0,202
3,1,"Adventures of Robin Hood, The (1938)",5.0,0,245
4,1,Alice in Wonderland (1951),5.0,0,325


In [18]:
print(refined_dataset.info())
print(refined_dataset.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100832 entries, 0 to 100831
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  100832 non-null  int64  
 1   title   100832 non-null  object 
 2   rating  100832 non-null  float32
 3   user    100832 non-null  int64  
 4   movie   100832 non-null  int64  
dtypes: float32(1), int64(3), object(1)
memory usage: 3.5+ MB
None
(100832, 5)


## Splitting the data into training and testing

In [20]:
from sklearn.model_selection import train_test_split
X = refined_dataset[['user', 'movie']].values
y = refined_dataset['rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=50)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((90748, 2), (10084, 2), (90748,), (10084,))

In [22]:
from sklearn.naive_bayes import MultinomialNB
NV = MultinomialNB(alpha=1.0, fit_prior=True)
NV.fit(X_train, y_train)
y_pred_nv = NV.predict(X_test)
print(y_pred_nv[:20])
accuracy = NV.score(X_test, y_test)
print(f'The accuracy is: {accuracy*100 :.1f}%')


[4. 5. 1. 5. 1. 1. 1. 1. 5. 5. 1. 2. 5. 1. 1. 1. 1. 5. 1. 5.]
The accuracy is: 73.1%
