# Import Libraries

In [219]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
#Import svm model
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

# Load Dataset

In [220]:
# read the rating csv file
rating_file=pd.read_csv('/content/file.tsv' , sep='\t' , names=['user_id', 'movieId', 'rating', 'timestamp'])
rating_file

Unnamed: 0,user_id,movieId,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [221]:
# read the movie csv files
title_file=pd.read_csv('/content/movies.csv')
print(title_file)

       movieId                                title  \
0            1                     Toy Story (1995)   
1            2                       Jumanji (1995)   
2            3              Grumpier Old Men (1995)   
3            4             Waiting to Exhale (1995)   
4            5   Father of the Bride Part II (1995)   
...        ...                                  ...   
10324   146684        Cosmic Scrat-tastrophe (2015)   
10325   146878           Le Grand Restaurant (1966)   
10326   148238       A Very Murray Christmas (2015)   
10327   148626                 The Big Short (2015)   
10328   149532  Marco Polo: One Hundred Eyes (2015)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                  

In [222]:
# merge read the csv file
movie_dataset=pd.merge(title_file , rating_file , on='movieId')

In [223]:
# first five rows read
movie_dataset.head()

Unnamed: 0,movieId,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,308,4,887736532
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,287,5,875334088
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,148,4,877019411
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,280,4,891700426
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,66,3,883601324


In [224]:
# shape
movie_dataset.shape

(84243, 6)

# Data Preprocessing

In [225]:
# identify the null values
movie_dataset.isnull().sum()

movieId      0
title        0
genres       0
user_id      0
rating       0
timestamp    0
dtype: int64

In [226]:
# info
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84243 entries, 0 to 84242
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movieId    84243 non-null  int64 
 1   title      84243 non-null  object
 2   genres     84243 non-null  object
 3   user_id    84243 non-null  int64 
 4   rating     84243 non-null  int64 
 5   timestamp  84243 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.5+ MB


In [227]:
# replace the no gerra listed into null values
for x in movie_dataset['genres']:
  if x == '(no genres listed)' :
    movie_dataset['genres']=movie_dataset['genres'].replace(x , np.nan)

print(movie_dataset)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            1                    Toy Story (1995)   
2            1                    Toy Story (1995)   
3            1                    Toy Story (1995)   
4            1                    Toy Story (1995)   
...        ...                                 ...   
84238     1676            Starship Troopers (1997)   
84239     1678           Joy Luck Club, The (1993)   
84240     1680                Sliding Doors (1998)   
84241     1681  Mortal Kombat: Annihilation (1997)   
84242     1682             Truman Show, The (1998)   

                                            genres  user_id  rating  timestamp  
0      Adventure|Animation|Children|Comedy|Fantasy      308       4  887736532  
1      Adventure|Animation|Children|Comedy|Fantasy      287       5  875334088  
2      Adventure|Animation|Children|Comedy|Fantasy      148       4  877019411  
3      Adventure|Animation|

In [228]:
# drop the null values
movie_dataset=movie_dataset.dropna()
movie_dataset.shape

(84243, 6)

In [229]:
# again identify the null values
movie_dataset.isnull().sum()

movieId      0
title        0
genres       0
user_id      0
rating       0
timestamp    0
dtype: int64

In [230]:
# drops the colmns
movie_dataset=movie_dataset.drop(columns=['movieId' , 'user_id'] , axis=1)

In [231]:
movie_dataset

Unnamed: 0,title,genres,rating,timestamp
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,887736532
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,875334088
2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,877019411
3,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,891700426
4,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,883601324
...,...,...,...,...
84238,Starship Troopers (1997),Action|Sci-Fi,2,875731674
84239,"Joy Luck Club, The (1993)",Drama|Romance,1,889289570
84240,Sliding Doors (1998),Drama|Romance,2,889289570
84241,Mortal Kombat: Annihilation (1997),Action|Adventure|Fantasy,3,887160722


In [232]:
# label encoding
movie_dataset['genres']=le.fit_transform(movie_dataset['genres'])
movie_dataset['title']=le.fit_transform(movie_dataset['title'])

In [233]:
# split the dataset into target and trainig
x=movie_dataset.drop(columns=['rating'] , axis=1)
y=movie_dataset['rating']

In [234]:
x=np.array(x)
y=np.array(y)

In [235]:
# splitting the dataset into testing and training
xtrain , xtest , ytrain , ytest=train_test_split(x , y , test_size=0.25)

In [236]:
# shape
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(63182, 3)
(21061, 3)
(63182,)
(21061,)


# Model Building

In [237]:
#Train the model using the training sets
lr.fit(xtrain, ytrain)

In [238]:
# score
score=lr.score(xtrain, ytrain)
print(score)

0.34468677787977586


In [239]:
#Predict the response for test dataset
y_pred = lr.predict(xtest)
print(y_pred)

[4 4 4 ... 4 4 4]


In [240]:
# means absolute error
mae=np.abs(ytest , y_pred)
print(mae)

[2 3 5 ... 5 4 4]
