In [None]:
# importing required libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score

In [None]:
# Load the MovieLens dataset; both CSV files are read relative to the working directory
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Preview the first rows of the movies table
movies.head()

In [None]:
# Count the missing values in each column of the movies table
movies.isna().sum()

In [None]:
# Preview the first rows of the ratings table
ratings.head()

In [None]:
# Count the missing values in each column of the ratings table
ratings.isna().sum()

### Merge both dataframes on movieId

In [None]:
# Inner-join the ratings onto the movie metadata through the shared movieId key
final_df = movies.merge(ratings, on='movieId')
final_df.head()

In [None]:
# (rows, columns) of the merged frame
final_df.shape

In [None]:
# The timestamp column is not required for the analysis, so leave it out
data = final_df.drop(columns='timestamp')
data.head()

In [None]:
# (rows, columns) after dropping the timestamp column
data.shape

In [None]:
# include='all' reports the non-numeric columns as well as the numeric ones
data.describe(include='all') # summary statistics of data

In [None]:
# Print column dtypes, non-null counts and memory usage
data.info()

In [None]:
# Number of distinct users in the data.
# nunique is a method and has to be called: without the parentheses the cell
# only displays the bound-method repr, not the count.
data['userId'].nunique()

### Visual Representation of data

In [None]:
# Distribution of the ratings that users gave to 'Toy Story'
movie_grp = data.groupby('title')
toy_story = movie_grp.get_group('Toy Story (1995)')

# Draw on an explicit Axes instead of relying on pyplot's current figure
fig, ax = plt.subplots(figsize=(8, 6))
toy_story['rating'].hist(ax=ax)
ax.set_title('User rating of the movie "Toy Story"')
ax.set_xlabel('rating')
ax.set_ylabel('number of users')
plt.show()

In [None]:
# Rating distribution for rows whose genres value is 'Drama'
# NOTE(review): get_group('Drama') matches only the exact string 'Drama'. If the
# genres column holds '|'-separated lists, titles tagged with Drama plus other
# genres are left out of this plot — confirm that is intended.
movie_grp = data.groupby('genres')
genre_grp = movie_grp.get_group('Drama')

# Draw on an explicit Axes instead of relying on pyplot's current figure
fig, ax = plt.subplots(figsize=(8, 6))
genre_grp['rating'].hist(ax=ax)
ax.set_title('User rating as per genre "Drama"')
ax.set_xlabel('rating')
ax.set_ylabel('number of users')
plt.show()

In [None]:
# Rating distribution for rows whose genres value is 'Comedy'
# NOTE(review): get_group('Comedy') matches only the exact string 'Comedy'. If the
# genres column holds '|'-separated lists, titles tagged with Comedy plus other
# genres are left out of this plot — confirm that is intended.
movie_grp = data.groupby('genres')
genre_grp = movie_grp.get_group('Comedy')

# Draw on an explicit Axes instead of relying on pyplot's current figure
fig, ax = plt.subplots(figsize=(8, 6))
genre_grp['rating'].hist(ax=ax)
ax.set_title('User rating as per genre "Comedy"')
ax.set_xlabel('rating')
ax.set_ylabel('number of users')
plt.show()

In [None]:
# Mean rating for every movie title
rating_avg = data.groupby('title')['rating'].agg('mean')
rating_avg.head()

In [None]:
# Sort the average ratings in descending order so the highest-rated titles come first
rating_avg = rating_avg.sort_values(ascending=False)
rating_avg.head()

### Rating count/number of ratings for the movies

In [None]:
# Number of ratings per title, most-rated titles first
rating_count = (
    data.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
)
rating_count.head(25)

In [None]:
# One row per title: the mean rating plus the number of ratings behind it
rating_avg_count = rating_avg.to_frame()
# Column assignment aligns the counts on the title index
rating_avg_count['number_of_ratings'] = rating_count
rating_avg_count.head()

In [None]:
# Summary statistics of the per-title mean rating and rating count
rating_avg_count.describe()

### Top 25 movies by average rating, excluding movies with fewer than 10 ratings

In [None]:
# Keep titles that have at least 10 ratings. The section heading excludes only
# movies with fewer than 10, so a title with exactly 10 ratings has to stay
# (a strict `> 10` comparison would drop it).
filter_data = rating_avg_count[rating_avg_count['number_of_ratings'] >= 10]

# rating_avg_count inherits the descending order of rating_avg, so the first
# 25 rows are the 25 best-rated titles that pass the filter.
filter_data.head(25)

### Ratings for all movies reviewed by userId 475

In [None]:
# Every rating row that belongs to userId 475
user_475 = data.loc[data['userId'].eq(475)]
user_475

### Feature Engineering

In [None]:
# Most frequent genres strings with their row counts (value_counts sorts by count, descending)
data['genres'].value_counts().head()

In [None]:


# All distinct values that occur in the genres column
data['genres'].unique()

### Encoding genres with one hot encoding

In [None]:
# One indicator column per genre. get_dummies splits every genres string on
# '|' (its default separator, spelled out here); columns get a 'Genre_' prefix.
genre = data['genres'].str.get_dummies(sep='|').add_prefix('Genre_')
genre.head()

In [None]:
# Append the genre indicator columns to the right of the original columns
final_data = pd.concat([data, genre], axis='columns')
final_data.head()

In [None]:
# Remove the raw genres string; the Genre_* indicator columns stay in final_data
final_data = final_data.drop(columns='genres')
final_data.head()

In [None]:
# Column labels of the encoded frame
final_data.columns

In [None]:
# dtype of every column in the encoded frame
final_data.dtypes

In [None]:
# Cast the rating column to integers.
# NOTE(review): astype(int) truncates fractional values (e.g. 3.5 becomes 3). If
# ratings.csv contains half-star ratings, this discards information used by
# every later step (pivot table, correlations, regression target) — confirm
# that the truncation is intended.
final_data['rating']=final_data['rating'].astype(int)
final_data['rating']

In [None]:
# Re-check the dtypes after the rating column was cast
final_data.dtypes

In [None]:
# userId x title matrix of ratings; user/title pairs without a rating are NaN.
# pivot_table aggregates duplicate pairs with the mean (its default aggfunc).
moviemat = final_data.pivot_table(values='rating', index='userId', columns='title')
moviemat.head()

In [None]:
# The five titles with the most ratings among the filtered movies
filter_data.sort_values(by='number_of_ratings', ascending=False).iloc[:5]

In [None]:
# Pull out the per-user rating columns of the two reference movies
SilenceLambs_ratings = moviemat.loc[:, 'Silence of the Lambs, The (1991)']
PulpFiction_ratings = moviemat.loc[:, 'Pulp Fiction (1994)']

In [None]:
# First few per-user ratings for Pulp Fiction (NaN where moviemat has no rating)
PulpFiction_ratings.head()

In [None]:
# First few per-user ratings for The Silence of the Lambs (NaN where moviemat has no rating)
SilenceLambs_ratings.head()

In [None]:
# Correlate every title's rating column with the ratings of each reference movie
similar_to_Silencelambs = moviemat.corrwith(other=SilenceLambs_ratings)
similar_to_Pulpfiction = moviemat.corrwith(other=PulpFiction_ratings)

In [None]:
# First few title-level correlations with Pulp Fiction
similar_to_Pulpfiction.head()

In [None]:
# First few title-level correlations with The Silence of the Lambs
similar_to_Silencelambs.head()

In [None]:
# Wrap the Pulp Fiction correlations in a one-column frame and discard the
# titles whose correlation is NaN
corr_Pulpfiction = pd.DataFrame(similar_to_Pulpfiction, columns=['Correlation']).dropna()
corr_Pulpfiction.head()

In [None]:

### Separating X and Y variables
# Feature matrix: every column except the target and the id/title columns
X = final_data.drop(columns=['rating', 'title', 'movieId', 'userId'])
X

In [None]:
# Target vector: the rating column
y=final_data['rating']
y

In [None]:
# Shapes of the feature matrix and the target vector
X.shape,y.shape

In [None]:

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Linear Regression

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator itself
linreg = LinearRegression().fit(X_train, y_train)
linreg

In [None]:
# Predict ratings for the held-out test rows
y_pred=linreg.predict(X_test)
y_pred

In [None]:
# Evaluation metric: R^2 of the predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Actual test ratings next to the model's predictions, on y_test's index
pred_df = y_test.to_frame(name='Test').assign(Prediction=y_pred)
pred_df.head()