# Importing necessary packages


In [93]:
# For data manipulation
import pandas as pd
import numpy as np

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR
from sklearn import tree


# Loading the data set using the pandas dataframe

The data can be obtained in the csv format available at the following URL 

http://bit.ly/imdbratings

In [78]:
# reading the csv file

movies=pd.read_csv('http://bit.ly/imdbratings')

In [79]:
# Taking a peek over the first five rows of our dataset

movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [80]:
# Lets look at the column names of this dataframe

movies.columns


Index(['star_rating', 'title', 'content_rating', 'genre', 'duration',
       'actors_list'],
      dtype='object')

In [81]:
# lets look whether there are any missing values in our data

movies.isnull().sum()

star_rating       0
title             0
content_rating    3
genre             0
duration          0
actors_list       0
dtype: int64

We observe that the content rating field has 3 NaN (missing values)

In [82]:
# Lets access row number the missing values

content_rating_null_values=list(movies.content_rating.isnull())

for i in range(len(content_rating_null_values)):
    if content_rating_null_values[i]==True:
        print(i)

187
649
936


I looked up for the specific movies from other internet sources and assigned the specific ratings to these missing values

In [83]:
# filling these missing values at these specific row indicies


movies.iloc[187,2]='PG13'
movies.iloc[649,2]='PG'
movies.iloc[936,2]='PG13'

# Feature Engineering and Preprocessing

We drop the features title and actors_list for now because we are more interested in the other features

In [84]:
movies.drop(['title'],axis=1,inplace=True)
movies.drop(['actors_list'],axis=1,inplace=True)


We now create a list of those columns which have categorical values i.e object datatypes

In [85]:
# list of categorical features

categorical_features=[i for i in movies.select_dtypes(include=np.object) ]


We create a new dataframe only add the features we want

In [86]:
dummy_df=pd.DataFrame()

In [87]:
# adding the duration column to the dummy_df
dummy_df['duration']=movies.duration

Next we one hot encode the categorical features to prepare the dataset for the Machine Learning models

In [88]:
# One hot encoding  

for feature in categorical_features:
    df=pd.get_dummies(movies[feature])

In [89]:

# Concatenating the encoded data frame with dummy_df
train_df=pd.concat([df,dummy_df],axis=1)

In [90]:
# Our data is prepared

train_df.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Mystery,Sci-Fi,Thriller,Western,duration
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,142
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,175
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,200
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,152
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,154


In [91]:
# We also include the column which is target variable in this problem

train_df=pd.concat([train_df,movies['star_rating']],axis=1)


In [92]:
# Shape of our prepared dataset
train_df.shape

(979, 18)

In [96]:
# Preparing the input and output arrays

x=train_df.drop(['star_rating'],axis=1)
y=train_df['star_rating']

In [97]:
# Splitting the data for training and testing

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)

# Linear Regression

In [98]:
# linear regression
LR=LinearRegression()


In [99]:
# fitting the model
LR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [100]:
# prediction on the testing data
y_pred=LR.predict(X_test)

In [101]:
# Evaluating the model by calculating the RMSE
print('RMSE using Linear regression is' ,metrics.mean_squared_error(y_test, y_pred,sample_weight=None))

RMSE using Linear regression is 0.09639808803214592


# SVM

In [102]:
sv=SVR()

In [103]:
# fitting the model
sv.fit(X_train,y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [104]:
sv_pred=sv.predict(X_test)


In [114]:
print('RMSE using SVR is' ,metrics.mean_squared_error(y_test, sv_pred,sample_weight=None))

RMSE using SVR is 0.12159429508846027


# Decision Tree

In [105]:
clf = tree.DecisionTreeRegressor()

In [106]:
clf.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [107]:
DT_pred=clf.predict(X_test)

In [108]:
print('RMSE using DT is' ,metrics.mean_squared_error(y_test, DT_pred,sample_weight=None))

RMSE using DT is 0.19240223922902494


It can be observed the Linear regression model is better performing in this case compared to the others