In [29]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [30]:
# Load the movies dataset from the working directory, then show its (rows, columns).
df = pd.read_csv(filepath_or_buffer='movies.csv')
df.shape

(7668, 15)

In [31]:
# Preview the first ten rows to sanity-check column names and value formats.
df.head(n=10)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
5,Friday the 13th,R,Horror,1980,"May 9, 1980 (United States)",6.4,123000.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,United States,550000.0,39754601.0,Paramount Pictures,95.0
6,The Blues Brothers,R,Action,1980,"June 20, 1980 (United States)",7.9,188000.0,John Landis,Dan Aykroyd,John Belushi,United States,27000000.0,115229890.0,Universal Pictures,133.0
7,Raging Bull,R,Biography,1980,"December 19, 1980 (United States)",8.2,330000.0,Martin Scorsese,Jake LaMotta,Robert De Niro,United States,18000000.0,23402427.0,Chartoff-Winkler Productions,129.0
8,Superman II,PG,Action,1980,"June 19, 1981 (United States)",6.8,101000.0,Richard Lester,Jerry Siegel,Gene Hackman,United States,54000000.0,108185706.0,Dovemead Films,127.0
9,The Long Riders,R,Biography,1980,"May 16, 1980 (United States)",7.0,10000.0,Walter Hill,Bill Bryden,David Carradine,United States,10000000.0,15795189.0,United Artists,100.0


In [32]:
# Summarise each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [33]:
# Count missing values per column (the boolean null mask is summed down the rows).
df.isnull().sum(axis=0)

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

In [34]:
# Build the modelling frame from `df` without the columns that are not used downstream.
# `df` itself is left untouched because `drop` returns a new frame.
model_data = df.drop(
    columns=['name', 'year', 'released', 'budget', 'gross'],
)

In [35]:
# Check the (rows, columns) of the modelling frame after the column drop.
model_data.shape

(7668, 10)

In [36]:
# Count the missing values that remain in each column of the modelling frame.
model_data.isnull().sum(axis=0)

rating      77
genre        0
score        3
votes        3
director     0
writer       3
star         1
country      3
company     17
runtime      4
dtype: int64

In [37]:
# Keep only the rows that have no missing value in any remaining column.
model_data = model_data.dropna(axis=0, how='any')

In [38]:
# Check how many rows survive after removing rows with missing values.
model_data.shape

(7574, 10)

In [39]:
# Encoder instance used by the encoding loop to turn string categories into integer codes.
label_encoder = LabelEncoder()

In [40]:
# Categorical (string) columns that are replaced by integer codes for modelling.
columns = ['rating','genre','director','writer','star','country','company']

# `label_encoder` is refit on every column, so on its own it only remembers the mapping
# of the LAST column. A fitted snapshot is kept per column so that any encoded column
# (including the target `rating`) can be decoded later with
# `label_encoders[col].inverse_transform(...)`.
label_encoders = {}

for col in columns:
    model_data[col] = label_encoder.fit_transform(model_data[col])
    # Deep copy: the shared encoder's fitted state is overwritten on the next iteration.
    label_encoders[col] = copy.deepcopy(label_encoder)

In [41]:
# Preview the first ten rows of the encoded frame; the categorical columns now hold integer codes.
model_data.head(n=10)

Unnamed: 0,rating,genre,score,votes,director,writer,star,country,company,runtime
0,6,6,8.4,927000.0,2544,3958,1022,54,2273,146.0
1,6,1,5.8,65000.0,2233,1612,316,55,710,104.0
2,4,0,8.7,1200000.0,1093,2534,1708,55,1505,124.0
3,4,4,7.7,221000.0,1279,1975,2197,55,1769,88.0
4,6,4,7.3,108000.0,1037,511,398,55,1736,98.0
5,6,9,6.4,123000.0,2486,4304,234,55,1769,95.0
6,6,0,7.9,188000.0,1390,821,1250,55,2236,133.0
7,6,3,8.2,330000.0,1799,1747,2191,55,612,129.0
8,4,0,6.8,101000.0,2267,1955,855,55,861,127.0
9,6,3,7.0,10000.0,2816,413,572,55,2227,100.0


In [42]:
# Linear regression estimator that is fitted on the training split further down.
# NOTE(review): the target selected later is the label-encoded `rating` column, i.e. integer
# codes standing in for categories — a classifier may suit this target better. TODO confirm.
model = LinearRegression()

In [43]:
# Separate the target vector (`rating`) from the feature matrix (every other column).
Y = model_data['rating']
X = model_data.drop(columns=['rating'])

In [44]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [45]:
# Fit the estimator on the training split only; the test split stays unseen.
model.fit(X=X_train, y=Y_train)

In [46]:
# Predict the target for the held-out rows.
Y_pred = model.predict(X=X_test)

In [47]:
# Report error metrics on the held-out split. The MSE is computed once and reused
# for the RMSE instead of being recomputed.
# NOTE(review): Y is the label-encoded `rating` column, so these errors are measured in
# encoder-code units rather than on a natural numeric scale.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)

print('Mean absolute error is:', mae)
print('Mean squared error is:', mse)
print('Root mean squared error is:', rmse)

Mean abolute error is: 0.8494451251706017
Mean squared error is: 1.2876486737862354
Root mean squared error is: 1.1347460833976186
