In [141]:
import  numpy as np  
import pandas as pd  
import seaborn as sns  
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder  
from sklearn.svm import SVR  
from sklearn import metrics  

In [142]:
df=pd.read_csv('Movies.csv', encoding='latin-1')  # reading dataset from CSV file

In [143]:
df.head()  # display first few rows of dataset

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [144]:
df.shape  #returns the number of rows and columns in the DataFrame


(15509, 10)

In [145]:
df.info()  # show dataset info (columns, datatypes, nulls)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [146]:
df.isna().sum() # counts how many missing values each column has

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [147]:
df = df.drop(columns=['Duration'], axis=1)  # remove the 'Duration' column from the dataframe

In [148]:
df.shape  #returns the number of rows and columns in the DataFrame

(15509, 9)

In [149]:
df.describe()  # statistical summary of dataset

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [150]:
print(df.columns)  

Index(['Name', 'Year', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Actor 3'],
      dtype='object')


In [151]:
df.fillna(method='ffill', inplace=True) # fill missing values by forward filling

In [152]:
df['Genre']=pd.to_numeric(df['Genre'],errors='coerce') # convert 'Genre' column to numeric; invalid values become NaN


In [153]:
df['Actor 1']=pd.to_numeric(df['Actor 1'],errors='coerce')  # convert 'Actor 1' column to numeric; invalid values become NaN


In [154]:
df['Actor 2']=pd.to_numeric(df['Actor 2'],errors='coerce') # convert 'Actor 2' column to numeric; invalid values become NaN


In [155]:
df['Actor 3']=pd.to_numeric(df['Actor 3'],errors='coerce') # convert 'Actor 3' column to numeric; invalid values become NaN


In [156]:
df['Director']=pd.to_numeric(df['Director'],errors='coerce')  # convert 'Actor 3' column to numeric; invalid values become NaN


In [157]:
label_encoders = {} # create an empty dictionary to store LabelEncoders for each column
for column in ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]:  
    label_encoders[column] = LabelEncoder()  # initialize a LabelEncoder for the column and store it in the dictionary
    df[column] = label_encoders[column].fit_transform(df[column])   # encode categorical text values in the column into numeric labels

In [158]:

features = ['Genre','Actor 1','Actor 2','Actor 3', 'Director']  # list of feature columns to use for prediction
X = df[features]  # select the feature columns from the dataframe
Y = df['Rating']  # select the target column (Rating) to predict

In [159]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)  # splitting dataset into training and testing sets

In [160]:
model = SVR(kernel="linear")  # create a Support Vector Regression model with a linear kernel
model.fit(X_train, Y_train)  # train the model on training data

In [161]:

X_test_no_nan = X_test.dropna()  # remove rows with missing values from test features
Y_test_no_nan = Y_test.dropna()  # remove rows with missing values from test target

In [162]:
pred = model.predict(X_test_no_nan)  # make predictions using trained model

In [163]:

min_samples = min(len(Y_test_no_nan), len(pred)) # find the smaller length between actual and predicted values to match sizes
Y_test_no_nan = Y_test_no_nan[:min_samples]  # trim actual target values to match the size of predictions
pred = pred[:min_samples]  # trim predicted values to match the size of actual target values


In [164]:

mse = metrics.mean_squared_error(Y_test_no_nan, pred)  # calculate Mean Squared Error between actual and predicted values
rmse = np.sqrt(mse)  # calculate Root Mean Squared Error
mae = metrics.mean_absolute_error(Y_test_no_nan, pred)  # calculate Mean Absolute Error between actual and predicted values
msle = metrics.mean_squared_log_error(Y_test_no_nan, pred)  # calculate Mean Squared Logarithmic Error for actual vs predicted values

print("Mean Squared Error:", mse)  
print("Root Mean Squared Error:", rmse)  
print("Mean Absolute Error:", mae)  
print("Mean Squared Logarithmic Error:", msle)  


Mean Squared Error: 1.9479107557389734
Root Mean Squared Error: 1.3956757344523023
Mean Absolute Error: 1.1172814031467628
Mean Squared Logarithmic Error: 0.05223232272222806
