In [1]:
pip install pandas numpy scikit-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
#Step 1 : Import necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score,accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Step 2 : Load the Dataset
file_path = "C://Users//shrav//Desktop//CodeSoft//archive (5)//IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
#Step 3 : Data Exploration and Preprocessing
#checking for missing values
x = df.isnull().sum()
print(x)

#handling the missing values

#Name --> no missing values
#Step 3.1 : Year --> 528 missing values --> fill with most common year (median)
#obstacle --> some values are written as strings instead od integer
df['Year'] = df['Year'].astype(str) #first convert Year column to string
df['Year'] = df['Year'].str.extract(r'(\d+)') #extract digits
df['Year'] = pd.to_numeric(df['Year'], errors='coerce') # Convert to numeric, coercing errors to NaN
df["Year"] = df["Year"].fillna(df["Year"].median())

#Step 3.2 : Duration --> 8269 missing values --> fill with most common value of duration i.e.median
#obstacle --> some values are like "30 min",etc i.e. in string format
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False)  # Remove " min" text
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')  # Convert to numeric, coercing errors to NaN
df["Duration"] = df["Duration"].fillna(df["Duration"].median())

#Step 3.3 : Genre --> 1877 missing values --> fill with most frequently genre i.e.mode
df["Genre"] = df["Genre"].fillna(df["Genre"].mode()[0])

#Step 3.4 : Rating --> 7590 missing values --> fill with median rating i.e.median
df["Rating"] = df["Rating"].fillna(df["Rating"].median())

#Step 3.5 : Votes --> 7589 missing values --> fill with median number of votes i.e.median
#obstacle --> some values are written as strings instead of integer like nana '8',etc
df['Votes'] = df['Votes'].astype(str) #first convert Votes column to string
df['Votes'] = df['Votes'].str.extract(r'(\d+)') #extract digits
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce') # Convert to numeric, coercing errors to NaN
df["Votes"] = df["Votes"].fillna(df["Votes"].median())

#Step 3.6 : Director --> 525 missing values --> fill with "unknown"
df["Director"] = df["Director"].fillna("unknown")

#Step 3.7 : Actor 1, Actor 2, Actor 3 --> 1617, 2384, 3144 missing values --> fill with "unknown"
df["Actor 1"] = df["Actor 1"].fillna("unknown")
df["Actor 2"] = df["Actor 2"].fillna("unknown")
df["Actor 3"] = df["Actor 3"].fillna("unknown")

print("Status of the dataset after handling the missing values")
print(df.isnull().sum())

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64
Status of the dataset after handling the missing values
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64


In [5]:
#Step 4 : Split the data into into features(x) and target(y)
#split the features into numerical and categorical features
numerical_features = ['Year','Duration','Votes']
categorical_features = ['Genre','Director','Actor 1','Actor 2','Actor 3']

#define transformers for numerical and catagorical features
numerical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')), #impute missing value with median value
    ('scaler',StandardScaler()) #Scale numeric values
])

categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant',fill_value='unknown')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
#Step 5 : Combine both transformer into a column transformer
preprocessor = ColumnTransformer( 
    transformers = [
        ('num',numerical_transformer, numerical_features),
        ('cat',categorical_transformer, categorical_features)
    ])

In [7]:
#Step 6 : Spliting the data into training and testing sets (80% and 20%)
df_clean = df.dropna(subset=['Rating'])

# Split features (X) and target (y)
X = df_clean.drop('Rating', axis=1)
y = df_clean['Rating']

# Verify that the number of rows in X and y match
print(X.shape)  # Should match the number of rows in y
print(y.shape)  # Should be the same as X

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

model = Pipeline(steps=[
    ('preprocessing',preprocessor),
    ('regression', RandomForestRegressor(n_estimators=100,random_state = 42))
])

(15509, 9)
(15509,)
(12407, 9) (12407,)
(3102, 9) (3102,)


In [8]:
#Step 7 : Train the model
model.fit(X_train,y_train)

In [9]:
#Step 8 : Make Predictions
y_pred = model.predict(X_test)

In [10]:
#Step 9 : Evaluate the model
mae = mean_absolute_error(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)
print(f"Mean Absolute Error : {mae}")
print(f"Mean Squared Error : {mse}")
print(f"R^2 Score : {r2}")

Mean Absolute Error : 0.44247807865892974
Mean Squared Error : 0.6902980705996131
R^2 Score : 0.2901160517280247


In [11]:
#Evaluate the model based on it's accuracy score
# Ensure y_test and y_pred are integers
y_test = y_test.astype(int)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy of Model : {accuracy*100} %")

Accuracy of Model : 63.60412637008381 %
