In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [2]:
movie_data = pd.read_csv('IMDb Movies India.csv', encoding='iso-8859-1')

In [3]:
movie_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
movie_data.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [5]:
# Check for missing values
missing_values = movie_data.isnull()

In [6]:
movie_data['Rating'].fillna(movie_data['Rating'].mean(), inplace=True)

In [7]:
# Drop a column with many missing values
movie_data.drop(columns=['Votes'], inplace=True)
movie_data.drop(columns=['Duration'], inplace=True)

# Drop rows with any missing values
movie_data.dropna(axis=0, inplace=True)

In [8]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Encode the 'Director' column
movie_data['Director'] = label_encoder.fit_transform(movie_data['Director'])

# Encode the 'Actor1' column
movie_data['Actor 1'] = label_encoder.fit_transform(movie_data['Actor 1'])
# Encode the 'Actor2' column
movie_data['Actor 2'] = label_encoder.fit_transform(movie_data['Actor 2'])
# Encode the 'Actor3' column
movie_data['Actor 3'] = label_encoder.fit_transform(movie_data['Actor 3'])

In [9]:
# Feature selection and preprocessing (example)
X = movie_data[['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
X = pd.get_dummies(X, drop_first=True)  # One-hot encoding for categorical variables
y = movie_data['Rating']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create a SimpleImputer instance
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the target variable and transform it
y_train_imputed = imputer.fit_transform(y_train.values.reshape(-1, 1))

# Convert the imputed values back to a 1D array
y_train_imputed = y_train_imputed.ravel()

In [11]:
# Create and train the model
model = RandomForestRegressor(n_estimators=100, random_state=5)
model.fit(X_train, y_train_imputed)

In [12]:
# Make predictions
y_pred = model.predict(X_test)

In [13]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.2081994492076134
R-squared: 0.06433708230855562
