Data Loading and Exploration

In [4]:
import pandas as pd

# Load the dataset, specifying the encoding
movie_data = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin-1') # Try 'latin-1' encoding. If it doesn't work, experiment with other encodings like 'iso-8859-1'.

# Display the first few rows of the dataset
print(movie_data.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

Data Preprocessing

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Handle missing values (example: filling missing values with mode)
movie_data.fillna(movie_data.mode().iloc[0], inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
movie_data['Genre'] = label_encoder.fit_transform(movie_data['Genre'])
movie_data['Director'] = label_encoder.fit_transform(movie_data['Director'])

# Assuming you want to encode 'Actor 1', 'Actor 2', and 'Actor 3' separately:
movie_data['Actor 1'] = label_encoder.fit_transform(movie_data['Actor 1'])
movie_data['Actor 2'] = label_encoder.fit_transform(movie_data['Actor 2'])
movie_data['Actor 3'] = label_encoder.fit_transform(movie_data['Actor 3'])

# Select features and target variable
# Include all actor columns in the features
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
X = movie_data[features]  # Select the features using a list of column names
y = movie_data['Rating']

In [19]:
# Extract year from release date if available
# Remove parentheses and convert to numeric
movie_data['Year'] = movie_data['Year'].str.replace(r'[()]', '', regex=True).astype(float)
movie_data['release_year'] = pd.to_datetime(movie_data['Year'], format='%Y').dt.year

In [22]:
# Select features and target variable
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'release_year']
X = movie_data[features]
y = movie_data['Rating']


Model Training

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


Model Evaluation

In [24]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# R-squared Score
r2 = r2_score(y_test, y_pred)
print(f'R-squared Score: {r2}')


Mean Squared Error: 0.8796762509468218
R-squared Score: 0.12284186932722085
