# Data Loading

In [3]:
# Read the data
import pandas as pd
df = pd.read_csv('/content/IMDb Movies India.csv', encoding='ISO-8859-1')

# Remove rows with NaN values
df.dropna(inplace=True)

df.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


#Feature Generation

## Feature 1: Genre Rating

In [4]:
# Ensure 'Rating' is numeric
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')


# Split genres into individual rows
genres_df = df.assign(Genre=df['Genre'].str.split(',')).explode('Genre')

# Calculate average rating for each genre
genre_popularity = genres_df.groupby('Genre')['Rating'].mean().reset_index()
genre_popularity.columns = ['Genre', 'GenrePopularity']

# Map genre popularity back to the original dataset
def map_genre_popularity(genres):
    # Split genres and calculate average popularity
    genres_list = genres.split(',')
    # Remove leading/trailing spaces from each genre
    genres_list = [genre.strip() for genre in genres_list]
    # Filter genre popularity for these genres
    relevant_popularity = genre_popularity.loc[genre_popularity['Genre'].isin(genres_list)]

    # If no genres match, return NaN
    if relevant_popularity.empty:
        return None

    # Calculate average popularity
    avg_popularity = relevant_popularity['GenrePopularity'].mean()
    return avg_popularity

# Apply the function to each row
df['GenrePopularity'] = df['Genre'].apply(map_genre_popularity)



In [5]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,GenrePopularity
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.718582
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.060892
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,5.393591
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.291441
...,...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,(2015),115 min,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,6.248697
15494,Zubeidaa,(2001),153 min,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,6.723772
15503,Zulm Ki Zanjeer,(1989),125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,5.961802
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.880341


## Feature 2: Actor Popularity

In [6]:

# Ensure 'Rating' is numeric
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Feature: Actor Popularity# Create a DataFrame with actor-rating pairs
actor_ratings = pd.concat([
    df[['Actor 1', 'Rating']].rename(columns={'Actor 1': 'Actor'}),
    df[['Actor 2', 'Rating']].rename(columns={'Actor 2': 'Actor'}),
    df[['Actor 3', 'Rating']].rename(columns={'Actor 3': 'Actor'})
], ignore_index=True)

# Remove rows with missing actor names
actor_ratings = actor_ratings.dropna(subset=['Actor'])

# Calculate actor popularity
actor_popularity = actor_ratings.groupby('Actor')['Rating'].mean().reset_index()
actor_popularity.columns = ['Actor', 'ActorPopularity']

# Merge with original dataset for each actor column
for actor_col in ['Actor 1', 'Actor 2', 'Actor 3']:
    actor_popularity_map = actor_popularity.set_index('Actor')['ActorPopularity']
    df[f'{actor_col}Popularity'] = df[actor_col].map(actor_popularity_map)



In [7]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,GenrePopularity,Actor 1Popularity,Actor 2Popularity,Actor 3Popularity
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697,6.566667,7.000000,7.000000
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.718582,5.666667,4.400000,4.450000
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.060892,4.856757,5.948148,6.646296
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,5.393591,5.723810,5.566667,6.516667
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.291441,5.600000,5.729412,5.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,(2015),115 min,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,6.248697,7.150000,6.000000,5.850000
15494,Zubeidaa,(2001),153 min,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,6.723772,4.566000,5.821739,6.346809
15503,Zulm Ki Zanjeer,(1989),125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,5.961802,6.333333,6.233333,5.954762
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.880341,5.519167,4.925000,5.800000


## Feature 3: Release-year group

In [8]:

# Check the data type of the 'Year' column
print(df['Year'].dtype)

# If 'Year' is not numeric, convert it
if df['Year'].dtype != 'int64':
    df['Year'] = df['Year'].str.replace('(', '').str.replace(')', '').astype(int)

# Feature: Yearly Trend
df['YearGroup'] = pd.cut(df['Year'], bins=[1950, 2000, 2010, 2020], labels=['Pre-2000', '2000-2010', 'Post-2010'], right=False)

# One-hot encode YearGroup
year_groups = pd.get_dummies(df['YearGroup'], prefix='YearGroup').astype(int)

# Merge with original dataset
df = pd.concat([df, year_groups], axis=1)

# Drop the original 'YearGroup' column
df = df.drop(columns=['YearGroup'])



object


In [9]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,GenrePopularity,Actor 1Popularity,Actor 2Popularity,Actor 3Popularity,YearGroup_Pre-2000,YearGroup_2000-2010,YearGroup_Post-2010
1,#Gadhvi (He thought he was Gandhi),2019,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697,6.566667,7.000000,7.000000,0,0,1
3,#Yaaram,2019,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.718582,5.666667,4.400000,4.450000,0,0,1
5,...Aur Pyaar Ho Gaya,1997,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.060892,4.856757,5.948148,6.646296,1,0,0
6,...Yahaan,2005,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,5.393591,5.723810,5.566667,6.516667,0,1,0
8,?: A Question Mark,2012,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.291441,5.600000,5.729412,5.600000,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115 min,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,6.248697,7.150000,6.000000,5.850000,0,0,1
15494,Zubeidaa,2001,153 min,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,6.723772,4.566000,5.821739,6.346809,0,1,0
15503,Zulm Ki Zanjeer,1989,125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,5.961802,6.333333,6.233333,5.954762,1,0,0
15505,Zulmi,1999,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.880341,5.519167,4.925000,5.800000,1,0,0


## Feature 4: Film Duration Impact

In [10]:
# Convert 'Duration' column to numeric by removing non-numeric characters
df['Duration'] = df['Duration'].str.extract('(\d+)').astype(int)

# Feature: Duration Impact
df['DurationGroup'] = pd.cut(df['Duration'], bins=[0, 90, 120, float('inf')], labels=['Short', 'Medium', 'Long'])

# One-hot encode DurationGroup
duration_groups = pd.get_dummies(df['DurationGroup']).astype(int)

# Merge with original dataset
df = pd.concat([df, duration_groups], axis=1)

# Drop the original 'DurationGroup' column
df = df.drop(columns=['DurationGroup'])



In [11]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,GenrePopularity,Actor 1Popularity,Actor 2Popularity,Actor 3Popularity,YearGroup_Pre-2000,YearGroup_2000-2010,YearGroup_Post-2010,Short,Medium,Long
1,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697,6.566667,7.000000,7.000000,0,0,1,0,1,0
3,#Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.718582,5.666667,4.400000,4.450000,0,0,1,0,1,0
5,...Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,6.060892,4.856757,5.948148,6.646296,1,0,0,0,0,1
6,...Yahaan,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,5.393591,5.723810,5.566667,6.516667,0,1,0,0,0,1
8,?: A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,5.291441,5.600000,5.729412,5.600000,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,6.248697,7.150000,6.000000,5.850000,0,0,1,0,1,0
15494,Zubeidaa,2001,153,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,6.723772,4.566000,5.821739,6.346809,0,1,0,0,0,1
15503,Zulm Ki Zanjeer,1989,125,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,5.961802,6.333333,6.233333,5.954762,1,0,0,0,0,1
15505,Zulmi,1999,129,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,5.880341,5.519167,4.925000,5.800000,1,0,0,0,0,1


## Feature 5: Vote Count Impact

In [12]:
# Convert 'Votes' column to numeric by removing non-numeric characters
df['Votes'] = df['Votes'].str.replace(',', '').str.extract('(\d+)').astype(float)

# Feature: Vote Count Impact
df['VoteGroup'] = pd.cut(df['Votes'], bins=[0, 100, 1000, float('inf')], labels=['Low', 'Medium', 'High'])

# One-hot encode VoteGroup
vote_groups = pd.get_dummies(df['VoteGroup']).astype(int)

# Merge with original dataset
df = pd.concat([df, vote_groups], axis=1)

# Drop the original 'VoteGroup' column
df = df.drop(columns=['VoteGroup'])


In [13]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,...,Actor 3Popularity,YearGroup_Pre-2000,YearGroup_2000-2010,YearGroup_Post-2010,Short,Medium,Long,Low,Medium.1,High
1,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,...,7.000000,0,0,1,0,1,0,1,0,0
3,#Yaaram,2019,110,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,...,4.450000,0,0,1,0,1,0,1,0,0
5,...Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,...,6.646296,1,0,0,0,0,1,0,1,0
6,...Yahaan,2005,142,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,...,6.516667,0,1,0,0,0,1,0,0,1
8,?: A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,...,5.600000,0,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115,Drama,6.1,408.0,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,...,5.850000,0,0,1,0,1,0,0,1,0
15494,Zubeidaa,2001,153,"Biography, Drama, History",6.2,1496.0,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,...,6.346809,0,1,0,0,0,1,0,0,1
15503,Zulm Ki Zanjeer,1989,125,"Action, Crime, Drama",5.8,44.0,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,...,5.954762,1,0,0,0,0,1,1,0,0
15505,Zulmi,1999,129,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,...,5.800000,1,0,0,0,0,1,0,1,0


## Feature 6: Director Popularity

In [14]:
# Ensure 'Rating' is numeric
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Calculate average popularity for each director
director_popularity = df.groupby('Director')['Rating'].mean().reset_index()
director_popularity.columns = ['Director', 'DirectorPopularity']

# Merge with original dataset
df = pd.merge(df, director_popularity, on='Director')



In [15]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,...,YearGroup_Pre-2000,YearGroup_2000-2010,YearGroup_Post-2010,Short,Medium,Long,Low,Medium.1,High,DirectorPopularity
0,#Gadhvi (He thought he was Gandhi),2019,109,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,...,0,0,1,0,1,0,1,0,0,7.000000
1,#Yaaram,2019,110,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,...,0,0,1,0,1,0,1,0,0,4.400000
2,...Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,...,1,0,0,0,0,1,0,1,0,5.313333
3,...Yahaan,2005,142,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,...,0,1,0,0,0,1,0,0,1,7.383333
4,?: A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,...,0,0,1,1,0,0,0,1,0,5.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,Zubaan,2015,115,Drama,6.1,408.0,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,...,0,0,1,0,1,0,0,1,0,6.100000
5655,Zubeidaa,2001,153,"Biography, Drama, History",6.2,1496.0,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,...,0,1,0,0,0,1,0,0,1,7.250000
5656,Zulm Ki Zanjeer,1989,125,"Action, Crime, Drama",5.8,44.0,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,...,1,0,0,0,0,1,1,0,0,6.270000
5657,Zulmi,1999,129,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,...,1,0,0,0,0,1,0,1,0,5.225000


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5659 entries, 0 to 5658
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 5659 non-null   object 
 1   Year                 5659 non-null   int64  
 2   Duration             5659 non-null   int64  
 3   Genre                5659 non-null   object 
 4   Rating               5659 non-null   float64
 5   Votes                5659 non-null   float64
 6   Director             5659 non-null   object 
 7   Actor 1              5659 non-null   object 
 8   Actor 2              5659 non-null   object 
 9   Actor 3              5659 non-null   object 
 10  GenrePopularity      5659 non-null   float64
 11  Actor 1Popularity    5659 non-null   float64
 12  Actor 2Popularity    5659 non-null   float64
 13  Actor 3Popularity    5659 non-null   float64
 14  YearGroup_Pre-2000   5659 non-null   int64  
 15  YearGroup_2000-2010  5659 non-null   i

# Cleaning data

In [17]:
df.dropna(inplace=True)

# Model for Rating Prediction

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# One-hot encode categorical columns
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
already_encoded = ['YearGroup_Pre-2000', 'YearGroup_2000-2010', 'YearGroup_Post-2010',
                   'Short', 'Medium', 'Long', 'Low', 'Medium', 'High']

# Ensure these columns are not one-hot encoded again
encoded_df = pd.get_dummies(df, columns=categorical_cols)

# Define features and target
X = encoded_df.drop(['Name', 'Rating'], axis=1)
y = encoded_df['Rating']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R²: {r2}")


MSE: 0.42384944257950524, RMSE: 0.6510372052191067, MAE: 0.443071554770318, R²: 0.7711079265939172


In [19]:
import pickle

# Save the trained model to a file
filename = 'movie_rating_model.pkl'
pickle.dump(model, open(filename, 'wb'))


In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Perform grid search
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score (MSE): {-grid_search.best_score_}")

# Train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Make predictions with best model
y_pred_best = best_model.predict(X_test)

# Evaluate best model's performance
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"MSE (Best Model): {mse_best}, MAE: {mae_best}, R²: {r2_best}")
