Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

Load the Data

In [None]:
# Location of the dataset inside the Google Colab session storage.
file_name = '/content/IMDb Movies India.csv'

# Only the read itself sits inside the try block, so the handler cannot
# accidentally mask an unrelated error raised by the inspection code below.
try:
    # This specific file is read with the 'ISO-8859-1' encoding.
    df = pd.read_csv(file_name, encoding='ISO-8859-1')
except FileNotFoundError:
    print(f"Error: The file '{file_name}' was not found.")
    print("Please make sure you have uploaded 'IMDb Movies India.csv' to your Colab session.")
    # Re-raise so "Run All" stops here. Without this, `df` is left undefined
    # (or stale from a previous run) and every later cell fails with a
    # confusing, unrelated error.
    raise

print(f"Successfully loaded '{file_name}'.")

# Display the first 5 rows
print("\n--- Data Head ---")
print(df.head())

# Display info about columns, data types, and non-null counts
print("\n--- Initial Data Info ---")
df.info()

# Display a clear count of missing values per column
print("\n--- Initial Missing Values Count ---")
print(df.isnull().sum())

Successfully loaded '/content/IMDb Movies India.csv'.

--- Data Head ---
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhat

Critical Data Cleaning

In [None]:
# 1. Handle the target variable 'Rating'
# Rows without a rating cannot be used for supervised training, so drop them.
# .copy() gives us an independent frame, so the column assignments below act
# on our own data rather than on a possible view of the previous `df`.
print(f"Original shape of data: {df.shape}")
df = df.dropna(subset=['Rating']).copy()
print(f"Shape after dropping rows with missing Rating: {df.shape}")

# NOTE: each column is cast to str before using the .str accessor. On the
# first run the columns already hold strings, so the result is unchanged; on a
# re-run they are already numeric and .str would raise AttributeError without
# the cast. Missing values become the text 'nan', which errors='coerce' turns
# back into NaN.

# 2. Clean 'Year'
# Remove parentheses, e.g. "(2019)" -> "2019", then convert to numeric.
# errors='coerce' turns anything unparseable into NaN.
df['Year'] = pd.to_numeric(
    df['Year'].astype(str).str.replace(r'[()]', '', regex=True),
    errors='coerce',
)

# 3. Clean 'Duration'
# Remove the ' min' suffix, e.g. "109 min" -> "109", then convert to numeric.
df['Duration'] = pd.to_numeric(
    df['Duration'].astype(str).str.replace(' min', '', regex=False),
    errors='coerce',
)

# 4. Clean 'Votes'
# Remove thousands separators (commas), then convert to numeric.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

print("\nCleaned 'Year', 'Duration', and 'Votes' columns.")

# Check our work
print("\n--- Data Info After Cleaning Dtypes ---")
df.info()

Original shape of data: (15509, 10)
Shape after dropping rows with missing Rating: (7919, 10)

Cleaned 'Year', 'Duration', and 'Votes' columns.

--- Data Info After Cleaning Dtypes ---
<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7919 non-null   object 
 1   Year      7919 non-null   int64  
 2   Duration  5851 non-null   float64
 3   Genre     7817 non-null   object 
 4   Rating    7919 non-null   float64
 5   Votes     7919 non-null   int64  
 6   Director  7914 non-null   object 
 7   Actor 1   7794 non-null   object 
 8   Actor 2   7719 non-null   object 
 9   Actor 3   7627 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 680.5+ KB


Handle All Remaining Missing Feature Data

In [None]:
# 1. Numerical features: impute gaps with the per-column median.
# All medians are computed in one pass, then applied column by column.
num_cols = ['Year', 'Duration', 'Votes']
column_medians = df[num_cols].median()
for feature, fill_value in column_medians.items():
    df[feature] = df[feature].fillna(fill_value)
    print(f"Filled missing '{feature}' with median: {fill_value}")

# 2. Text features: a missing entry becomes the placeholder 'Unknown'.
text_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df[text_cols] = df[text_cols].fillna('Unknown')
for feature in text_cols:
    print(f"Filled missing '{feature}' with 'Unknown'")

# Sanity check: every column used downstream should now report 0 nulls.
print("\n--- Final Missing Values Count ---")
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

Filled missing 'Year' with median: 1997.0
Filled missing 'Duration' with median: 134.0
Filled missing 'Votes' with median: 55.0
Filled missing 'Genre' with 'Unknown'
Filled missing 'Director' with 'Unknown'
Filled missing 'Actor 1' with 'Unknown'
Filled missing 'Actor 2' with 'Unknown'
Filled missing 'Actor 3' with 'Unknown'

--- Final Missing Values Count ---
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64


Define Features (X) and Target (y)

In [None]:
# Columns that must not be fed to the model as features:
# 'Rating' is the prediction target and 'Name' is dropped alongside it.
non_feature_columns = ['Rating', 'Name']

# Feature matrix: everything except the columns listed above.
X = df.drop(columns=non_feature_columns)

# Target vector: the movie rating.
y = df['Rating']

print("Target (y) defined.")
print("Features (X) defined with columns:", list(X.columns))

# Preview the first rows of the cleaned feature matrix.
print("\n--- Final Features Head (X) ---")
print(X.head())

Target (y) defined.
Features (X) defined with columns: ['Year', 'Duration', 'Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

--- Final Features Head (X) ---
   Year  Duration                      Genre  Votes        Director  \
1  2019     109.0                      Drama      8   Gaurav Bakshi   
3  2019     110.0            Comedy, Romance     35      Ovais Khan   
5  1997     147.0     Comedy, Drama, Musical    827    Rahul Rawail   
6  2005     142.0        Drama, Romance, War   1086  Shoojit Sircar   
8  2012      82.0  Horror, Mystery, Thriller    326   Allyson Patel   

           Actor 1                 Actor 2          Actor 3  
1     Rasika Dugal          Vivek Ghamande    Arvind Jangid  
3          Prateik              Ishita Raj  Siddhant Kapoor  
5       Bobby Deol  Aishwarya Rai Bachchan    Shammi Kapoor  
6  Jimmy Sheirgill          Minissha Lamba   Yashpal Sharma  
8        Yash Dave          Muntazir Ahmad     Kiran Bhatia  


Define the Preprocessing Pipeline

In [None]:
# Split the feature columns by how they are encoded.
numerical_features = ['Year', 'Duration', 'Votes']
text_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# One independent TF-IDF vectorizer per text column.
# The step name is the column name lowercased with spaces removed
# ('Actor 1' -> 'actor1'). The column is passed as a plain string (not a
# list), matching the original per-column entries.
text_transformers = [
    (column.lower().replace(' ', ''), TfidfVectorizer(), column)
    for column in text_features
]

# Numerical columns are standardised together; text columns follow in the
# order listed in `text_features`.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features), *text_transformers],
    remainder='passthrough',  # any column not listed above is passed through unchanged
)

print("ColumnTransformer preprocessor created successfully.")

ColumnTransformer preprocessor created successfully.


Create Full Pipeline and Split Data

In [None]:
# Estimator: a random forest of 100 trees with a fixed seed.
# n_jobs=-1 lets scikit-learn use every available CPU core.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Chain preprocessing and the regressor so both are fitted together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model),
])

print("Full model pipeline created successfully.")

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nData split into:")
print(f"{len(X_train)} training samples")
print(f"{len(X_test)} testing samples")

Full model pipeline created successfully.

Data split into:
6335 training samples
1584 testing samples


Train the Model

In [None]:
print("Starting model training... This may take a minute.")

# Fit the preprocessing steps and the regressor in one call,
# using only the training portion of the data.
pipeline.fit(X=X_train, y=y_train)

print("Model training complete.")

Starting model training... This may take a minute.
Model training complete.


Evaluate the Model

In [None]:
print("--- Model Evaluation ---")

# Predict ratings for the held-out test rows.
y_pred = pipeline.predict(X_test)

# Score the predictions against the true ratings.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²) Score: {r2:.4f}")

print("\n--- Sample Predictions vs. Actuals ---")
# Side-by-side view of true and predicted ratings; predictions are rounded
# to one decimal place for display.
comparison_df = pd.DataFrame({
    'Actual Rating': y_test,
    'Predicted Rating': y_pred.round(1),
})
print(comparison_df.head(10))

--- Model Evaluation ---
Mean Absolute Error (MAE): 0.7835
R-squared (R²) Score: 0.4071

--- Sample Predictions vs. Actuals ---
       Actual Rating  Predicted Rating
9456             3.3               4.3
14816            5.3               6.0
3213             5.7               5.4
3778             7.2               6.6
5775             3.5               5.7
12203            7.2               6.5
12305            3.8               5.6
4650             6.9               7.0
8701             5.2               5.6
4707             7.4               6.5


Example Prediction on New Data

In [None]:
print("--- Example Prediction ---")
# Build a one-row DataFrame describing a new, hypothetical movie.
# It needs the same feature columns the pipeline was trained on, and every
# list must have the same length (1 here, for a single movie).
new_movie = pd.DataFrame({
    'Year': [2024],
    'Duration': [150],
    # Multiple genres go in ONE comma-separated string, in the same format as
    # the training data (e.g. "Comedy, Romance").
    'Genre': ['Action, Drama'],
    'Votes': [10000],
    'Director': ['A.R. Murugadoss'],
    'Actor 1': ['Shah Rukh Khan'],
    'Actor 2': ['Nayanthara'],
    'Actor 3': ['Vijay Sethupathi']
})

# Use the *trained pipeline* so the new row gets exactly the same
# preprocessing that was fitted on the training data.
predicted_rating = pipeline.predict(new_movie)
print(f"Predicted rating for the new movie: {predicted_rating[0]:.2f}")

--- Example Prediction ---
Predicted rating for the new movie: 5.94
