In [1]:
# Step 1: Import the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt


In [7]:
# Step 2: Load dataset
df = pd.read_csv("MoviesIndia.csv", encoding="latin1")

In [8]:
# Show first 5 rows
print("First 5 rows:")
print(df.head())

First 5 rows:
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant 

In [9]:
# Check basic info
print("\nDataset Info:")
print(df.info())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [10]:
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [11]:
# Summary statistics
print("\nSummary Statistics:")
print(df.describe())


Summary Statistics:
            Rating
count  7919.000000
mean      5.841621
std       1.381777
min       1.100000
25%       4.900000
50%       6.000000
75%       6.800000
max      10.000000


In [13]:
# Drop rows where Rating is missing (target variable)
df = df.dropna(subset=['Rating'])

In [16]:
# Fill missing categorical columns with 'Unknown'
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')


In [18]:
# Convert to numeric, forcing errors to NaN
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Duration'] = df['Duration'].astype(str).str.extract('(\d+)').astype(float)  # Extract only numbers from duration
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=True)  # Remove commas
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Step 2: Fill missing numeric values with median
numeric_cols = ['Year', 'Duration', 'Votes']
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Verify the cleaning
print(df[numeric_cols].dtypes)
print("Remaining Missing Values:\n", df.isnull().sum())


Year        float64
Duration    float64
Votes         int64
dtype: object
Remaining Missing Values:
 Name           0
Year        7919
Duration       0
Genre          0
Rating         0
Votes          0
Director       0
Actor 1        0
Actor 2        0
Actor 3        0
dtype: int64


  df['Duration'] = df['Duration'].astype(str).str.extract('(\d+)').astype(float)  # Extract only numbers from duration
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [19]:
# Fill missing Year with median year
df['Year'] = df['Year'].fillna(df['Year'].median())

# Verify again
print("Remaining Missing Values:\n", df.isnull().sum())


Remaining Missing Values:
 Name           0
Year        7919
Duration       0
Genre          0
Rating         0
Votes          0
Director       0
Actor 1        0
Actor 2        0
Actor 3        0
dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
