In [1]:
file_path = 'indian movies.csv'

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [83]:
import pandas as pd
df = pd.read_csv(file_path, na_values='-')

In [84]:
df.shape

(50602, 8)

In [85]:
df.sample(5)

Unnamed: 0,ID,Movie Name,Year,Timing(min),Rating(10),Votes,Genre,Language
7444,tt3374862,Coke Studio,2008–,60 min,4.9,9.0,Music,urdu
22942,tt0151150,Josh,2000,162 min,6.1,9535.0,"Action, Drama, Musical",hindi
35088,tt6994140,Dada Thakur,2001,153 min,6.2,8.0,Drama,bengali
9473,tt0158652,Heerer Prajapati,1968,,,,Drama,bengali
41531,tt0325787,Manushulu Marali,1969,,,,Drama,telugu


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50602 entries, 0 to 50601
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID           48094 non-null  object 
 1   Movie Name   50602 non-null  object 
 2   Year         49041 non-null  object 
 3   Timing(min)  24710 non-null  object 
 4   Rating(10)   23676 non-null  float64
 5   Votes        23677 non-null  object 
 6   Genre        42021 non-null  object 
 7   Language     50602 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.1+ MB


In [87]:
df.isnull().sum()

Unnamed: 0,0
ID,2508
Movie Name,0
Year,1561
Timing(min),25892
Rating(10),26926
Votes,26925
Genre,8581
Language,0


In [88]:
#Handling missing values:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

df.drop(columns=["Timing(min)"],inplace=True)
#Convert columns to numeric where needed(Year, Timing, Votes)
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

In [89]:
# Select columns for imputation
impute_cols = ['Year', 'Rating(10)', 'Votes']
imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=42)

# Impute only numerical columns
df[impute_cols] = imputer.fit_transform(df[impute_cols])




In [90]:
df[impute_cols].isnull().sum()

Unnamed: 0,0
Year,0
Rating(10),0
Votes,0


In [91]:
df.isnull().sum()

Unnamed: 0,0
ID,2508
Movie Name,0
Year,0
Rating(10),0
Votes,0
Genre,8581
Language,0


In [92]:
# Drop columns if they still seem unhelpful (too many nulls or poor correlation)
corr_matrix = df.corr(numeric_only=True)
low_corr_cols = [col for col in impute_cols if abs(corr_matrix['Year'][col]) < 0.5]

df.drop(columns=low_corr_cols, inplace=True)

# Final check
print(df.info())
print(low_corr_cols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50602 entries, 0 to 50601
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          48094 non-null  object 
 1   Movie Name  50602 non-null  object 
 2   Year        50602 non-null  float64
 3   Genre       42021 non-null  object 
 4   Language    50602 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.9+ MB
None
['Rating(10)', 'Votes']


In [93]:
df.sample(5)

Unnamed: 0,ID,Movie Name,Year,Genre,Language
31741,tt8851758,Sungara Ramudu,1979.0,Drama,telugu
41409,tt7353654,Rasmeduniya,2017.0,Drama,urdu
47169,tt1579928,Apuroopam,2002.0,Romance,telugu
47964,tt14295520,Sala Sahib,1981.0,,punjabi
26239,tt6779222,Renigunta,2009.0,"Action, Crime",tamil


In [94]:
df['Year'] = df['Year'].astype('int', errors='ignore')

In [95]:
df.isnull().sum()

Unnamed: 0,0
ID,2508
Movie Name,0
Year,0
Genre,8581
Language,0


In [96]:
df.drop('ID', axis=1,inplace=True)

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Ensure 'Year' is numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Encode 'Language' to numeric
df['Language_code'] = pd.factorize(df['Language'])[0]

# Separate rows with and without missing genres
df_missing_genre = df[df['Genre'].isnull()]
df_with_genre = df[df['Genre'].notnull()]

# Encode genres for classification
df_with_genre['Genre_code'], unique_genres = pd.factorize(df_with_genre['Genre'])

# Prepare training data
X = df_with_genre[['Year', 'Language_code']]
y = df_with_genre['Genre_code']

# Train a classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Predict missing genres
X_missing = df_missing_genre[['Year', 'Language_code']]
df.loc[df['Genre'].isnull(), 'Genre'] = unique_genres[model.predict(X_missing)]

# Drop temporary columns
df.drop(columns=['Language_code'], inplace=True)

# Check remaining missing values
print(df.isnull().sum())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_genre['Genre_code'], unique_genres = pd.factorize(df_with_genre['Genre'])


Movie Name    0
Year          0
Genre         0
Language      0
dtype: int64


In [100]:
df.to_csv('cleaned_movies_data.csv', index=False)


In [101]:
df.shape

(50602, 4)

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50602 entries, 0 to 50601
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie Name  50602 non-null  object 
 1   Year        50602 non-null  float64
 2   Genre       50602 non-null  object 
 3   Language    50602 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.5+ MB


In [103]:
df.isnull().sum()

Unnamed: 0,0
Movie Name,0
Year,0
Genre,0
Language,0


In [104]:
df.describe()


Unnamed: 0,Year
count,50602.0
mean,1994.809355
std,21.779636
min,1913.0
25%,1981.0
50%,2003.0
75%,2013.0
max,2024.0
