In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [2]:
df=pd.read_csv('IMDB_Movies_Dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Average Rating,Director,Writer,Metascore,Cast,Release Date,Country of Origin,Languages,Budget,Worldwide Gross,Runtime
0,0,The Shawshank Redemption,9.3,Frank Darabont,"Stephen King, Frank Darabont",82.0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","October 14, 1994 (India)",United States,English,"$25,000,000 (estimated)","$29,332,133",2 hours 22 minutes
1,1,Attack on Titan the Movie: The Last Attack,9.3,Yûichirô Hayashi,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue",,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue,...","November 8, 2024 (Japan)",Japan,Japanese,,"$3,513,659",2 hours 25 minutes
2,2,The Godfather,9.2,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola",100.0,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1978 (India),United States,"English, Italian, Latin","$6,000,000 (estimated)","$250,342,198",2 hours 55 minutes
3,3,Hababam Sinifi,9.2,Ertem Egilmez,"Umur Bugay, Rifat Ilgaz",,"Münir Özkul, Tarik Akan, Halit Akçatepe, Kemal...","April 1, 1975 (Turkey)",Turkey,Turkish,,,1 hour 25 minutes
4,4,Ramayana: The Legend of Prince Rama,9.2,"Ram Mohan, Yûgô Sakô, Koichi Saski","Rani Burra, Ram Mohan, Yûgô Sakô",,"Arun Govil, Nikhil Kapoor, Edie Mirman, Rael P...","September 25, 2001 (India)","India, Japan",English,,,2 hours 15 minutes


In [4]:
categorical_col=df.select_dtypes(include=['object', 'category']).columns
categorical_col


Index(['Title', 'Director', 'Writer', 'Cast', 'Release Date',
       'Country of Origin', 'Languages', 'Budget', 'Worldwide Gross',
       'Runtime'],
      dtype='object')

In [5]:
cardinality = df[categorical_col].nunique()
cardinality

Title                4927
Director             2648
Writer               4422
Cast                 4981
Release Date         4516
Country of Origin     812
Languages            1084
Budget                882
Worldwide Gross      3865
Runtime               182
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   object 
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   object 
 4   Writer             4988 non-null   object 
 5   Metascore          3055 non-null   float64
 6   Cast               4982 non-null   object 
 7   Release Date       4989 non-null   object 
 8   Country of Origin  4986 non-null   object 
 9   Languages          4968 non-null   object 
 10  Budget             2651 non-null   object 
 11  Worldwide Gross    3895 non-null   object 
 12  Runtime            4989 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 506.8+ KB


In [7]:
df['Worldwide Gross'].value_counts()

Worldwide Gross
$509            7
$46,961         4
$3,389          3
$8,148          3
$1,843          2
               ..
$5,080,409      1
$10,726,612     1
$119,418,501    1
$79,226         1
$550,031        1
Name: count, Length: 3865, dtype: int64

In [8]:
df['Worldwide Gross'].unique()

array(['$29,332,133', '$3,513,659', '$250,342,198', ..., '$29,893,636',
       '$17,871,370', '$550,031'], dtype=object)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   object 
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   object 
 4   Writer             4988 non-null   object 
 5   Metascore          3055 non-null   float64
 6   Cast               4982 non-null   object 
 7   Release Date       4989 non-null   object 
 8   Country of Origin  4986 non-null   object 
 9   Languages          4968 non-null   object 
 10  Budget             2651 non-null   object 
 11  Worldwide Gross    3895 non-null   object 
 12  Runtime            4989 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 506.8+ KB


In [10]:
df['Budget'].unique()

array(['$25,000,000 (estimated)', nan, '$6,000,000 (estimated)',
       '$185,000,000 (estimated)', '$22,000,000 (estimated)',
       '$350,000 (estimated)', '$13,000,000 (estimated)',
       '$94,000,000 (estimated)', '$93,000,000 (estimated)',
       '$8,000,000 (estimated)', 'R$8,000,000 (estimated)',
       'BDT\xa035,000,000 (estimated)', '$160,000,000 (estimated)',
       '$63,000,000 (estimated)', '$55,000,000 (estimated)',
       '$1,200,000 (estimated)', '$165,000,000 (estimated)',
       '$3,000,000 (estimated)', '$18,000,000 (estimated)',
       '₹150,000,000 (estimated)', '₹10,000,000 (estimated)',
       '₹600,000,000 (estimated)', '₹25,000,000 (estimated)',
       '₹70,000,000 (estimated)', '₹9,800,000 (estimated)',
       '$33,000,000 (estimated)', '$19,000,000 (estimated)',
       '$3,180,000 (estimated)', '$150,000,000 (estimated)',
       '$70,000,000 (estimated)', '$60,000,000 (estimated)',
       'R$3,300,000 (estimated)', '$102,000,000 (estimated)',
       '$11,000

In [11]:
# Step 1: Check unique values in 'Budget'
print("Unique values before cleaning:")
print(df['Budget'].unique())

# Step 2: Clean the 'Budget' column by removing currency symbols and non-numeric characters
df['Budget'] = df['Budget'].replace({
    r'\$': '', r'₹': '', r'€': '', r'R\$': '', r'BDT\xa0': '', r'PKR\xa0': '', r'¥': '', r'HK\$': '',
    r'€': '', r'HUF\xa0': '', r'THB\xa0': '', r',': '', r' \(estimated\)': ''
}, regex=True)

# Step 3: Convert the cleaned values to numeric, invalid parsing will result in NaN
df['Budget'] = pd.to_numeric(df['Budget'], errors='coerce')

# Step 4: Handle missing values by filling NaN with the mean of the column
mean_value = df['Budget'].mean()  # Calculate the mean
df['Budget'] = df['Budget'].fillna(mean_value).astype(int)

# Check the cleaned column and its unique values
print("\nUnique values after cleaning:")
print(df['Budget'].unique())


Unique values before cleaning:
['$25,000,000 (estimated)' nan '$6,000,000 (estimated)'
 '$185,000,000 (estimated)' '$22,000,000 (estimated)'
 '$350,000 (estimated)' '$13,000,000 (estimated)'
 '$94,000,000 (estimated)' '$93,000,000 (estimated)'
 '$8,000,000 (estimated)' 'R$8,000,000 (estimated)'
 'BDT\xa035,000,000 (estimated)' '$160,000,000 (estimated)'
 '$63,000,000 (estimated)' '$55,000,000 (estimated)'
 '$1,200,000 (estimated)' '$165,000,000 (estimated)'
 '$3,000,000 (estimated)' '$18,000,000 (estimated)'
 '₹150,000,000 (estimated)' '₹10,000,000 (estimated)'
 '₹600,000,000 (estimated)' '₹25,000,000 (estimated)'
 '₹70,000,000 (estimated)' '₹9,800,000 (estimated)'
 '$33,000,000 (estimated)' '$19,000,000 (estimated)'
 '$3,180,000 (estimated)' '$150,000,000 (estimated)'
 '$70,000,000 (estimated)' '$60,000,000 (estimated)'
 'R$3,300,000 (estimated)' '$102,000,000 (estimated)'
 '$11,000,000 (estimated)' '$20,000,000 (estimated)'
 '¥125,000,000 (estimated)' '₹200,000,000 (estimated)'
 'PKR

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   object 
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   object 
 4   Writer             4988 non-null   object 
 5   Metascore          3055 non-null   float64
 6   Cast               4982 non-null   object 
 7   Release Date       4989 non-null   object 
 8   Country of Origin  4986 non-null   object 
 9   Languages          4968 non-null   object 
 10  Budget             4989 non-null   int32  
 11  Worldwide Gross    3895 non-null   object 
 12  Runtime            4989 non-null   object 
dtypes: float64(2), int32(1), int64(1), object(9)
memory usage: 487.3+ KB


In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Average Rating,Director,Writer,Metascore,Cast,Release Date,Country of Origin,Languages,Budget,Worldwide Gross,Runtime
0,0,The Shawshank Redemption,9.3,Frank Darabont,"Stephen King, Frank Darabont",82.0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","October 14, 1994 (India)",United States,English,25000000,"$29,332,133",2 hours 22 minutes
1,1,Attack on Titan the Movie: The Last Attack,9.3,Yûichirô Hayashi,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue",,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue,...","November 8, 2024 (Japan)",Japan,Japanese,55151216,"$3,513,659",2 hours 25 minutes
2,2,The Godfather,9.2,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola",100.0,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1978 (India),United States,"English, Italian, Latin",6000000,"$250,342,198",2 hours 55 minutes
3,3,Hababam Sinifi,9.2,Ertem Egilmez,"Umur Bugay, Rifat Ilgaz",,"Münir Özkul, Tarik Akan, Halit Akçatepe, Kemal...","April 1, 1975 (Turkey)",Turkey,Turkish,55151216,,1 hour 25 minutes
4,4,Ramayana: The Legend of Prince Rama,9.2,"Ram Mohan, Yûgô Sakô, Koichi Saski","Rani Burra, Ram Mohan, Yûgô Sakô",,"Arun Govil, Nikhil Kapoor, Edie Mirman, Rael P...","September 25, 2001 (India)","India, Japan",English,55151216,,2 hours 15 minutes


In [14]:
df['Runtime'].unique()

array(['2 hours 22 minutes', '2 hours 25 minutes', '2 hours 55 minutes',
       '1 hour 25 minutes', '2 hours 15 minutes', '56 minutes',
       '3 hours 12 minutes', '2 hours 32 minutes', '3 hours 15 minutes',
       '1 hour 36 minutes', '3 hours 22 minutes', '3 hours 21 minutes',
       '2 hours 18 minutes', '1 hour 40 minutes', '2 hours 58 minutes',
       '2 hours 34 minutes', '2 hours 16 minutes', '7 hours 47 minutes',
       '3 hours 27 minutes', '1 hour 30 minutes', '2 hours 27 minutes',
       '1 hour 35 minutes', '2 hours 4 minutes', '2 hours 28 minutes',
       '2 hours 19 minutes', '2 hours 41 minutes', '2 hours 59 minutes',
       '2 hours 17 minutes', '1 hour 27 minutes', '3 hours 10 minutes',
       '2 hours 38 minutes', '1 hour 32 minutes', '2 hours 2 minutes',
       '2 hours 46 minutes', '1 hour 20 minutes', '3 hours 4 minutes',
       '2 hours 49 minutes', '2 hours 13 minutes', '2 hours 44 minutes',
       '4 hours 7 minutes', '1 hour 28 minutes', '2 hours 9 minutes',


In [15]:
# Step 1: Check unique values in 'Runtime'
print("Unique values before cleaning:")
print(df['Runtime'].unique())

# Step 2: Extract hours and minutes from 'Runtime' and convert to total minutes
def convert_runtime_to_minutes(runtime):
    if pd.isna(runtime):
        return None
    hours = 0
    minutes = 0
    # Extract hours and minutes
    if 'hour' in runtime:
        hours = int(runtime.split('hour')[0].strip())
    if 'minute' in runtime:
        minutes = int(runtime.split('minute')[0].split()[-1].strip())
    # Calculate total minutes
    total_minutes = hours * 60 + minutes
    return total_minutes

# Apply the conversion function to the 'Runtime' column
df['Runtime'] = df['Runtime'].apply(convert_runtime_to_minutes)

# Step 3: Handle missing values by filling NaN with the mean of the column
mean_runtime = df['Runtime'].mean()  # Calculate the mean
df['Runtime'] = df['Runtime'].fillna(mean_runtime).astype(int)

# Check the cleaned column and its unique values
print("\nUnique values after cleaning:")
print(df['Runtime'].unique())


Unique values before cleaning:
['2 hours 22 minutes' '2 hours 25 minutes' '2 hours 55 minutes'
 '1 hour 25 minutes' '2 hours 15 minutes' '56 minutes'
 '3 hours 12 minutes' '2 hours 32 minutes' '3 hours 15 minutes'
 '1 hour 36 minutes' '3 hours 22 minutes' '3 hours 21 minutes'
 '2 hours 18 minutes' '1 hour 40 minutes' '2 hours 58 minutes'
 '2 hours 34 minutes' '2 hours 16 minutes' '7 hours 47 minutes'
 '3 hours 27 minutes' '1 hour 30 minutes' '2 hours 27 minutes'
 '1 hour 35 minutes' '2 hours 4 minutes' '2 hours 28 minutes'
 '2 hours 19 minutes' '2 hours 41 minutes' '2 hours 59 minutes'
 '2 hours 17 minutes' '1 hour 27 minutes' '3 hours 10 minutes'
 '2 hours 38 minutes' '1 hour 32 minutes' '2 hours 2 minutes'
 '2 hours 46 minutes' '1 hour 20 minutes' '3 hours 4 minutes'
 '2 hours 49 minutes' '2 hours 13 minutes' '2 hours 44 minutes'
 '4 hours 7 minutes' '1 hour 28 minutes' '2 hours 9 minutes'
 '1 hour 56 minutes' '2 hours 33 minutes' '9 hours 26 minutes'
 '2 hours 20 minutes' '2 hours 3

In [16]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   object 
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   object 
 4   Writer             4988 non-null   object 
 5   Metascore          3055 non-null   float64
 6   Cast               4982 non-null   object 
 7   Release Date       4989 non-null   object 
 8   Country of Origin  4986 non-null   object 
 9   Languages          4968 non-null   object 
 10  Budget             4989 non-null   int32  
 11  Worldwide Gross    3895 non-null   object 
 12  Runtime            4989 non-null   int32  
dtypes: float64(2), int32(2), int64(1), object(8)
memory usage: 467.8+ KB


In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Average Rating,Director,Writer,Metascore,Cast,Release Date,Country of Origin,Languages,Budget,Worldwide Gross,Runtime
0,0,The Shawshank Redemption,9.3,Frank Darabont,"Stephen King, Frank Darabont",82.0,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","October 14, 1994 (India)",United States,English,25000000,"$29,332,133",142
1,1,Attack on Titan the Movie: The Last Attack,9.3,Yûichirô Hayashi,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue",,"Natsuki Hanae, Yoshimasa Hosoya, Marina Inoue,...","November 8, 2024 (Japan)",Japan,Japanese,55151216,"$3,513,659",145
2,2,The Godfather,9.2,Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola",100.0,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1978 (India),United States,"English, Italian, Latin",6000000,"$250,342,198",175
3,3,Hababam Sinifi,9.2,Ertem Egilmez,"Umur Bugay, Rifat Ilgaz",,"Münir Özkul, Tarik Akan, Halit Akçatepe, Kemal...","April 1, 1975 (Turkey)",Turkey,Turkish,55151216,,85
4,4,Ramayana: The Legend of Prince Rama,9.2,"Ram Mohan, Yûgô Sakô, Koichi Saski","Rani Burra, Ram Mohan, Yûgô Sakô",,"Arun Govil, Nikhil Kapoor, Edie Mirman, Rael P...","September 25, 2001 (India)","India, Japan",English,55151216,,135


In [18]:
categorical_col=df.select_dtypes(include=['object', 'category']).columns
categorical_col
LabelEncoder=LabelEncoder
for col in categorical_col:
    df[col]=LabelEncoder().fit_transform(df[col])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   int32  
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   int32  
 4   Writer             4989 non-null   int32  
 5   Metascore          3055 non-null   float64
 6   Cast               4989 non-null   int32  
 7   Release Date       4989 non-null   int32  
 8   Country of Origin  4989 non-null   int32  
 9   Languages          4989 non-null   int32  
 10  Budget             4989 non-null   int32  
 11  Worldwide Gross    4989 non-null   int32  
 12  Runtime            4989 non-null   int32  
dtypes: float64(2), int32(10), int64(1)
memory usage: 311.9 KB


In [20]:
for column in df.columns:
    if df[column].isnull().sum()>0:
        if df[column].dtype=='object':
            df[column].fillna(df[column].mode()[0], inplace=True)
        else:
            df[column].fillna(df[column].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4989 entries, 0 to 4988
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         4989 non-null   int64  
 1   Title              4989 non-null   int32  
 2   Average Rating     4989 non-null   float64
 3   Director           4989 non-null   int32  
 4   Writer             4989 non-null   int32  
 5   Metascore          4989 non-null   float64
 6   Cast               4989 non-null   int32  
 7   Release Date       4989 non-null   int32  
 8   Country of Origin  4989 non-null   int32  
 9   Languages          4989 non-null   int32  
 10  Budget             4989 non-null   int32  
 11  Worldwide Gross    4989 non-null   int32  
 12  Runtime            4989 non-null   int32  
dtypes: float64(2), int32(10), int64(1)
memory usage: 311.9 KB


In [22]:
x=df.drop('Worldwide Gross', axis=1)
y=df['Worldwide Gross']

In [23]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.3, random_state=42)
x_val,x_test,y_val,y_test=train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [24]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

In [28]:
model=LinearRegression()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)

In [30]:
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)
print(mse)
print(r2)

1652798.7293049395
-0.0037142098566438353


In [27]:
# Cross-validation with the final model
cv_scores_final = cross_val_score(model, x_train_scaled, y_train, cv=5)
print("\nFinal Model Cross-Validation Accuracy:", cv_scores_final.mean())


Final Model Cross-Validation Accuracy: -0.0017046934885021291


In [None]:
# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train_scaled, y_train)
y_val_pred=dt_model.predict(x_val_scaled)
accuracy=accuracy_score(y_val,y_val_pred)
accuracy

0.07620320855614973

In [29]:
# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train_scaled, y_train)
y_val_pred=rf_model.predict(x_val_scaled)
accuracy=accuracy_score(y_val,y_val_pred)
accuracy

0.20855614973262032

In [30]:
# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier()
knn_model.fit(x_train_scaled, y_train)
y_val_pred=knn_model.predict(x_val_scaled)
accuracy=accuracy_score(y_val,y_val_pred)
accuracy

0.10695187165775401

In [31]:
from sklearn.model_selection import cross_val_score

# Decision Tree Cross-Validation
cv_scores_dt = cross_val_score(dt_model, x_train_scaled, y_train, cv=5)
print("Decision Tree Cross-Validation Scores:", cv_scores_dt)
print("Mean Cross-Validation Score for Decision Tree:", cv_scores_dt.mean())

# Random Forest Cross-Validation
cv_scores_rf = cross_val_score(rf_model, x_train_scaled, y_train, cv=5)
print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Mean Cross-Validation Score for Random Forest:", cv_scores_rf.mean())

# KNN Cross-Validation
cv_scores_knn = cross_val_score(knn_model, x_train_scaled, y_train, cv=5)
print("KNN Cross-Validation Scores:", cv_scores_knn)
print("Mean Cross-Validation Score for KNN:", cv_scores_knn.mean())



Decision Tree Cross-Validation Scores: [0.08869814 0.08154506 0.0773639  0.08166189 0.09312321]
Mean Cross-Validation Score for Decision Tree: 0.0844784403425278




Random Forest Cross-Validation Scores: [0.21459227 0.20600858 0.21060172 0.21203438 0.20487106]
Mean Cross-Validation Score for Random Forest: 0.20962160433857618




KNN Cross-Validation Scores: [0.10872675 0.10729614 0.10458453 0.10744986 0.10028653]
Mean Cross-Validation Score for KNN: 0.1056687613496153
