In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Replace 'your_dataset.csv' with your actual dataset filename
df = pd.read_csv('movie.csv')
df.sample(30)

Unnamed: 0,Movie Name,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Revenue(INR),Budget(INR)
847,Rang-E-Ishq,Normal,No,No,love_story,Yes,Yes,Yes,Muzahid Khan,Srinivasa Anjanappa,Deen Mohammad,50,20000000,1300000
1349,Chance Pe Dance,Normal,No,No,rom__com,No,No,No,Shahid Kapoor,Ken Ghosh,Adnan Sami,800,290000000,163027500
744,Superstar,Holiday,No,No,drama,No,Yes,No,Kunal Khemu,Rohit Jugraj,Shamir Tandon,375,90000000,31900000
789,Chittagong,Normal,No,No,drama,No,Yes,No,Manoj Bajpai,Bedabrata Pain,Shankar - Ehsaan - Loy,65,50000000,4750000
1309,Jai Singh Are Baap Re,Normal,No,No,drama,Yes,Yes,Yes,Sankalp Shrivastava,Sankalp Shrivastava,Sankalp Shrivastava,4,1500000,150000
1382,Us Disha Mein,Normal,No,No,horror,Yes,Yes,Yes,Shahzad Pathan,Shahzad Pathan,Vikrant Mathur,1,2500000,300000
1032,Ashok Chakra Tribute To Real Heroes,Normal,No,No,drama,No,Yes,Yes,Rajan Verma,S.P. Muneshwar,Arun Bakshi,25,12500000,450000
1108,Roy,Holiday,No,No,thriller,No,Yes,No,Ranbir Kapoor,Vikramjit Singh,Ankit Tiwari,2350,500000000,560215000
363,Creature 3D,Normal,No,Yes,horror,No,No,Yes,Bipasha Basu,Vikram Bhatt,Tony Kakkar,1300,250000000,206350000
1286,Zilla Ghaziabad,Holiday,No,No,action,No,No,No,Sanjay Dutt,Anand Kumar,Amjad - Nadeem,1700,360000000,222620000


In [26]:
# Check for missing data
missing_values = df.isnull().sum()
print(missing_values)


Movie Name            0
Release Period        0
Whether Remake        0
Whether Franchise     0
Genre                 0
New Actor             0
New Director          0
New Music Director    0
Lead Star             0
Director              0
Music Director        0
Number of Screens     0
Revenue(INR)          0
Budget(INR)           0
dtype: int64


In [27]:
df['Release Period'].value_counts()

Unnamed: 0_level_0,count
Release Period,Unnamed: 1_level_1
Normal,1064
Holiday,634


In [28]:
df.sample(5)

Unnamed: 0,Movie Name,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Revenue(INR),Budget(INR)
774,Sau Jhooth Ek Sach,Normal,No,No,drama,Yes,Yes,Yes,Mammootty,Bappaditya Roy,Faizal Qureshi,3,5000000,180000
500,Ramaiya Vastavaiya,Normal,Yes,No,love_story,Yes,No,No,Girish Taurani,Prabhu Deva,Sachin - Jigar,1400,380000000,383420000
948,Tutak Tutak Tutiya,Holiday,No,No,comedy,No,Yes,No,Prabhu Deva,A.L. Vijay,Sajid - Wajid,875,110000000,69405000
1552,Barwali,Normal,No,No,adult,No,No,No,Tanveer Hashmi,Suresh Jain,Afsar - Sajid,1,1000000,1725000
1248,Bajrangi Bhaijaan,Holiday,No,No,masala,No,No,No,Salman Khan,Kabir Khan,Pritam,4100,1250000000,6039940000


In [29]:
from sklearn.model_selection import train_test_split

df = df.drop('Movie Name', axis=1)
# Assuming 'target' is the column you want to predict
X = df.drop('Revenue(INR)', axis=1)
y = df['Revenue(INR)']

# Splitting into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

# Define the column transformer with multiple preprocessing options
transformer = ColumnTransformer(
    transformers=[
        # Ordinal encoding for ordered categorical features
        ('ordinal',
         OrdinalEncoder(categories=[['Normal', 'Holiday']]),
         ['Release Period']),

        # Label encoding for non-ordered categorical features (applied as a pipeline step)
        # Sklearn's ColumnTransformer does not support LabelEncoder directly
        ('label',
         OrdinalEncoder(),
         ['Whether Remake', 'Whether Franchise', 'New Actor', 'New Director', 'New Music Director']),  # Replace 'NonOrderedColumn' with your actual column names for label encoding

        # One-hot encoding for nominal categorical features
        ('onehot',
         OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
         ['Lead Star', 'Director', 'Music Director','Genre']),

    ],
    # Default action for columns not explicitly mentioned
    remainder='passthrough'  # Keeps the columns not specified as-is
)

# Apply the column transformer to training and testing data
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)




In [31]:
# Replace 'hmv_transformer' with 'transformer' in the line where the error occurred
col_names = transformer.get_feature_names_out()

#In modified col names this code removes everything before the double underscore (), leaving just the base column names
modified_columns = [col.split('__')[-1] for col in col_names]

#Transforming Array into dataframes
X_train = pd.DataFrame(X_train_transformed, columns = modified_columns) # Use X_train_transformed here
X_test = pd.DataFrame(X_test_transformed, columns = modified_columns)
test_df = pd.DataFrame(X_test_transformed, columns = modified_columns)

In [32]:
X_train

Unnamed: 0,Release Period,Whether Remake,Whether Franchise,New Actor,New Director,New Music Director,Lead Star_Aadil Khan,Lead Star_Aakar Kaushik,Lead Star_Aakash,Lead Star_Aamir Bashir,...,Genre_fantasy,Genre_horror,Genre_love_story,Genre_masala,Genre_mythological,com,Genre_suspense,Genre_thriller,Number of Screens,Budget(INR)
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.0,2725000.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,87000.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2100.0,952662500.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2000.0,314550000.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,165.0,13500000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1150.0,494980000.0
1354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1000.0,115800000.0
1355,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,225.0,6500000.0
1356,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,625.0,255262500.0


In [33]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [34]:
# Instantiate regression algorithms
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
dtr = DecisionTreeRegressor(max_depth=5)
knr = KNeighborsRegressor()
rfr = RandomForestRegressor(n_estimators=50, random_state=2)
abr = AdaBoostRegressor(n_estimators=50, random_state=2)
bgr = BaggingRegressor(n_estimators=50, random_state=2)
etr = ExtraTreesRegressor(n_estimators=50, random_state=2)
gbr = GradientBoostingRegressor(n_estimators=50, random_state=2)
xgbr = XGBRegressor(n_estimators=50, random_state=2)


In [35]:
# Dictionary of regression algorithms
regressors = {
    'Linear Regression': lr,
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': elastic_net,
    'Decision Tree': dtr,
    'KNeighbors': knr,
    'Random Forest': rfr,
    'AdaBoost': abr,
    'Bagging': bgr,
    'ExtraTrees': etr,
    'GradientBoosting': gbr,
    'XGBoost': xgbr
}


In [36]:
def train_regressor(regressor, X_train, y_train, X_test, y_test):
    # Fit the model
    regressor.fit(X_train, y_train)
    # Predict on test data
    y_pred = regressor.predict(X_test)
    # Calculate regression evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mae, mse, r2


In [37]:
# Initialize lists to store metrics
mae_scores = []
mse_scores = []
r2_scores = []

# Iterate over regressors
for name, regressor in regressors.items():
    print(name)
    current_mae, current_mse, current_r2 = train_regressor(regressor, X_train, y_train, X_test, y_test)

    print("For", name)
    print("Mean Absolute Error - ", current_mae)
    print("Mean Squared Error - ", current_mse)
    print("R2 Score - ", current_r2)

    # Append metrics to respective lists
    mae_scores.append(current_mae)
    mse_scores.append(current_mse)
    r2_scores.append(current_r2)


Linear Regression
For Linear Regression
Mean Absolute Error -  4855217614.872962
Mean Squared Error -  1.709698346651943e+21
R2 Score -  -28002.90526705179
Ridge




For Ridge
Mean Absolute Error -  51366908.682977125
Mean Squared Error -  1.1125482476429894e+16
R2 Score -  0.8177708027089684
Lasso


  model = cd_fast.enet_coordinate_descent(


For Lasso
Mean Absolute Error -  122704087.36748566
Mean Squared Error -  2.8984716019492732e+16
R2 Score -  0.5252465189594558
ElasticNet
For ElasticNet
Mean Absolute Error -  42464923.76348782
Mean Squared Error -  8505400956697743.0
R2 Score -  0.8606862765492606
Decision Tree
For Decision Tree
Mean Absolute Error -  48667974.63609207
Mean Squared Error -  1.0208778353007296e+16
R2 Score -  0.8327859049230599
KNeighbors
For KNeighbors
Mean Absolute Error -  64757485.294117644
Mean Squared Error -  1.6075794537573526e+16
R2 Score -  0.7366874523775594
Random Forest
For Random Forest
Mean Absolute Error -  40256600.0
Mean Squared Error -  7465522860413235.0
R2 Score -  0.8777188997337303
AdaBoost
For AdaBoost
Mean Absolute Error -  159115919.38997883
Mean Squared Error -  3.0950140599620944e+16
R2 Score -  0.4930539606293691
Bagging
For Bagging
Mean Absolute Error -  40500141.176470585
Mean Squared Error -  7706941515130883.0
R2 Score -  0.8737645968301501
ExtraTrees
For ExtraTrees
Me