## Data Collection and Loading

In [5]:
import pandas as pd

# Read the raw box-office dataset into a DataFrame.
raw_csv_path = 'boxoffice1.csv'
df = pd.read_csv(raw_csv_path)

# Preview the leading rows to get a feel for the columns and their values.
preview = df.head()
print(preview)


             Title   Genre    Budget ReleaseDate  Popularity  Duration  \
0         Baaghi 3  Action  60000000      6/3/20         7.0       140   
1       Good Newwz  Comedy  45000000    27/12/19         7.5       132   
2          Tanhaji  Action  70000000     10/1/20         8.0       135   
3   Angrezi Medium  Comedy  40000000     13/3/20         7.2       140   
4  Shakuntala Devi   Drama  65000000     31/7/20         7.4       127   

        Director        Actor1               Actor2 Language Certification  \
0     Ahmed Khan  Tiger Shroff      Shraddha Kapoor    Hindi           U/A   
1      Raj Mehta  Akshay Kumar  Kareena Kapoor Khan    Hindi             U   
2        Om Raut    Ajay Devgn        Saif Ali Khan    Hindi             U   
3  Homi Adajania   Irrfan Khan        Radhika Madan    Hindi           U/A   
4      Anu Menon   Vidya Balan      Jisshu Sengupta    Hindi           U/A   

         State  Box Office Collection (INR)  Population  Urbanization  \
0  Maharashtr

## Data Pre-Processing

In [7]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)


Title                          0
Genre                          0
Budget                         0
ReleaseDate                    0
Popularity                     0
Duration                       0
Director                       0
Actor1                         0
Actor2                         0
Language                       0
Certification                  0
State                          0
Box Office Collection (INR)    0
Population                     0
Urbanization                   0
TheaterCount                   0
dtype: int64


In [9]:
from sklearn.preprocessing import LabelEncoder
import pickle
import joblib
# Name columns that are replaced by integer codes; other text columns are
# left untouched by this cell.
categorical_columns = ['Actor1', 'Actor2', 'Director']

# One dedicated LabelEncoder per column, keyed by the column name.
label_encoders = {col: LabelEncoder() for col in categorical_columns}

# Fit every encoder on its own column and overwrite that column with the codes.
# NOTE(review): this mutates `df` in place, so running the cell a second time
# would refit the encoders on the integer codes rather than the original names.
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Persist the fitted encoders so the same name -> code mapping can be reloaded.
with open('label_encoders.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)


In [11]:
# Display the DataFrame to inspect the label-encoded Director/Actor1/Actor2 columns.
df

Unnamed: 0,Title,Genre,Budget,ReleaseDate,Popularity,Duration,Director,Actor1,Actor2,Language,Certification,State,Box Office Collection (INR),Population,Urbanization,TheaterCount
0,Baaghi 3,Action,60000000,6/3/20,7.0,140,10,82,91,Hindi,U/A,Maharashtra,1370000000,10.0,75,1500
1,Good Newwz,Comedy,45000000,27/12/19,7.5,132,90,7,33,Hindi,U,Punjab,3180000000,9.0,78,1400
2,Tanhaji,Action,70000000,10/1/20,8.0,135,75,5,81,Hindi,U,Maharashtra,3670000000,11.0,70,1600
3,Angrezi Medium,Comedy,40000000,13/3/20,7.2,140,41,24,72,Hindi,U/A,Delhi,107200000,9.0,75,1400
4,Shakuntala Devi,Drama,65000000,31/7/20,7.4,127,20,87,30,Hindi,U/A,West Bengal,20000000,14.0,60,1300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,Sivakumarin Sabadham,"Comedy, Drama",25000000,10/3/21,6.8,120,99,74,60,Tamil,U/A,Tamil Nadu,30000000,77.0,48,1200
191,Maanadu,"Action, Sci-Fi",50000000,25/11/21,7.8,150,132,73,79,Tamil,U/A,Tamil Nadu,80000000,77.0,48,1800
192,Puducherry,Drama,25000000,14/1/22,6.9,130,99,32,52,Tamil,U/A,Tamil Nadu,30000000,77.0,48,1400
193,Bheemaa,"Action, Drama",40000000,7/10/22,7.5,150,40,94,53,Tamil,U/A,Tamil Nadu,60000000,77.0,48,1600


In [14]:
# Parse the day/month/two-digit-year strings in 'ReleaseDate' into timestamps.
df['ReleaseDate'] = pd.to_datetime(df['ReleaseDate'], format='%d/%m/%y')

# Break every timestamp into separate calendar-component columns.
release_dates = df['ReleaseDate'].dt
for new_column, component in (
    ('Release Day', 'day'),
    ('Release Month', 'month'),
    ('Release Year', 'year'),
):
    df[new_column] = getattr(release_dates, component)

# The combined 'ReleaseDate' column is redundant once it has been split.
# NOTE(review): because the column is removed in place, this cell cannot be
# re-run without reloading `df` first.
df.drop(columns=['ReleaseDate'], inplace=True)


In [15]:
# Display the DataFrame to confirm the Release Day/Month/Year columns replaced 'ReleaseDate'.
df

Unnamed: 0,Title,Genre,Budget,Popularity,Duration,Director,Actor1,Actor2,Language,Certification,State,Box Office Collection (INR),Population,Urbanization,TheaterCount,Release Day,Release Month,Release Year
0,Baaghi 3,Action,60000000,7.0,140,10,82,91,Hindi,U/A,Maharashtra,1370000000,10.0,75,1500,6,3,2020
1,Good Newwz,Comedy,45000000,7.5,132,90,7,33,Hindi,U,Punjab,3180000000,9.0,78,1400,27,12,2019
2,Tanhaji,Action,70000000,8.0,135,75,5,81,Hindi,U,Maharashtra,3670000000,11.0,70,1600,10,1,2020
3,Angrezi Medium,Comedy,40000000,7.2,140,41,24,72,Hindi,U/A,Delhi,107200000,9.0,75,1400,13,3,2020
4,Shakuntala Devi,Drama,65000000,7.4,127,20,87,30,Hindi,U/A,West Bengal,20000000,14.0,60,1300,31,7,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,Sivakumarin Sabadham,"Comedy, Drama",25000000,6.8,120,99,74,60,Tamil,U/A,Tamil Nadu,30000000,77.0,48,1200,10,3,2021
191,Maanadu,"Action, Sci-Fi",50000000,7.8,150,132,73,79,Tamil,U/A,Tamil Nadu,80000000,77.0,48,1800,25,11,2021
192,Puducherry,Drama,25000000,6.9,130,99,32,52,Tamil,U/A,Tamil Nadu,30000000,77.0,48,1400,14,1,2022
193,Bheemaa,"Action, Drama",40000000,7.5,150,40,94,53,Tamil,U/A,Tamil Nadu,60000000,77.0,48,1600,7,10,2022


In [17]:
from sklearn.preprocessing import MinMaxScaler
import pickle
import joblib

# Columns rescaled by the MinMaxScaler. The list contains the label-encoded
# Actor1/Actor2/Director codes in addition to the genuinely numeric fields.
numerical_columns = ['Actor1', 'Actor2', 'Director','Budget', 'TheaterCount', 'Popularity','Duration']

# Learn each column's minimum and maximum, then overwrite the columns with
# their rescaled values.
scaler = MinMaxScaler()
scaler.fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

# Persist the fitted scaler so the same transformation can be reloaded later.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [19]:
# Write the encoded and scaled DataFrame to disk without the index column.
processed_csv_path = 'boxoffice_processed v1.csv'
df.to_csv(processed_csv_path, index=False)


## Exploratory Data Analysis

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Input files: the raw CSV feeds the EDA plots, while the processed CSV
# (already encoded and scaled) feeds the model.
file_path_eda = 'boxoffice1.csv'
file_path_model = 'boxoffice_processed v1.csv'
df3 = pd.read_csv(file_path_eda)
df_model = pd.read_csv(file_path_model)

# Restore the fitted label encoders and scaler saved during preprocessing.
# pickle.load can execute arbitrary code, so only load files this notebook wrote.
with open('label_encoders.pkl', 'rb') as encoder_file:
    label_encoders = pickle.load(encoder_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

import os

# plt.savefig does not create missing directories, so make sure the output
# folder used by every figure in this notebook exists before plotting.
os.makedirs('static/images', exist_ok=True)

# Group the budgets into 10 equal-width bins and count the movies in each bin.
# The counts must come from 'Budget_Bins': counting the raw 'Budget' column
# would yield one bar per distinct budget value instead of a distribution.
df3['Budget_Bins'] = pd.cut(df3['Budget'], bins=10)
budget_counts = df3['Budget_Bins'].value_counts().sort_index()

plt.figure(figsize=(12, 6))
# The interval labels are converted to strings so they render as tick labels.
sns.barplot(x=budget_counts.index.astype(str), y=budget_counts.values)
plt.title('Distribution of Budget')
plt.xlabel('Budget')
# The bar heights are movie counts, not box-office revenue.
plt.ylabel('Number of Movies')
plt.xticks(rotation=45)
# Keep the rotated interval labels inside the saved image.
plt.tight_layout()
plt.savefig('static/images/boxoffice_distribution.png')
plt.close()

# Histogram (30 bins) of the raw popularity scores with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df3['Popularity'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Popularity')
ax.set_xlabel('Popularity')
ax.set_ylabel('Frequency')
fig.savefig('static/images/popularity_distribution.png')
plt.close(fig)

# Pairwise correlations between the float/int columns of the raw dataset.
numeric_data = df3.select_dtypes(include=[float, int])
correlations = numeric_data.corr()

# Annotated heatmap with the colour scale pinned to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('static/images/correlation_heatmap.png')
plt.close(fig)

# Number of movies recorded for each state, most frequent first.
movies_by_state = df3['State'].value_counts()

# Pie chart of each state's share, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(movies_by_state, labels=movies_by_state.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Movies by State')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
fig.savefig('static/images/movies_by_state.png')
plt.close(fig)

# Mean box-office collection per genre, highest first.
genre_boxoffice = df3.groupby('Genre')['Box Office Collection (INR)'].mean().sort_values(ascending=False)

plt.figure(figsize=(14, 8))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and scheduled
# for removal in 0.14. Mapping the x variable to `hue` with the legend
# suppressed gives the same per-bar colouring without the warning.
sns.barplot(
    x=genre_boxoffice.index,
    y=genre_boxoffice.values,
    hue=genre_boxoffice.index,
    palette='viridis',
    legend=False,
)
plt.title('Average Box Office Collection by Genre')
plt.xlabel('Genre')
plt.ylabel('Box Office Collection (INR)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('static/images/genre_by_collection.png')
plt.close()

# Assemble the model inputs from the processed dataset.
feature_columns = ['Actor1', 'Actor2', 'Director', 'Budget', 'TheaterCount', 'Popularity', 'Duration']
# Work on a copy so the column assignments below never write into a slice of
# df_model (which triggers pandas' chained-assignment warning).
X1 = df_model[feature_columns].copy()
y = df_model['Box Office Collection (INR)']

# 'boxoffice_processed v1.csv' is written by the preprocessing cells AFTER the
# label encoding and MinMax scaling steps, so its features are normally ready
# to use. Preprocessing is re-applied only when the name columns still hold
# raw (non-numeric) values; running scaler.transform on already scaled data
# would scale the features twice.
raw_label_columns = [
    col for col in ['Actor1', 'Actor2', 'Director']
    if not pd.api.types.is_numeric_dtype(X1[col])
]
for col in raw_label_columns:
    encoder = label_encoders[col]
    # Names the encoder never saw are replaced by 'Unknown'. transform() still
    # raises ValueError unless 'Unknown' is itself one of the fitted classes.
    X1[col] = X1[col].where(X1[col].isin(encoder.classes_), 'Unknown')
    X1[col] = encoder.transform(X1[col])

if raw_label_columns:
    # NOTE(review): assumes that a file with raw name columns also carries
    # unscaled numeric columns - confirm if such a file is ever supplied.
    X1_scaled = scaler.transform(X1)
else:
    # Features are already encoded and scaled; use them unchanged.
    X1_scaled = X1.to_numpy()

# Split the data into training and testing sets
X1_train, X1_test, y_train, y_test = train_test_split(X1_scaled, y, test_size=0.2, random_state=42)

# Train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X1_train, y_train)

# Make predictions
y_pred = model.predict(X1_test)


# --- Histogram of Actual vs Predicted ---
# Overlay the held-out targets and the model's predictions on shared axes.
fig, ax = plt.subplots(figsize=(12, 6))
for values, colour, legend_label in (
    (y_test, 'blue', 'Actual'),
    (y_pred, 'red', 'Predicted'),
):
    sns.histplot(values, bins=30, kde=True, color=colour, label=legend_label, alpha=0.6, ax=ax)
ax.set_title('Distribution of Actual vs Predicted Box Office Collections')
ax.set_xlabel('Box Office Collection (INR)')
ax.set_ylabel('Frequency')
ax.legend()
fig.tight_layout()
fig.savefig('static/images/actual_vs_predicted_histogram.png')
plt.close(fig)

# --- Model Performance Comparison Plot ---
# Scatter the predictions against the true values; points on the red diagonal
# are exact predictions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, label='Predicted vs Actual', color='blue', alpha=0.6, ax=ax)
lowest_actual, highest_actual = y_test.min(), y_test.max()
ax.plot(
    [lowest_actual, highest_actual],
    [lowest_actual, highest_actual],
    color='red',
    lw=2,
    label='Perfect Fit',
)
ax.set_title('Model Performance Comparison')
ax.set_xlabel('Actual Box Office Collection (INR)')
ax.set_ylabel('Predicted Box Office Collection (INR)')
ax.legend()
fig.tight_layout()
fig.savefig('static/images/model_performance_comparison.png')
plt.close(fig)

# --- Evaluation ---
# Score the held-out predictions and report each metric on its own line.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for metric_label, metric_value in (("Mean Squared Error", mse), ("R-squared", r2)):
    print(f"{metric_label}: {metric_value}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=genre_boxoffice.index, y=genre_boxoffice.values, palette='viridis')


Mean Squared Error: 9.259390750805715e+17
R-squared: 0.7714393054203155


## Model Building

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# Read the dataset written by the preprocessing cells (features are already
# label-encoded and MinMax-scaled there).
df = pd.read_csv('boxoffice_processed v1.csv')

# Feature matrix and regression target.
feature_columns = ['Actor1', 'Actor2', 'Director', 'Budget', 'TheaterCount', 'Popularity','Duration']
target_column = 'Box Office Collection (INR)'
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. No further scaling is applied at this point.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Candidate regressors, keyed by the display name used in the printed report.
models = {}
models['Linear Regression'] = LinearRegression()
models['Lasso Regression'] = Lasso(alpha=0.1)
models['Ridge Regression'] = Ridge(alpha=1.0)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)

def evaluate_model(model, X_test, y_test):
    """Score an already-fitted regressor on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature rows to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``, where ``rmse`` is the square root of
        ``mse``.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        mean_absolute_error(y_test, predictions),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
    )

# Fit every candidate on the training split and print its test-set metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    mae, mse, rmse, r2 = evaluate_model(model, X_test, y_test)
    report_lines = [
        f"{name} Evaluation:",
        f"Mean Absolute Error (MAE): {mae:.4f}",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"Root Mean Squared Error (RMSE): {rmse:.4f}",
        f"R-squared (R2): {r2:.4f}",
        '-' * 50,
    ]
    print("\n".join(report_lines))

# Persist the Random Forest as the chosen model. Reuse the instance that was
# already fitted and evaluated in the loop above: the saved model is then
# exactly the one whose metrics were reported, and an identical forest is not
# trained a second time.
# Only the model is written here; the label encoders and scaler are saved by
# the preprocessing cells ('label_encoders.pkl' and 'scaler.pkl').
best_model = models['Random Forest']

# joblib.dump returns the list of written file names, shown as the cell output.
joblib.dump(best_model, 'random_forest_model v1.pkl')


Linear Regression Evaluation:
Mean Absolute Error (MAE): 582669846.6359
Mean Squared Error (MSE): 1446573180326179328.0000
Root Mean Squared Error (RMSE): 1202735706.7644
R-squared (R2): 0.6429
--------------------------------------------------
Lasso Regression Evaluation:
Mean Absolute Error (MAE): 582669846.4977
Mean Squared Error (MSE): 1446573182706900480.0000
Root Mean Squared Error (RMSE): 1202735707.7542
R-squared (R2): 0.6429
--------------------------------------------------
Ridge Regression Evaluation:
Mean Absolute Error (MAE): 584789818.6799
Mean Squared Error (MSE): 1845351189041702912.0000
Root Mean Squared Error (RMSE): 1358437039.0422
R-squared (R2): 0.5445
--------------------------------------------------
Gradient Boosting Evaluation:
Mean Absolute Error (MAE): 478869126.9307
Mean Squared Error (MSE): 1293329483702541312.0000
Root Mean Squared Error (RMSE): 1137246448.0941
R-squared (R2): 0.6808
--------------------------------------------------
Random Forest Evaluati

['random_forest_model v1.pkl']

In [27]:
# Rebind `model` to a fresh Random Forest and fit it on the training split
# created in the model-building cell.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)



In [28]:
# Show the class of the estimator currently bound to `model`.
model_class = type(model)
print(model_class)

<class 'sklearn.ensemble._forest.RandomForestRegressor'>
