In [166]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [167]:
df = pd.read_csv("../data/Film_Dataset.csv")

In [168]:
df.head()

Unnamed: 0,Film_Name,Release_Date,Category,Language,Viewer_Rate,Number_of_Views,Viewing_Month
0,Chennai Express,9/12/2021,Romance,Hindi,4.5,36395,2022-09
1,Mountain Trail,4/27/2020,Comedy,English,3.5,93162,2021-07
2,Eternal Hope,7/4/2020,Romance,English,4.6,98663,2021-04
3,Seoul Heartbeat,4/17/2020,Comedy,English,4.1,112635,2020-07
4,Shadow Pact,2/24/2022,Documentary,English,4.3,29496,2023-05


In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Film_Name        460 non-null    object 
 1   Release_Date     460 non-null    object 
 2   Category         460 non-null    object 
 3   Language         460 non-null    object 
 4   Viewer_Rate      460 non-null    float64
 5   Number_of_Views  460 non-null    int64  
 6   Viewing_Month    460 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 25.3+ KB


In [170]:
df.duplicated().any()

np.False_

In [None]:
# Convert dates
df['Release_Date'] = pd.to_datetime(df['Release_Date'])
df['Viewing_Month'] = pd.to_datetime(df['Viewing_Month'])

# Create features
df['Release_Year'] = df['Release_Date'].dt.year
df['Release_Month'] = df['Release_Date'].dt.month
df['Viewing_Year'] = df['Viewing_Month'].dt.year


df['Movie_Age'] = 2025 - df['Release_Year']
df['Month_Number'] = df['Viewing_Month'].dt.month

# df['Age_at_View'] = (df['Viewing_Month'] - df['Release_Date']).dt.days
# df['Age_at_View'] = df['Age_at_View'] / 30   # convert to months

#df['Rating_Age_Interaction'] = df['Viewer_Rate'] * df['Age_at_View']


In [172]:
df.head()

Unnamed: 0,Film_Name,Release_Date,Category,Language,Viewer_Rate,Number_of_Views,Viewing_Month,Release_Year,Release_Month,Movie_Age,Month_Number
0,Chennai Express,2021-09-12,Romance,Hindi,4.5,36395,2022-09-01,2021,9,4,9
1,Mountain Trail,2020-04-27,Comedy,English,3.5,93162,2021-07-01,2020,4,5,7
2,Eternal Hope,2020-07-04,Romance,English,4.6,98663,2021-04-01,2020,7,5,4
3,Seoul Heartbeat,2020-04-17,Comedy,English,4.1,112635,2020-07-01,2020,4,5,7
4,Shadow Pact,2022-02-24,Documentary,English,4.3,29496,2023-05-01,2022,2,3,5


In [173]:
df_model = pd.get_dummies(df, columns=['Category', 'Language'], drop_first=True)

In [174]:
X = df_model.drop(['Number_of_Views', 'Film_Name', 'Viewing_Month', 'Release_Date'], axis=1)
y = df_model['Number_of_Views']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [175]:
df_model.head()

Unnamed: 0,Film_Name,Release_Date,Viewer_Rate,Number_of_Views,Viewing_Month,Release_Year,Release_Month,Movie_Age,Month_Number,Category_Comedy,...,Language_French,Language_Hindi,Language_Japanese,Language_Korean,Language_Malayalam,Language_Pakistani,Language_Sinhala,Language_Spanish,Language_Taiwanese,Language_Tamil
0,Chennai Express,2021-09-12,4.5,36395,2022-09-01,2021,9,4,9,False,...,False,True,False,False,False,False,False,False,False,False
1,Mountain Trail,2020-04-27,3.5,93162,2021-07-01,2020,4,5,7,True,...,False,False,False,False,False,False,False,False,False,False
2,Eternal Hope,2020-07-04,4.6,98663,2021-04-01,2020,7,5,4,False,...,False,False,False,False,False,False,False,False,False,False
3,Seoul Heartbeat,2020-04-17,4.1,112635,2020-07-01,2020,4,5,7,True,...,False,False,False,False,False,False,False,False,False,False
4,Shadow Pact,2022-02-24,4.3,29496,2023-05-01,2022,2,3,5,False,...,False,False,False,False,False,False,False,False,False,False


In [176]:
model = RandomForestRegressor(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

In [177]:
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 11410.254710144925
R2 Score: 0.7672728934379963


In [178]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='r2'
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

In [179]:
y_pred = best_model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 11263.339836791505
R2 Score: 0.7721612648340849


In [180]:
december_data = df_model[df_model['Month_Number'] == 12]
december_data['Predicted_Views'] = model.predict(december_data[X.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  december_data['Predicted_Views'] = model.predict(december_data[X.columns])


In [181]:
top_december = december_data[['Film_Name', 'Predicted_Views']].sort_values(
    by='Predicted_Views', ascending=False
).head(10)
top_december.shape
print(top_december)

            Film_Name  Predicted_Views
339       Mirror Maze    176198.683333
79   The King's Guard    165967.923333
84       Kandy Dreams    159321.140000
132  Tech Singularity    158023.093333
288       Mirror Maze    156027.676667
353   Chennai Express    153795.190000
381      Eternal Hope    153041.196667
35     A Summer Dream    152145.783333
111       Future City    150381.836667
86        Future City    148556.013333
