In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem; the next cell copies the
# project data out of this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Recursively copy the project's Data directory from the mounted Drive into the
# current working directory, so later cells can use the relative path 'Data/...'.
!cp -r /content/drive/MyDrive/Colab_Notebooks/Computer_Programming/project_model_training/Data .

In [3]:
import torch

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer # treat only specific columns like encode this and not that
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [5]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Display which torch device was selected.
device

device(type='cuda')

In [7]:
# Load the movie dataset from the locally copied Data directory.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning (possibly a mixed-type DtypeWarning) — consider passing explicit
# dtypes for the affected columns; confirm by re-running.
movie_csv_path = 'Data/movie_data_w_star_power.csv'
df = pd.read_csv(movie_csv_path)

  df = pd.read_csv('Data/movie_data_w_star_power.csv')


In [8]:
# Model inputs and the class label to predict.
# NOTE(review): 'runtime' is selected here, but the preprocessing cell lists no
# transformer for it, so it does not reach the model — confirm that is intended.
feature_columns = [
    'budget',
    'release_year',
    'release_month',
    'runtime',
    'certificate',
    'star_power',
    'director_power',
]
X = df[feature_columns]
y = df['movie_class']

In [9]:
# Number of distinct target classes (a missing value, if any, counts as one).
df['movie_class'].nunique(dropna=False)

9

In [10]:
# Column groups, named after the preprocessing each group receives.
categorical_features = ['certificate']
numeric_standard = ['release_month', 'release_year']
numeric_min_max = ['budget', 'director_power', 'star_power']

# Calendar features: iterative imputation followed by standardisation.
numeric_transformer_imputer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=0)),
    ('scaler_standard', StandardScaler()),
])

# Budget / power features: min-max scaling only.
# NOTE(review): this group has no imputer, so any missing values would
# presumably reach the model unchanged — verify these columns are complete.
numeric_transformer_min_max = Pipeline(steps=[
    ('scaler_min_max', MinMaxScaler()),
])

# Certificate: fill missing entries with 'Not Rated', then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Not Rated')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Output columns follow the order of the transformers below.
# remainder='drop' is the ColumnTransformer default, spelled out because the
# feature frame also contains 'runtime', which no group lists and is therefore
# discarded. NOTE(review): confirm that dropping 'runtime' is intentional.
preprocessor = ColumnTransformer(
    transformers=[
        ('num1', numeric_transformer_imputer, numeric_standard),
        ('cat', categorical_transformer, categorical_features),
        ('num2', numeric_transformer_min_max, numeric_min_max),
    ],
    remainder='drop',
)



In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [12]:
# Fit the preprocessing on the training split only and reuse the fitted
# transformers on the test split. Each result is densified with .toarray()
# and cast to float32.
X_train1 = preprocessor.fit_transform(X_train).toarray().astype('float32')
X_test1 = preprocessor.transform(X_test).toarray().astype('float32')

array(['below_avg', 'disaster', 'all_time_blockbuster', 'flop',
       'above_average', 'blockbuster', 'hit', 'super_hit', 'average'],
      dtype=object)

In [13]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the class labels as dense arrays. The encoder is fitted on the
# training labels only and then applied unchanged to the test labels.
oe = OneHotEncoder(sparse_output=False)

# The encoder expects a 2-D input, so each label series becomes a single column.
train_label_column = y_train.values.reshape(-1, 1)
test_label_column = y_test.values.reshape(-1, 1)

y_train1 = oe.fit_transform(train_label_column)
y_test1 = oe.transform(test_label_column)

In [38]:
# Distinct class indices present in the test targets (argmax undoes the one-hot encoding).
np.unique(y_test1.argmax(axis=1))

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [14]:
# Network dimensions, derived from the prepared arrays so they always match
# the data: one input per preprocessed feature column and one output per
# one-hot class column (previously a hard-coded 9).
input_size = X_train1.shape[1]
output_size = y_train1.shape[1]

In [15]:
# # Convert the numpy arrays to torch tensors.
# # Torch tensors are PyTorch's counterpart to numpy arrays.
# input = torch.from_numpy(X_train1)
# targets = torch.from_numpy(y_train1)

# type(input), type(targets) # torch.Tensor

# # same thing
# input.size(), targets.size()
# input.shape, targets.shape

In [16]:
# # Convert the numpy arrays to torch tensors.
# # Torch tensors are PyTorch's counterpart to numpy arrays.
# test_input = torch.from_numpy(X_test1)
# test_targets = torch.from_numpy(y_test1)

# type(test_input), type(test_targets) # torch.Tensor

# # same thing
# print(test_input.size(), test_targets.size())
# print(test_input.shape, test_targets.shape)

In [17]:
# # define a dataset so that dataloaders understand.
# from torch.utils.data import TensorDataset

# train_dataset = TensorDataset(input, targets)
# test_dataset = TensorDataset(test_input, test_targets)

# train_dataset[0: 2]

In [18]:
# from torch.utils.data import DataLoader

# batch_size = 2048 # mini-batch size
# train_dl = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4) # shuffle should be true.
# test_dl = DataLoader(test_dataset, batch_size, num_workers=4) # no shuffling for the test loader.

In [19]:
# Print the shape of each prepared matrix: train features, train targets,
# test features, test targets.
for prepared_array in (X_train1, y_train1, X_test1, y_test1):
    print(prepared_array.shape)

(783296, 36)
(783296, 9)
(195825, 36)
(195825, 9)


In [94]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Rebalance the preprocessed training split (X_train1 / y_train1) only; the
# test split is not resampled. A fixed random_state makes the resampled rows
# identical on every run, which the unseeded samplers did not guarantee.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# NOTE(review): the saved classification reports show several classes with only
# a few hundred training rows next to one with ~670k, so random oversampling
# repeats those rows heavily — check the train/test gap for overfitting.
X_resampled, y_resampled = oversampler.fit_resample(X_train1, y_train1)
# Alternative kept for experimentation:
# X_resampled, y_resampled = undersampler.fit_resample(X_train1, y_train1)

In [95]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import precision_score, recall_score, f1_score
from keras import backend as K
import tensorflow as tf
from sklearn.utils import class_weight

# Define custom metrics using scikit-learn functions
def precision(y_true, y_pred):
    """Weighted precision computed with scikit-learn.

    Args:
        y_true: Ground-truth labels, either one-hot rows (2-D) or integer
            class ids (1-D).
        y_pred: Per-class scores; the predicted class is the argmax over the
            last axis.

    Returns:
        The support-weighted average precision as a float.

    Both arguments are converted to plain integer label arrays before being
    handed to scikit-learn. The original passed one-hot targets and a tensor
    straight through, which scikit-learn rejects as mismatched target types.
    Reading tensor values requires eager execution.
    """
    y_pred = K.argmax(y_pred).numpy()
    y_true = np.asarray(y_true)
    if y_true.ndim > 1:
        # One-hot rows -> integer class ids, mirroring f1_score_metric.
        y_true = y_true.argmax(axis=-1)
    return precision_score(y_true, y_pred, average='weighted')

def recall(y_true, y_pred):
    """Weighted recall computed with scikit-learn.

    Args:
        y_true: Ground-truth labels, either one-hot rows (2-D) or integer
            class ids (1-D).
        y_pred: Per-class scores; the predicted class is the argmax over the
            last axis.

    Returns:
        The support-weighted average recall as a float.

    Both arguments are converted to plain integer label arrays before being
    handed to scikit-learn. The original passed one-hot targets and a tensor
    straight through, which scikit-learn rejects as mismatched target types.
    Reading tensor values requires eager execution.
    """
    y_pred = K.argmax(y_pred).numpy()
    y_true = np.asarray(y_true)
    if y_true.ndim > 1:
        # One-hot rows -> integer class ids, mirroring f1_score_metric.
        y_true = y_true.argmax(axis=-1)
    return recall_score(y_true, y_pred, average='weighted')

def f1_score_metric(y_true, y_pred):
    """Weighted F1 score computed with scikit-learn.

    Args:
        y_true: One-hot encoded ground truth; class ids are recovered with an
            argmax along axis 1.
        y_pred: Per-class scores; the predicted class is the argmax over the
            last axis. Must be an eager tensor, because ``.numpy()`` is called
            on the result.

    Returns:
        The support-weighted average F1 score.
    """
    true_ids = np.argmax(y_true, axis=1)
    predicted_ids = K.argmax(y_pred).numpy()
    return f1_score(true_ids, predicted_ids, average='weighted')

# Feed-forward classifier: four ReLU hidden layers followed by a softmax layer
# with one unit per class.
model = Sequential()
# Only the first layer declares the input width. Each later layer takes its
# input from the layer before it, so the input_dim previously repeated on the
# hidden layers had no effect and was misleading.
model.add(Dense(1024, activation='relu', input_dim=input_size))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
# Output width follows output_size rather than a second hard-coded 9.
model.add(Dense(output_size, activation='softmax'))
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# class_weights = class_weight.compute_class_weight('balanced', np.unique(np.argmax(y_train1, axis=1)), np.argmax(y_train1, axis=1))
# run_eagerly=True is kept from the original configuration. Only the
# numpy-based custom metrics defined in this cell would need it, and none of
# them is passed in `metrics`.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Recall()], run_eagerly=True)

# hist = model.fit(X_train1, y_train1, validation_data=(X_test1, y_test1), epochs=100, batch_size=20480000, class_weight=dict(enumerate(class_weights)))
# Train on the oversampled training data and track metrics on the test split.
# NOTE(review): the test split doubles as validation data here; use a separate
# validation split if these curves ever drive model selection.
hist = model.fit(X_resampled, y_resampled, validation_data=(X_test1, y_test1), epochs=100, batch_size=204800)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [86]:
# import tensorflow as tf
# from keras import backend as K

# def focal_loss(gamma=10.0, alpha=0.55):
#     def focal_loss_fixed(y_true, y_pred):
#         pt = y_pred * y_true + (1 - y_pred) * (1 - y_true)
#         pt = K.clip(pt, K.epsilon(), 1 - K.epsilon())
#         return -K.sum(alpha * K.pow(1 - pt, gamma) * K.log(pt), axis=-1)
#     return focal_loss_fixed

# model = Sequential()
# model.add(Dense(1024, activation='relu', input_dim=input_size))
# model.add(Dense(256, activation='relu', input_dim=input_size))
# model.add(Dense(128, activation='relu', input_dim=input_size))
# model.add(Dense(64, activation='relu', input_dim=input_size))
# model.add(Dense(9, activation='softmax'))

# model.compile(optimizer='adam', loss=focal_loss(), metrics=['accuracy', tf.keras.metrics.Recall()], run_eagerly=True)
# model.fit(X_train1, y_train1, epochs=100, batch_size=20400)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7f09df11c3d0>

In [96]:
from sklearn.metrics import classification_report

# Score the original (not resampled) training split.
# model.predict runs the forward pass in batches, so the whole feature matrix
# no longer has to go through the network in a single call as with
# model(X_train1).
prediction = model.predict(X_train1, batch_size=8192, verbose=0)
predictions = tf.argmax(prediction, axis=1)
yhat_train = predictions.numpy()
# Recover integer class ids from the one-hot targets.
y_true_train = np.argmax(y_train1, axis=1)

print(classification_report(y_true_train, yhat_train))

              precision    recall  f1-score   support

           0       0.10      0.97      0.18       147
           1       0.09      0.59      0.16      3672
           2       0.11      0.96      0.21        94
           3       1.00      0.97      0.98    667643
           4       0.17      0.59      0.27       370
           5       1.00      0.97      0.98    110709
           6       0.15      0.72      0.25       273
           7       0.18      0.79      0.30       121
           8       0.14      0.72      0.23       267

    accuracy                           0.96    783296
   macro avg       0.33      0.81      0.40    783296
weighted avg       0.99      0.96      0.98    783296



In [97]:
from sklearn.metrics import classification_report

# Score the held-out test split.
# model.predict runs the forward pass in batches, so the whole feature matrix
# no longer has to go through the network in a single call as with
# model(X_test1).
prediction = model.predict(X_test1, batch_size=8192, verbose=0)
predictions = tf.argmax(prediction, axis=1)
yhat_test = predictions.numpy()
# Recover integer class ids from the one-hot targets.
y_true_test = np.argmax(y_test1, axis=1)

print(classification_report(y_true_test, yhat_test))

              precision    recall  f1-score   support

           0       0.02      0.16      0.03        37
           1       0.08      0.54      0.15       918
           2       0.00      0.00      0.00        24
           3       1.00      0.97      0.98    166912
           4       0.07      0.24      0.11        92
           5       1.00      0.97      0.98     27677
           6       0.05      0.26      0.09        68
           7       0.01      0.03      0.01        30
           8       0.04      0.19      0.06        67

    accuracy                           0.96    195825
   macro avg       0.25      0.37      0.27    195825
weighted avg       0.99      0.96      0.98    195825

