### Data loading and preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import svm
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [None]:
# Load the preprocessed movie dataset and keep only titles that grossed
# less than 100 million.
df = pd.read_json(r'C:\Code\ML_Project\Data\preprocessed_movieDB.json')
# .copy() detaches the filtered rows from the original frame so that columns
# added to `df` afterwards are written to an independent DataFrame rather
# than to a slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['revenue'] < 100 * 1e+6].copy()

In [None]:
# Bucket revenue into fixed-width bins; `bin_size` is the width in millions.
bin_size = 20
bin_width = bin_size * 1e+6
df['revenue_bin'] = np.floor(df['revenue'].div(bin_width)).astype(int)

In [None]:
# Target is the revenue bin; features are every remaining column except the
# title and the two revenue-derived columns.
y = df['revenue_bin']
X = df.drop(columns=['title', 'revenue', 'revenue_bin'])

In [None]:
# Hold out 20% of the rows, then halve the hold-out into validation and test
# sets (roughly an 80/10/10 split overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.2,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.5,
)

In [None]:
# Oversample the training split only with SMOTE; validation and test data
# are left untouched.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
# Standard scale
# The scaler's mean/variance are learned from the training data only and then
# re-applied to the held-out sets. Calling fit_transform on X_test would refit
# the scaler on test data, so the test features would be scaled with different
# statistics than the ones the model was trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### Fine-tuning with grid search

In [None]:
# Base RBF-kernel SVM whose C and gamma are tuned by the grid search.
model = svm.SVC(
    random_state=42,
    kernel='rbf',
)

In [None]:
# Candidate values for the SVM regularisation strength (C) and the RBF kernel
# coefficient (gamma). Value order is kept as-is.
param_grid = dict(
    C=[0.1, 1.0, 10, 20, 100, 1000],
    gamma=[0.001, 0.0001, 0.01, 0.1, 0.3, 1],
)

In [None]:
# Exhaustive search over param_grid, scored by weighted F1 with 5-fold CV.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
)

In [None]:
# Run the search on the training data.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination selected by the grid search; the trailing expression
# displays it as the cell output.
best_params = grid_search.best_params_
best_params

In [None]:
# Score the best estimator found by the search with the same protocol
# (5-fold CV, weighted F1) on the training data and display the fold scores.
final_model = grid_search.best_estimator_
scores = cross_val_score(
    estimator=final_model,
    X=X_train,
    y=y_train,
    scoring='f1_weighted',
    cv=5,
)
scores

### Training with the selected hyperparameters

In [None]:
# Train an RBF SVM with fixed C/gamma values and predict the test set.
model = svm.SVC(C=100, gamma=0.3, kernel='rbf').fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# 5-fold weighted-F1 scores of the model on the training data: print the
# per-fold values first, then their mean.
scores = cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=5)
for value in (scores, scores.mean()):
    print(value)

In [None]:
# Confusion matrix of the test-set predictions.
# The result is stored under the name `confusion_matrix`, which rebinds the
# function imported from sklearn.metrics. Importing the function under an
# alias here keeps this cell re-runnable: without it, a second execution would
# try to call the array produced by the first one and raise TypeError.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

confusion_matrix = compute_confusion_matrix(y_test, y_pred)
confusion_matrix

In [None]:
# Weighted F1 of the test-set predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"F1 Score: {f1}")

In [None]:
import matplotlib.pyplot as plt

# Draw the confusion matrix (the array stored in `confusion_matrix`) as an
# annotated heat map.
plt.figure(figsize=(10, 7))
plt.imshow(confusion_matrix, interpolation='nearest', cmap='Blues')
plt.title('Confusion Matrix')
plt.colorbar()

# Derive the tick labels from the labels that actually occur instead of
# hard-coding them: sklearn orders the matrix rows/columns by the sorted union
# of the labels in y_test and y_pred, so this keeps the axis labels equal to
# the real bin ids and the number of ticks equal to the matrix size.
observed_labels = np.unique(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred)])
)
classes = [str(label) for label in observed_labels]
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Use white text on dark cells and black text on light ones.
thresh = confusion_matrix.max() / 2.
for i, j in np.ndindex(confusion_matrix.shape):
    count = confusion_matrix[i, j]
    plt.text(j, i, format(count, 'd'),
             horizontalalignment="center",
             color="white" if count > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()

In [None]:
# Per-class precision/recall/F1 on the test set; zero_division=0 is passed
# through to classification_report.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)