In [None]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

%matplotlib inline

In [None]:
# Load the movies dataset from the CSV next to the notebook
data_path = 'data_movies_v3.csv'
df = pd.read_csv(data_path)


In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Min-max scale 'revenue' and 'popularity' in place on df
cols_to_scale = ['revenue', 'popularity']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[cols_to_scale])
df[cols_to_scale] = scaled_values

In [None]:
# Filter rows and select the modelling columns.
numeric_cols = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
text_cols = ['sentiment', 'subjective']
genre_cols = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure', 'Crime',
              'Science Fiction', 'Horror', 'Family', 'Fantasy', 'Mystery', 'Animation',
              'History', 'Music', 'War', 'Documentary', 'Western', 'Foreign']

# Rows are kept only when every one of these columns is non-zero.
# NOTE(review): 'revenue' and 'popularity' were min-max scaled in an earlier cell,
# so a 0 in those columns is the column minimum rather than necessarily an
# original zero -- confirm this is the intended filter.
nonzero_cols = numeric_cols + text_cols
keep_rows = (df[nonzero_cols] != 0).all(axis=1)

# Column order here defines the feature order used by the models downstream.
model_cols = numeric_cols + genre_cols + ['num_of_production_companies'] + text_cols

# Single .loc selection plus an explicit copy: df_for_model is modified later
# (its 'popularity' column is overwritten), so it must not alias df.
df_for_model = df.loc[keep_rows, model_cols].copy()



In [None]:
# Replace 'popularity' with a quantile-based class label (1, 2 or 3):
# pd.qcut splits the column into three quantile bins.
popularity_labels = [1, 2, 3]
bins = pd.qcut(
    df_for_model['popularity'],
    q=3,
    labels=popularity_labels,
)
df_for_model['popularity'] = bins

In [None]:
# Target is the binned popularity; features are everything except the target and revenue
y = df_for_model['popularity']
X = df_for_model.drop(columns=['popularity', 'revenue'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics are learned on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# SVM with class weights, tuned by grid search.
# The class_weight given to the constructor is only a starting value:
# 'class_weight' is also part of the grid below, so the search picks the final setting.
svm = SVC(class_weight='balanced')

# Hyperparameter values to search
params = {'C': [0.1, 1, 10],
          'kernel': ['linear', 'rbf'],
          'class_weight': ['balanced', None]}

# 5-fold cross-validated search over all combinations, ranked by weighted F1
grid = GridSearchCV(svm, param_grid=params, scoring='f1_weighted', cv=5)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

# Evaluate the tuned model rather than the untuned starting estimator.
# With the default refit=True, best_estimator_ has already been refitted on
# the whole training set using the best parameter combination.
svm = grid.best_estimator_
y_pred = svm.predict(X_test)

# Test-set metrics (weighted averages across the classes)
accuracy_svm = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Fix the row/column order to the model's class labels so the plot ticks match
conf_matrix = confusion_matrix(y_test, y_pred, labels=svm.classes_)

print("Accuracy:", accuracy_svm)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Plot the confusion matrix with the actual class labels on both axes
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=svm.classes_, yticklabels=svm.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('SVM confusion matrix')
plt.show()

Multinomial Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes on the raw (unstandardized) features.
# The features are rebuilt from df_for_model because X_train / X_test were
# replaced by standardized arrays in an earlier cell.
X = df_for_model.drop(['popularity', 'revenue'], axis=1)
y = df_for_model['popularity']

# Split the data (same test_size and random_state as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MultinomialNB rejects negative feature values, so every feature is shifted up by 1.
# NOTE(review): this presumably relies on no feature being below -1
# (e.g. 'sentiment') -- confirm the value ranges.
constant_shift = 1
X_train = X_train + constant_shift
X_test = X_test + constant_shift

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# predictions
y_pred = naive_bayes.predict(X_test)

# Test-set metrics (weighted averages across the classes)
accuracy_nb = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Fix the row/column order to the model's class labels so the plot ticks match
conf_matrix = confusion_matrix(y_test, y_pred, labels=naive_bayes.classes_)

print("Accuracy:", accuracy_nb)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Plot the confusion matrix with the actual class labels on both axes
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=naive_bayes.classes_, yticklabels=naive_bayes.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Multinomial Naive Bayes confusion matrix')
plt.show()

In [None]:
# Side-by-side comparison of the two test accuracies computed above
models = ['SVM', 'Naive Bayes']
accuracies = [accuracy_svm, accuracy_nb]

# Bar chart drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, accuracies, color=['blue', 'green'])

# Axis labels and title
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison: SVM vs. Naive Bayes')

# Write each accuracy (two decimals) just above its bar
for idx in range(len(accuracies)):
    acc = accuracies[idx]
    ax.text(idx, acc, f'{acc:.2f}', ha='center', va='bottom', fontsize=12)

# Show the plot
plt.show()