# Preliminaries

### Imports

In [177]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import numpy as np

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing

### Preprocessing

Reading the combined data

In [178]:
# Load the combined dataset and print its column index as a quick schema check.
df = pd.read_csv(filepath_or_buffer='CleanedData/dataset.csv')
print(df.columns)

Transforming features into usable data types

In [179]:
# The target arrives as a float rating; turn it into three ordinal classes:
#   0 = bad      (0    <= rating <= 3.33)
#   1 = neutral  (3.33 <  rating <= 6.66)
#   2 = good     (everything else)
# Any value that matches neither of the first two ranges -- including ratings
# outside 0-6.66 and missing ratings, whose comparisons are all False -- ends
# up in class 2.
# NOTE: the column is overwritten in place, so running this cell a second time
# re-bins the class labels themselves (0, 1 and 2 all fall in the "bad" range).
rating = df['averageRating']
is_bad = (rating >= 0) & (rating <= 3.33)
is_neutral = (rating > 3.33) & (rating <= 6.66)

df['averageRating'] = np.where(is_bad, 0, np.where(is_neutral, 1, 2))


EDA & Data Cleaning

1. Removing the isAdult feature

In [None]:
# Show the distinct values that occur in the isAdult column.
df.loc[:, 'isAdult'].unique()

In [None]:
# Histogram of isAdult, with each bar's height written just above the bar.
ax = sns.histplot(data=df, x='isAdult')
ax.set_xlabel('isAdult')
ax.set_ylabel('Count')
ax.set_title('Histogram of isAdult')

for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    # Anchor the label at the top-centre of the bar.
    ax.annotate(f"{bar_top}", (bar_center, bar_top), ha='center', va='bottom')

plt.show()

In [180]:
# Drop isAdult: only 4 movies in the dataset are adult movies.
df = df.drop('isAdult', axis='columns')

Checking the releaseYear values

In [None]:
# Box plot of releaseYear to inspect its spread and outliers.
ax = sns.boxplot(data=df, x='releaseYear')
ax.set_title('Boxplot of Release Year')
ax.set_xlabel('Release Year')
plt.show()

Checking the runtimeMinutes values

In [None]:
# Box plot of runtimeMinutes to inspect its spread and outliers.
ax = sns.boxplot(data=df, x='runtimeMinutes')
ax.set_title('Boxplot of Runtime Minutes')
ax.set_xlabel('Runtime Minutes')
plt.show()

In [None]:
# Count the rows whose runtime is shorter than 50 or longer than 240 minutes.
# Summing the boolean mask gives the number of True entries.
below_50 = int((df['runtimeMinutes'] < 50).sum())
over_240 = int((df['runtimeMinutes'] > 240).sum())

print(f"Number of samples below 50: {below_50}")
print(f"Number of samples over 240: {over_240}")

In [None]:
# Keep only rows whose runtime lies within [50, 240] minutes (bounds included).
runtime = df['runtimeMinutes']
df = df.loc[(runtime >= 50) & (runtime <= 240)]

In [None]:
# Remove the temporary counters from the notebook namespace.
del below_50, over_240

Checking the value counts of the categorical `genres` feature

In [None]:
# Expand the comma-separated genres strings into 0/1 indicator columns, then
# total each column to get the number of movies per genre, largest first.
genres_exploded = df['genres'].str.get_dummies(sep=',')
genres_counts = genres_exploded.sum().sort_values(ascending=False)

# Horizontal bar chart with one bar per genre.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=genres_counts.values, y=genres_counts.index, palette='viridis', ax=ax)
ax.set_title('Distribution of Movie Genres')
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genres')
plt.show()

In [None]:
# Print the number of samples tagged Musical, Western, News, Adult or Film-Noir,
# taken from the genres_exploded indicator frame.
# reindex(..., fill_value=0) reports 0 for a genre that has no indicator column
# instead of raising KeyError, and the printed Series labels every count.
genres_to_check = ['Musical', 'Western', 'News', 'Adult', 'Film-Noir']
print(genres_exploded.reindex(columns=genres_to_check, fill_value=0).sum().to_string())

Labeling / encoding

In [182]:
# Replace each studio value with an integer code from LabelEncoder.
# The fitted encoder is kept in label_encoder.
label_encoder = LabelEncoder().fit(df['studio'])
df['studio'] = label_encoder.transform(df['studio'])

In [183]:
# One-hot encode the comma-separated genres string: one 0/1 column per genre,
# appended after the existing columns, with the original genres column removed.
# get_dummies(sep=',') splits and encodes in a single step, so the values do
# not need to be split into lists and re-joined with '|' first.
genre_dummies = df['genres'].str.get_dummies(sep=',')
df = pd.concat([df.drop(columns='genres'), genre_dummies], axis=1)

Scaling

In [184]:
# Columns that are min-max scaled; every other column is carried over as is.
columns_to_scale = [
    'releaseYear', 'runtimeMinutes', 'studio', 'lifetimeGross',
    'nrOfLanguages', 'director_nrOfMovies', 'nrOfEmployees',
    'nrOfActors', 'numVotes', 'sameYearTotalMoviesReleased',
]

# Fit the scaler on the selected columns and transform them in one pass.
# The scaler is fitted on every row currently in df.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# Write the scaled values into a copy so df keeps the unscaled data.
df_scaled = df.copy()
df_scaled[columns_to_scale] = scaled_values

In [185]:
# Write the scaled frame to disk; index=False leaves the row index out of the CSV.
df_scaled.to_csv(path_or_buf='Data/dataset_preprocessed.csv', index=False)