# Preliminaries

### Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import numpy as np

### Functions

# Preprocessing and dimensionality reduction

### Preprocessing

Reading the combined data

In [2]:
# Load the combined, cleaned dataset that every later cell works from.
df = pd.read_csv(filepath_or_buffer='CleanedData/dataset.csv')

Binning `averageRating` into three ordinal classes (0: ratings from 0 to 3.33, 1: above 3.33 up to 6.66, 2: above 6.66)

In [3]:
# Bin the continuous rating into three ordinal classes:
#   0 -> [0, 3.33], 1 -> (3.33, 6.66], 2 -> above 6.66.
# Missing or negative ratings stay NaN rather than falling into class 2
# through a catch-all branch, so bad rows are visible instead of mislabelled.
# NOTE: this overwrites the column in place, so re-running the cell on the
# already-binned values (0/1/2) would map every row to class 0 -- reload `df`
# before running it again.
rating_bin_edges = [0, 3.33, 6.66, np.inf]
df['averageRating'] = pd.cut(df['averageRating'], bins=rating_bin_edges,
                             labels=False, include_lowest=True)

Labeling / encoding

In [4]:
# Replace each studio name with an integer code. The fitted encoder stays
# available as `label_encoder` so the codes can be mapped back to names.
label_encoder = LabelEncoder()
studio_codes = label_encoder.fit_transform(df['studio'])
df['studio'] = studio_codes

In [5]:
# One-hot encode the comma-separated `genres` string: split it into a list,
# re-join with '|' (the default separator of str.get_dummies) and expand it
# into one 0/1 indicator column per genre. The indicator columns are appended
# after the existing columns and the original `genres` column is removed.
genre_lists = df['genres'].str.split(',')
genre_indicators = genre_lists.str.join('|').str.get_dummies()
df = pd.concat([df.drop(columns='genres'), genre_indicators], axis=1)

Scaling

In [6]:
# Numeric features to rescale with min-max scaling; all other columns
# (including the target and the genre indicators) are copied through unchanged.
# NOTE(review): 'studio' holds label-encoded category codes at this point, so
# scaling it treats the codes as ordinal -- confirm this is intended.
columns_to_scale = [
    'releaseYear', 'runtimeMinutes', 'studio', 'lifetimeGross',
    'nrOfLanguages', 'director_nrOfMovies', 'nrOfEmployees',
    'nrOfActors', 'numVotes', 'sameYearTotalMoviesReleased',
]

# The scaler is fitted on the complete frame and kept in `scaler` for reuse.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# Work on a copy so the unscaled `df` remains available.
df_scaled = df.copy()
df_scaled[columns_to_scale] = scaled_values

In [7]:
# Persist the scaled frame (without the index column) for later stages.
df_scaled.to_csv(path_or_buf='CleanedData/dataset_scaled.csv', index=False)

### EDA

In [8]:
# Drop the isAdult column: only 4 movies in the dataset are adult titles,
# so the feature carries almost no information.
df_preprocessed = df_scaled.drop('isAdult', axis=1)