# Exploratory Data Analysis of [Full IMDb Movies Data](https://www.kaggle.com/datasets/anandshaw2001/imdb-data)

Install all necessary dependencies using pip

In [None]:
%pip install -r requirements.txt

Import needed packages

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from tqdm import tqdm
import joblib
import kagglehub


Download the original dataset from [Kaggle](https://www.kaggle.com/datasets/anandshaw2001/imdb-data)

In [None]:
# Fetch the dataset through kagglehub; the returned value is used below as the
# directory prefix of the CSV file.
original_datatest_path = kagglehub.dataset_download("anandshaw2001/imdb-data")
print(f"Original dataset path: {original_datatest_path}")

In [None]:
# Load the movie table from the downloaded location and preview the first rows.
original_dataset = pd.read_csv(
    f"{original_datatest_path}/Imdb Movie Dataset.csv",
    encoding="utf-8"
)
original_dataset.head()

`vote_average` is the target column. To prepare the dataset for classification of the rating, convert the column to `int`

In [None]:
# Truncate the rating to an integer so that it can serve as a class label.
original_dataset['target'] = original_dataset['vote_average'].apply(int)
# `target` is computed directly from `vote_average`; leaving that column in the
# frame would hand the label to every model as a feature (target leakage),
# because the feature matrices are later built by dropping only `target`.
original_dataset.drop(columns='vote_average', inplace=True)

Analyze the dataset

In [None]:
original_dataset.info()

Delete all columns containing text attributes in order to continue using standard approaches for tabular data.

In [None]:
# Text columns that are excluded so the standard tabular workflow can be used.
text_columns = [
    "title",
    "imdb_id",
    "original_language",
    "original_title",
    "overview",
    "tagline",
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
    "keywords",
]
original_dataset.drop(columns=text_columns, inplace=True)

In [None]:
original_dataset.head(10)

Show columns with `nan` values 

In [None]:
original_dataset.isna().sum()

Remove duplicates and rows with missing values

In [None]:
original_dataset.duplicated().sum()

In [None]:
# Drop rows that contain any missing value, then drop exact duplicate rows.
# Both calls modify `original_dataset` in place.
original_dataset.dropna(inplace=True)
original_dataset.drop_duplicates(inplace=True)

In [None]:
original_dataset.info()

## Analysis of numeric features and the target variable.

### Target Variable

Build distribution of the target variable

In [None]:
original_dataset['target'].value_counts()

In [None]:
sns.histplot(data=original_dataset, x='target', stat='proportion')
plt.show()

The target variable contains a serious imbalance. Let's perform a simple balancing of the dataset, reducing the volume of classes 0, 5, 6 to 25,000 objects (since in this laboratory the requirement for the volume of the dataset is >100,000 objects and we can sacrifice the volume in order not to create a large amount of artificial data).

In [None]:
# Classes to down-sample and the maximum number of rows kept for each of them.
target_counts = [0, 5, 6]
max_per_class = 25000

# Seeded generator so that the sampled subset is the same on every run, in line
# with the fixed random_state used by the splits elsewhere in the notebook.
rng = np.random.default_rng(42)
labels = original_dataset["target"].to_numpy()

sampled_positions = []
for target in target_counts:
    # Positional (iloc-style) row numbers of the current class.
    indices = np.flatnonzero(labels == target)
    # Never ask for more rows than the class has; sampling without replacement
    # would raise otherwise.
    sample_size = min(max_per_class, indices.size)
    sampled_positions.append(rng.choice(indices, size=sample_size, replace=False))

# Positional indices of all sampled rows, in the order of `target_counts`.
new_indices = np.concatenate(sampled_positions)

In [None]:
# Rows of the classes that are not down-sampled are kept in full.
mask = ~original_dataset["target"].isin(target_counts)
# `new_indices` are positional, hence `iloc`; the sampled rows come first,
# followed by the untouched classes, with a fresh 0..n-1 index.
train_data = pd.concat(
    [original_dataset.iloc[new_indices], original_dataset[mask]],
    ignore_index=True,
)
# Every later cell (plots, feature engineering, train/test split) reads
# `original_dataset`, so the balanced frame has to replace it; otherwise the
# balancing above has no effect on the rest of the notebook.
original_dataset = train_data

In [None]:
original_dataset["target"].value_counts()

In [None]:
sns.histplot(data=original_dataset, x='target', stat='proportion')
plt.show()

As a result, we have a more uniform distribution of the target classes.

## Feature `adult`

In [None]:
original_dataset["adult"].value_counts()

Convert the feature to `int` type

In [None]:
original_dataset['adult'] = original_dataset['adult'].astype(int)

## Feature `status`

In [None]:
sns.histplot(data=original_dataset, x='status', stat='proportion')
plt.ylim((0,1))
plt.xticks(rotation=45)
plt.show()

Despite the fact that the distribution of the feature is uneven, we will leave it in order to further analyze its significance.

## Feature `release_date`

In [None]:
original_dataset["release_date"].value_counts()

Unifying the date type

In [None]:
# Parse the release dates; values that cannot be parsed are coerced to NaT.
original_dataset["release_date"] = pd.to_datetime(original_dataset["release_date"], errors="coerce")
# A bare expression in the middle of a cell is never displayed, so print the
# number of unparseable dates explicitly before those rows are removed.
unparsed_dates = original_dataset["release_date"].isnull().sum()
print(f"Rows with an unparseable release_date: {unparsed_dates}")
original_dataset.dropna(inplace=True)

In my opinion, the month and day of the film's release are mostly noise, so let's keep only a feature with the film's release year.

In [None]:
# Keep only the year of the release date and remove the full date column.
release_dates = original_dataset["release_date"]
original_dataset["release_year"] = release_dates.dt.year
del original_dataset["release_date"]

# Model selection

In [None]:
# Split the data 70/30 with a fixed seed. The KFold experiments (model and
# feature selection) run on the smaller 30% part, `test_dataset`; the final
# model is trained on the 70% part, `train_dataset`.
train_dataset, test_dataset = train_test_split(original_dataset, test_size=0.3, random_state=42)

Using `KFold` cross-validation (the cell below uses `n_splits=2`), we select the model. We will use `OneHotEncoder` to encode the categorical feature.

| Models |
|----------|
| `LogisticRegression` |
| `DecisionTreeClassifier` |
| `RandomForestClassifier` |
| `GradientBoostingClassifier` |


In [None]:
# Label vector (as a NumPy array) and feature frame for the selection experiments.
y = test_dataset['target'].to_numpy(copy=True)
x = test_dataset.drop(columns=['target'])

In [None]:
x.shape

In [None]:
# Candidate estimators, stored as zero-argument callables so that a fresh model
# is created for every fold.
models = {
    'LogisticRegression': lambda: LogisticRegression(max_iter=500),
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier
}

kf = KFold(n_splits=2, shuffle=True, random_state=42)

# One-hot encode `status`; all other columns are passed through unchanged.
encoder_method = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
column_transformer = ColumnTransformer(
    [('ohe', encoder_method, ['status'])],
    remainder="passthrough",
)

# Per-fold weighted F1 scores, keyed by model name.
metrics = {model_name: [] for model_name in models}

for train_index, val_index in kf.split(x):
    x_train, x_val = x.iloc[train_index, :], x.iloc[val_index, :]
    y_train, y_val = y[train_index], y[val_index]

    for model_name, model_creator in models.items():
        pipeline = Pipeline(steps=[
            ('ohe', column_transformer),
            ('scaling', StandardScaler()),
            ('model', model_creator()),
        ])
        pipeline.fit(x_train, y_train)

        predictions = pipeline.predict(x_val)
        f1 = f1_score(y_val, predictions, average='weighted')
        metrics[model_name].append(f1)

In [None]:
# Replace each model's list of fold scores with its mean, then show the models
# ranked from best to worst.
metrics = {experiment: np.mean(scores) for experiment, scores in metrics.items()}
summary = pd.DataFrame.from_dict(metrics, orient='index')
summary = summary.rename(columns={0: 'f1 score'})
summary.sort_values(by='f1 score', ascending=False)

We will use the 2 best models: `GradientBoostingClassifier` and `DecisionTreeClassifier`.

# Feature selection

In [None]:
# Fit a fresh one-hot encoder for `status` and rebuild `x` as a DataFrame whose
# columns carry the transformer's output names.
status_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
column_transformer = ColumnTransformer(
    [('ohe', status_encoder, ['status'])],
    remainder="passthrough",
)
encoded_x = column_transformer.fit_transform(x, y)
x = pd.DataFrame(encoded_x, columns=column_transformer.get_feature_names_out())

In [None]:
x.shape

At the moment, we have quite a large number of features. Let's try to select the most important ones by the weights of a logistic regression.

In [None]:
# Fit a scaled logistic regression on all encoded features to obtain weights
# that are used as feature importances.
pipeline = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('regression', LogisticRegression(max_iter=500)),
])
pipeline.fit(x, y)
# NOTE(review): for a multiclass target `coef_` holds one row per class, so
# row 0 contains the weights of the first class only — confirm that ranking
# features by this single row is intended.
coef = pipeline.named_steps['regression'].coef_[0]

We visualize the absolute values of the weights of the logistic regression.

In [None]:
sns.barplot(data=pd.DataFrame.from_dict({'importance': np.abs(coef), 'feature': x.columns}), x='feature', y='importance')
plt.xticks(rotation=90)
plt.show()

We will select features that have $importance > C$, where $C$ is a threshold. Using `KFold` (k=10) cross-validation, we will find such a threshold.

In [None]:
# Candidate thresholds: every absolute weight except the largest one.
importance = np.abs(coef)
f1 = {threshold: [] for threshold in sorted(importance)[:-1]}
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for train_index, val_index in tqdm(kf.split(x)):
    y_train, y_val = y[train_index], y[val_index]

    for C, fold_scores in f1.items():
        # Keep only the columns whose absolute weight exceeds the threshold.
        selected = importance > C
        X_train = x.iloc[train_index, selected]
        X_val = x.iloc[val_index, selected]

        pipeline = Pipeline(steps=[
            ('scaling', StandardScaler()),
            ('regression', LogisticRegression(max_iter=500, solver='lbfgs')),
        ])
        pipeline.fit(X_train, y_train)

        fold_scores.append(f1_score(y_val, pipeline.predict(X_val), average='weighted'))

# Average the fold scores for every threshold.
f1 = {C: np.mean(fold_scores) for C, fold_scores in f1.items()}

# The 'f2-score' column holds the weighted F1 values computed above; the label
# is kept because the plotting cell reads the column by this name.
f1 = pd.DataFrame.from_dict(f1, orient='index').reset_index().rename(columns={'index': 'C', 0: 'f2-score'})

print(f1)

In [None]:
sns.lineplot(data=f1, x='C', y='f2-score')
plt.show()

It can be seen that at $0<C<0.3$ chaotic fluctuations occur. Let's take the threshold of $C=0.3$, after which the quality of the model begins to deteriorate.

In [None]:
x = x.iloc[:, np.abs(coef)>0.3]

In [None]:
x.columns

# Model training

Since we have selected the best models and selected the features, we can see what quality we get now.

In [None]:
# The two shortlisted estimators, compared on the selected features.
models = {
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier
}

# Per-fold weighted F1 scores, keyed by model name.
metrics = {model: [] for model in models}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, val_index in kf.split(x):
    # The split does not depend on the model, so slice once per fold.
    X_train, X_val = x.iloc[train_index, :], x.iloc[val_index, :]
    y_train, y_val = y[train_index], y[val_index]

    for model, model_class in models.items():
        pipeline = Pipeline(steps=[
            ('scaling', StandardScaler()),
            ('classifier', model_class()),
        ])
        pipeline.fit(X_train, y_train)

        f1 = f1_score(y_val, pipeline.predict(X_val), average='weighted')
        metrics[model].append(f1)

In [None]:
# Replace each model's list of fold scores with its mean, then show the models
# ranked from best to worst.
metrics = {experiment: np.mean(scores) for experiment, scores in metrics.items()}
summary = pd.DataFrame.from_dict(metrics, orient='index')
summary = summary.rename(columns={0: 'f1 score'})
summary.sort_values(by='f1 score', ascending=False)

# We are training the final GradientBoostingClassifier model, which proved to be the best

In [None]:
# Hold out 20% of the training part to evaluate the final model.
final_train_dataset, final_test_dataset = train_test_split(
    train_dataset,
    test_size=0.2,
    random_state=42,
)

# Separate the features from the label for both parts.
x_train, y_train = (
    final_train_dataset.drop(columns='target'),
    np.array(final_train_dataset['target']),
)
x_test, y_test = (
    final_test_dataset.drop(columns='target'),
    np.array(final_test_dataset['target']),
)

In [None]:
x_train.head()

In [None]:
# Fit the `status` one-hot encoder on the training features only and apply it
# to both splits; the other columns are passed through.
encoder_method = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
column_transformer = ColumnTransformer(
    [('ohe', encoder_method, ['status'])],
    remainder="passthrough",
)
column_transformer.fit(x_train, y_train)

encoded_train = column_transformer.transform(x_train)
x_train = pd.DataFrame(encoded_train, columns=column_transformer.get_feature_names_out())

encoded_test = column_transformer.transform(x_test)
x_test = pd.DataFrame(encoded_test, columns=column_transformer.get_feature_names_out())

In [None]:
x_train.head()

In [None]:
# Encoded columns excluded from the final model. The names use the prefixes of
# the ColumnTransformer output (`ohe__` for the one-hot `status` columns,
# `remainder__` for the passed-through columns).
# NOTE(review): the list is hard-coded — presumably it mirrors the result of
# the weight-threshold selection above; verify it whenever the data changes.
columns_to_remove = ["ohe__status_In Production", "ohe__status_Planned", "ohe__status_Rumored", "remainder__runtime", "remainder__adult", "remainder__release_year"]
x_train.drop(columns=columns_to_remove, inplace=True)
x_test.drop(columns=columns_to_remove, inplace=True)

In [None]:
# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler().fit(x_train)

x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Train the final gradient-boosting model, created with its default
# parameters, on the scaled training features.
classifier = GradientBoostingClassifier()

classifier.fit(x_train_scaled, y_train)

In [None]:
import os

# Output locations of the three fitted artifacts.
encoder_filename = 'model/encoder.joblib'
scaler_filename = 'model/scaler.joblib'
model_filename = 'model/classifier.joblib'

# `exist_ok=True` creates the directory when needed and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs('model', exist_ok=True)

# Persist the encoder, the scaler and the classifier. When they are reused,
# the columns listed in `columns_to_remove` have to be dropped between the
# encoder and the scaler, exactly as was done for the training data.
joblib.dump(column_transformer, encoder_filename)
joblib.dump(scaler, scaler_filename)
joblib.dump(classifier, model_filename)

In [None]:
# Weighted F1 of the final model on the held-out part of the training data.
test_predictions = classifier.predict(x_test_scaled)
f1 = f1_score(y_test, test_predictions, average='weighted')
print(f"F1 Score: {f1}")