## 1. Load the packages

In [1]:
# Data processing packages
import numpy as np
import pandas as pd
from collections import Counter

# Machine learning packages
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.pipeline import Pipeline
import torch

# Visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# Others
import time

## 2. Read the data

In [2]:
# Feature table; its first row is used as the header (pandas default).
X1 = pd.read_csv("X1.csv")
# Target values, read without a header row and given an explicit column name.
# NOTE(review): the column name 'revenue ' has a trailing space - confirm this is
# intentional before accessing the column elsewhere.
Y1 = pd.read_csv("Y1.csv", header=None, names=['revenue '])
# X2 = pd.read_csv("X2.csv")

## 3. Data Preprocessing

### STEP 1: Data Cleaning (dropping unused columns and handling missing values)

In [3]:
def data_cleaning_process(df):
    """
    Clean a raw feature dataframe without modifying the input.

    Missing values are encoded in the data as the literal string "\\N":
      * `runtime`: missing entries are replaced by the median of the known runtimes,
        and the column is converted to int64 (so a fractional median is truncated);
      * `genres`: missing entries are replaced by "Others".
    The columns "Unnamed: 0", "title", "img_url" and "description" are dropped.
    :param df: A dataframe (X1 or X2)
    :return: A new cleaned dataframe
    """
    missing_token = '\\N'
    cleaned = df.copy()

    # runtime: the median is computed only over rows that actually have a value
    runtime = cleaned['runtime']
    observed_runtime = cleaned.loc[runtime != missing_token, 'runtime'].astype(np.int64)
    runtime_median = np.median(observed_runtime)
    filled_runtime = np.where(runtime == missing_token, runtime_median, runtime)
    cleaned['runtime'] = filled_runtime.astype(np.int64)

    # genres: use a placeholder category for missing entries
    genres_missing = cleaned['genres'] == missing_token
    cleaned.loc[genres_missing, "genres"] = "Others"

    # columns that are not used as features
    unused_columns = ["Unnamed: 0", "title", "img_url", "description"]
    return cleaned.drop(columns=unused_columns)

In [4]:
# X1_cleaned = data_cleaning_process(X1)

### STEP 2: Data Type Split (Numerical, Categorical, Embeddings)

In [5]:
def data_type_split(df):
    """
    Split a dataframe into three sub-dataframes according to the kind of its columns.

    Numeric-dtype columns form the numerical part, except `is_adult`, which is moved
    to the end of the categorical part. Non-numeric columns form the categorical part,
    except `img_embeddings` and `text_embeddings`, which form the embeddings part.
    :param df: A dataframe
    :return: three dataframes, which are numerical, categorical, embeddings
    """
    frame = df.copy()
    embedding_features = ['img_embeddings', 'text_embeddings']

    # `is_adult` has a numeric dtype but is handled as a categorical feature
    numeric_features = frame.select_dtypes(include="number").columns.tolist()
    numeric_features.remove('is_adult')

    categorical_features = frame.select_dtypes(exclude="number").columns.tolist()
    for embedding_column in embedding_features:
        categorical_features.remove(embedding_column)
    categorical_features.append('is_adult')

    numeric_part = frame.loc[:, numeric_features]
    categorical_part = frame.loc[:, categorical_features]
    embedding_part = frame.loc[:, embedding_features]
    return numeric_part, categorical_part, embedding_part

In [6]:
# df_num, df_cat, df_emb = data_type_split(X1_cleaned)

In [7]:
# df_num

### STEP 3: Categorical Columns Processing (Genres --> multilabel binary type, Studio --> studio_frequency)

In [8]:
# Lookup table used by categorical_process() to map a studio name to its frequency.
# NOTE(review): torch.load unpickles the file, so "studio_freq" must come from a trusted source.
dict_cat_freq = torch.load("studio_freq")

In [9]:
def categorical_process(df):
    """
    This function will process on `genres` and `studio` columns.
    `genres` (a comma-separated string) will be transformed to multilabel binary variables,
    one column per genre;
    `studio` will be transformed to a frequency type, looked up in the module-level
    `dict_cat_freq`; a studio that is not in the table gets the smallest frequency of the
    table (0 if the table is empty);
    `is_adult` is passed through unchanged.
    The input dataframe is not modified, and the result keeps the index of `df`.
    :param df: A categorical dataframe with `genres`, `studio` and `is_adult` columns
    :return: A new dataframe with the genre columns, `studio_freq` and `is_adult`
    """

    # processing on `genres` column
    genres_split = df['genres'].apply(lambda x: x.split(","))
    # NOTE(review): the binarizer is re-fitted on every call, so the set of genre columns
    # depends on the genres present in `df` - confirm this is acceptable for new data.
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(genres_split)
    # Reuse the index of `df`: with a default RangeIndex here, the concat below would
    # misalign rows and insert NaNs whenever `df` is not indexed 0..n-1.
    genre_encoder_df = pd.DataFrame(
        genre_matrix,
        columns=mlb.classes_.tolist(),
        index=df.index,
    )

    # processing on `studio` column
    # The fallback for unknown studios is computed once instead of once per row.
    fallback_freq = min(dict_cat_freq.values(), default=0)
    studio_freq = df['studio'].apply(lambda x: dict_cat_freq.get(x, fallback_freq))
    studio_freq = studio_freq.rename('studio_freq')

    processed_cat_df = pd.concat([genre_encoder_df, studio_freq, df['is_adult']], axis=1)

    return processed_cat_df

In [10]:
# categorical_process(df_cat)

### STEP 4: Embedding Column Processing

In [11]:
def listToDF(df, column_name):
    """
    Expand a column of stringified sequences into a dataframe.

    Each cell of `df[column_name]` is evaluated as a Python expression and its elements
    become one row of the result, one element per column.
    :param df: A dataframe
    :param column_name: Name of the column holding the stringified sequences
    :return: A new dataframe with a default integer index and integer column labels
    """
    # NOTE(security): eval() executes arbitrary code contained in the cell. Only use this on
    # trusted data; consider ast.literal_eval if the cells are plain Python literals.
    # A plain loop (not a comprehension) keeps eval() running in this function's scope.
    parsed_rows = []
    for raw_cell in df[column_name]:
        parsed_rows.append(list(eval(raw_cell)))

    return pd.DataFrame(parsed_rows)

In [12]:
def embedding_process(df):
    """
    Expand the `img_embeddings` and `text_embeddings` columns into one column per element.
    :param df: A dataframe with `img_embeddings` and `text_embeddings` columns
    :return: A new dataframe with the image embedding columns followed by the text
             embedding columns
    """
    source = df.copy()

    # image embeddings first, then text embeddings
    expanded_parts = [
        listToDF(source, embedding_column)
        for embedding_column in ('img_embeddings', 'text_embeddings')
    ]

    return pd.concat(expanded_parts, axis=1)

In [13]:
# embedding_process(X1.loc[:, ['img_embeddings', 'text_embeddings']])

### STEP 5: Combine Everything

In [14]:
def data_combine(df_num, df_cat, df_emb):
    """
    Concatenate the numerical, categorical and embedding dataframes side by side.
    :param df_num: Numerical dataframe
    :param df_cat: Processed categorical dataframe
    :param df_emb: Processed embeddings dataframe
    :return: A new dataframe with the columns of the three inputs, in that order
    """
    parts = [df_num, df_cat, df_emb]
    return pd.concat(parts, axis=1)

In [15]:
# df_processed = data_combine(df_num, categorical_process(df_cat), df_emb)

In [16]:
# df_processed

### STEP 6: Normalization and Standardization

In [17]:
def data_scaling(df):
    """
    This function will scale the numerical columns, which are selected by position.
    Column 0 (expected to be `ratings`) is normalized with a min-max scaler;
    columns 1 to 4 are standardized. Any further columns are left unchanged.
    Both scalers are fitted on the data passed in.
    :param df: A dataframe whose first five columns are the numerical features
    :return: A NumPy array with the same shape as `df`
    """
    scaled = df.copy().to_numpy()

    # normalization of the first column (the scaler expects a 2-D input)
    first_column = scaled[:, 0].reshape([-1, 1])
    scaled[:, 0] = MinMaxScaler().fit_transform(first_column).ravel()

    # standardization of the next four columns
    scaled[:, 1:5] = StandardScaler().fit_transform(scaled[:, 1:5])

    return scaled

In [18]:
# data_scaling(df_processed)

### Construct Data Engineering Pipeline

In [19]:
def DataEngineering(df):
    """
    Run the full preprocessing chain on a raw feature dataframe:
    cleaning, split by data type, categorical and embedding processing,
    recombination, and scaling.
    :param df: A raw dataframe (X1 or X2)
    :return: The array returned by `data_scaling`
    """
    numeric_part, categorical_part, embedding_part = data_type_split(data_cleaning_process(df))

    combined = data_combine(
        numeric_part,
        categorical_process(categorical_part),
        embedding_process(embedding_part),
    )

    return data_scaling(combined)

In [20]:
# Wrap DataEngineering as a scikit-learn transformer so it can be used as a pipeline step.
preprocess_transformer = FunctionTransformer(DataEngineering)

In [21]:
# Pipeline with a single step, named 'Preprocessor', that applies preprocess_transformer.
p1 = Pipeline([
    ('Preprocessor', preprocess_transformer)
])

In [22]:
# Run the preprocessing pipeline on X1; the result is the array returned by DataEngineering.
X1_ready = p1.fit_transform(X1)

In [23]:
X1_ready

array([[ 0.60674157,  0.80919972,  1.11135356, ...,  0.99988544,
        -0.49546754,  0.95906293],
       [ 0.76404494, -0.27177631, -0.07338902, ...,  0.9998001 ,
        -0.5089115 ,  0.9546743 ],
       [ 0.53932584, -0.25625834, -1.59662947, ...,  0.9999524 ,
        -0.45857945,  0.9761356 ],
       ...,
       [ 0.73033708, -0.27150875,  0.34973333, ...,  0.99994576,
        -0.3214418 ,  0.97996914],
       [ 0.71910112,  0.04658617,  0.77285568, ...,  0.9999413 ,
        -0.3090013 ,  0.9727902 ],
       [ 0.33707865, -0.24519569,  0.26510886, ...,  0.9998354 ,
        -0.75486994,  0.9661582 ]])