# Preprocessing


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Random train/test split: hold out 33% of the rows for testing; the fixed
# random_state keeps the split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

one_hot_features = ['relationship', 'race', 'occupation', 'marital-status',
                    'sex', 'workclass']

# Fit on the training split only. With handle_unknown='ignore', a category
# that was not seen during fit is encoded as an all-zero row instead of
# raising.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[one_hot_features])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases. Passing the input feature
# names yields columns such as 'sex_<category>' in both cases.
if hasattr(enc, 'get_feature_names_out'):
    one_hot_columns = enc.get_feature_names_out(one_hot_features)
else:
    one_hot_columns = enc.get_feature_names(one_hot_features)

# Reuse the source frame's index for the encoded frame. After a shuffled
# split the index is no longer 0..n-1, and pd.concat(axis=1) aligns on index
# labels: a fresh RangeIndex would misalign the rows and introduce NaNs.
X_train_enc_oh = pd.DataFrame(
    enc.transform(X_train[one_hot_features]).toarray(),
    columns=one_hot_columns,
    index=X_train.index,
)
X_test_enc_oh = pd.DataFrame(
    enc.transform(X_test[one_hot_features]).toarray(),
    columns=one_hot_columns,
    index=X_test.index,
)

# Swap the raw categorical columns for their one-hot expansion.
X_train = pd.concat([X_train.drop(columns=one_hot_features), X_train_enc_oh], axis=1)
del X_train_enc_oh

X_test = pd.concat([X_test.drop(columns=one_hot_features), X_test_enc_oh], axis=1)
del X_test_enc_oh


In [None]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

ordinal_features = ['education']
# Category order, lowest code first (the strings keep their leading space).
# NOTE(review): ' Prof-school' is ranked below the associate degrees and
# ' Some-college' above them -- confirm this ordering is intended.
education_order = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th',
                   ' 10th', ' 11th', ' 12th',  ' HS-grad', ' Prof-school',
                   ' Assoc-acdm', ' Assoc-voc', ' Some-college', ' Bachelors',
                   ' Masters', ' Doctorate']

encoder = OrdinalEncoder(categories=[education_order])
# Fit on the same DataFrame slice that is later transformed, so fit and
# transform see the same kind of input (previously fit received a bare
# ndarray while transform received a DataFrame).
encoder.fit(X_train[ordinal_features])

# Keep the source index so the encoded frames stay row-aligned with
# X_train / X_test when they are joined back or compared.
X_train_enc_ord = pd.DataFrame(encoder.transform(X_train[ordinal_features]),
                               columns=ordinal_features,
                               index=X_train.index)
X_test_enc_ord = pd.DataFrame(encoder.transform(X_test[ordinal_features]),
                              columns=ordinal_features,
                              index=X_test.index)

In [None]:
# Standardize one column in place (z-score): subtract the column mean and
# divide by the column standard deviation as computed by pandas' `std()`.
# NOTE(review): a constant column has sigma == 0 and would produce NaN/inf.
mue, sigma = df[feature].mean(), df[feature].std()
df[feature] = df[feature].sub(mue).div(sigma)

In [None]:
# Dimensionality Reduction with PCA

from sklearn.decomposition import PCA

def plot_PCA_curve(X_train):
    """Plot the explained-variance profile of a full PCA fitted on X_train.

    Two curves are drawn on one figure, both against the 1-based component
    number: the cumulative explained-variance ratio and the per-component
    explained-variance ratio.

    Args:
        X_train: data accepted by ``PCA.fit``.

    Returns:
        None. The curves are drawn on a new matplotlib figure.
    """
    pca = PCA()
    pca.fit(X_train)
    ratios = pca.explained_variance_ratio_
    # Shared 1-based x positions so that both curves line up on the same
    # component number (previously one curve started at 0, the other at 1).
    component_numbers = range(1, len(ratios) + 1)

    plt.figure(figsize=(7, 7))
    plt.plot(component_numbers, np.cumsum(ratios),
             label='cumulative explained variance')
    plt.plot(component_numbers, ratios,
             label='explained variance per component')
    plt.xlabel('number of components')
    # The original called xlabel twice, so the y axis was never labelled.
    plt.ylabel('fraction of variance explained')
    plt.title('Percentage of variance explained by given component')
    plt.legend()


def get_n_pca_components(n, X_train, X_test):
    """Project both splits onto a PCA with ``n_components=n``.

    The PCA is fitted on ``X_train`` only and then applied to both splits.

    Args:
        n: value forwarded to ``PCA(n_components=...)``.
        X_train: data used to fit the PCA and then transformed.
        X_test: data transformed with the PCA fitted on ``X_train``.

    Returns:
        Tuple ``(X_train_t, X_test_t)`` with the transformed splits.
    """
    reducer = PCA(n_components=n)
    reducer.fit(X_train)
    return reducer.transform(X_train), reducer.transform(X_test)

In [None]:
import scipy
from tqdm import tqdm

def map_features(df, features_for_embedding, threshold=0.99):
    """Integer-encode categorical columns, pooling rare values into 'OOV'.

    For each column, values are ranked by descending relative frequency and
    the cumulative share is computed. Every value whose cumulative share
    exceeds ``threshold`` is replaced by the string ``'OOV'``. ``'OOV'`` is
    encoded as 0; the remaining values receive the codes 1, 2, ... in order
    of first appearance in the column.

    NOTE: the value that first pushes the cumulative share above
    ``threshold`` is itself replaced, however frequent it is.

    Args:
        df: input frame; it is not modified (a deep copy is encoded).
        features_for_embedding: iterable of column labels to encode.
        threshold: cumulative relative-frequency cut-off.

    Returns:
        Tuple ``(data_mapped, lookup_dict)``: the encoded copy of ``df`` and
        a dict ``{column: {value: code}}`` including the ``'OOV'`` entry.
    """
    lookup_dict = {}
    data_mapped = df.copy(deep=True)

    for col in tqdm(features_for_embedding):
        column = data_mapped[col]

        # Values in the tail of the frequency distribution become 'OOV'.
        cumulative_share = column.value_counts(normalize=True).cumsum()
        rare_values = cumulative_share[cumulative_share > threshold].index
        # `where` builds a new Series, so putting a string into a numeric
        # column does not depend on an in-place dtype-changing assignment.
        column = column.where(~column.isin(rare_values), 'OOV')

        # Consecutive integer codes; 0 is reserved for 'OOV'. Plain ints are
        # used (instead of the floats returned by rank computations) so the
        # codes are usable as indices.
        kept_values = [value for value in column.unique() if value != 'OOV']
        value_to_code = {value: code
                         for code, value in enumerate(kept_values, start=1)}
        value_to_code['OOV'] = 0

        # Assign back under the original label (not its string form), so
        # non-string column labels are overwritten rather than duplicated.
        data_mapped[col] = column.map(value_to_code)
        lookup_dict[col] = value_to_code

    return data_mapped, lookup_dict

def map_test(df_test: pd.DataFrame, lookup_dict: dict) -> pd.DataFrame:
    """Encode a frame with a lookup built by ``map_features``.

    Only the columns present as keys of ``lookup_dict`` are returned. A value
    that has no entry in a column's mapping is encoded as 0, the code that
    ``map_features`` assigns to ``'OOV'``.

    Args:
        df_test: frame containing every column named in ``lookup_dict``.
        lookup_dict: ``{column: {value: code}}``.

    Returns:
        A new DataFrame with one encoded column per key of ``lookup_dict``.
    """
    mapped_columns = {}
    for key, mapping in tqdm(lookup_dict.items()):
        # Read from the `df_test` argument; the original referenced the
        # undefined name `df_test_t`.
        mapped_columns[key] = df_test[key].apply(
            lambda value, mapping=mapping: mapping.get(value, 0))
    return pd.DataFrame(mapped_columns)
            