In [None]:
# !pip install --upgrade imbalanced-learn
# !pip install --upgrade scikit-learn
# !pip install --upgrade scikit-learn imbalanced-learn

In [86]:
#import python library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from preprocessing import *  
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from scoring import incr_act_top10
from scipy.stats.mstats import winsorize

#uplift model
from causalml.metrics import plot_gain
from xgboost.sklearn import XGBClassifier, XGBRegressor
from causalml.inference.meta import BaseTClassifier
from causalml.inference.meta import BaseSClassifier
from causalml.inference.meta import BaseXClassifier
from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.dataset import make_uplift_classification

In [2]:
file_path = r'/Users/marcus/Documents/ae hackathon/dataset/65d4f0fcb8af9_amex_campus_challenge_train_3.csv' 
df = pd.read_csv(file_path)

In [8]:
# train test split 
train_set, validation_set = train_test_split(df, test_size=0.2, stratify=df[['ind_recommended', 'activation']], random_state=42)
del df 

In [12]:
# evaluation 
def evaluation(train_set, validation_set, learner = BaseSClassifier(XGBClassifier()) ): 
    learner.fit(
        
        train_set.drop(columns=['activation','ind_recommended', 'merchant', 'customer'],axis=1,inplace=False).values,
        treatment=train_set['ind_recommended'].values,
        y = train_set['activation'].values 
    )

    y_pred = learner.predict(validation_set.fillna(0).drop(columns=['merchant', 'customer', 'activation','ind_recommended'],axis=1,inplace=False).values)
    incr_act = incr_act_top10(
        validation_set[["customer", "ind_recommended", "activation"]].assign(predicted_score=y_pred),
        pred_col="predicted_score"
    )
    return incr_act

# Fillna 

In [11]:
train_set.fillna(0.0, inplace = True)
validation_set.fillna(0.0, inplace = True)

# Sampling 

In [13]:
# downsampling 
def down_sampling(df,ratio=1):
    assert ratio > 0 and ratio <= 1
    positive_num = df[df['activation'] == 1].shape[0]
    return pd.concat([df[df['activation'] == 1],df[df['activation'] == 0].sample(n=positive_num * ratio,random_state=seed)]).sample(frac=1,random_state=seed)

In [14]:
# downsampleing  
train_set_recommended = train_set[train_set['ind_recommended'] == 1]
train_set_not_recommended = train_set[train_set['ind_recommended'] == 0]

train_set_not_recommended_act = train_set_not_recommended[train_set_not_recommended['activation'] == 1].sample((train_set_recommended['activation'] == 1).sum(), random_state=seed)
train_set_not_recommended_sample = pd.concat([train_set_not_recommended_act, train_set_not_recommended[train_set_not_recommended['activation'] == 0]], axis=0)
train_set_recommended = down_sampling(train_set_recommended)
train_set_not_recommended = down_sampling(train_set_not_recommended_sample)
train_total = pd.concat([train_set_recommended,train_set_not_recommended],axis=0).sample(frac=1,random_state=seed)

del train_set_not_recommended_act
del train_set_not_recommended_sample
del train_set_recommended
del train_set_not_recommended 

print('down_sampling')
print(evaluation(train_total, validation_set))

down_sampling
original input_df:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.011721
924745      173127                0           0        -0.079829
10078495    268892                1           0        -0.048569
4159840     339667                0           0        -0.035280
7333739     282513                0           0        -0.010647
...            ...              ...         ...              ...
9218771     155479                0           0        -0.002393
1320796     322805                0           0        -0.000297
4910594     293138                0           0        -0.002181
4356992      16425                0           0        -0.000064
216523      287329                0           0        -0.013891

[2445996 rows x 4 columns]
After numerical:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.011721
924745    

In [15]:
# downsampling but duplicate target label data for 3 times 
train_set_recommended = train_set[train_set['ind_recommended'] == 1] 
train_set_recommended = pd.concat([train_set_recommended] * 3, ignore_index=True)
train_set_not_recommended = train_set[train_set['ind_recommended'] == 0]

train_set_not_recommended_act = train_set_not_recommended[train_set_not_recommended['activation'] == 1].sample((train_set_recommended['activation'] == 1).sum(), random_state=seed) 
train_set_not_recommended_sample = pd.concat([train_set_not_recommended_act, train_set_not_recommended[train_set_not_recommended['activation'] == 0]], axis=0)
train_set_recommended = down_sampling(train_set_recommended)
train_set_not_recommended = down_sampling(train_set_not_recommended_sample)
train_total = pd.concat([train_set_recommended,train_set_not_recommended],axis=0).sample(frac=1,random_state=seed)

del train_set_not_recommended_act
del train_set_not_recommended_sample
del train_set_recommended
del train_set_not_recommended 

print('down_sampling with 3 time duplicate data ')
print(evaluation(train_total, validation_set))

down_sampling
original input_df:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.028963
924745      173127                0           0        -0.132116
10078495    268892                1           0        -0.027240
4159840     339667                0           0        -0.002317
7333739     282513                0           0        -0.011672
...            ...              ...         ...              ...
9218771     155479                0           0        -0.040374
1320796     322805                0           0        -0.000614
4910594     293138                0           0        -0.010766
4356992      16425                0           0        -0.000008
216523      287329                0           0        -0.004864

[2445996 rows x 4 columns]
After numerical:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.028963
924745    

In [20]:
# upsampling by SMOTE (Synthetic Minority Oversampling Technique)
def balance_classes_with_smote(df, class_column, sample_size, seed, k_neighbors=5):
    smote = SMOTE(sampling_strategy='auto', k_neighbors=k_neighbors)

    # Sample rows where activation is 0 and where activation is 1
    filtered_non_activation = df[df[class_column] == 0].sample(n=sample_size, random_state=seed)
    filtered_activation = df[df[class_column] == 1]

    # Concatenate the filtered dataframes
    df_balanced = pd.concat([filtered_non_activation, filtered_activation], axis=0)

    # Apply SMOTE to the activation data
    X_resampled, y_resampled = smote.fit_resample(df_balanced.drop([class_column], axis=1), df_balanced[class_column])

    # Combine the resampled data
    df_resampled = pd.DataFrame(X_resampled, columns=df_balanced.drop([class_column], axis=1).columns)
    df_resampled[class_column] = y_resampled

    return df_resampled

def balance_classes(df, class_column, sample_size, seed):
    filtered_non_activation = df[df[class_column] == 0].sample(n=sample_size, random_state=seed)
    filtered_activation = df[df[class_column] == 1].sample(n=sample_size, random_state=seed)

    return pd.concat([filtered_non_activation, filtered_activation], axis=0)

train_set_recommended = train_set[train_set['ind_recommended'] == 1] 
train_set_not_recommended = train_set[train_set['ind_recommended'] == 0]

# Apply balance_classes_with_smote function to train_set_recommended
train_set_recommended = balance_classes_with_smote(train_set_recommended, 'activation', 30000, seed)

# Apply balance_classes_with_smote function to train_set_not_recommended
train_set_not_recommended = balance_classes(train_set_not_recommended, 'activation', 30000, seed)

train_total = pd.concat([train_set_recommended,train_set_not_recommended],axis=0).sample(frac=1,random_state=seed)

del train_set_recommended
del train_set_not_recommended 

print('down_sampling with SMOTE')
print(evaluation(train_total, validation_set))

down_sampling with SMOTE
original input_df:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.128839
924745      173127                0           0        -0.257573
10078495    268892                1           0        -0.156109
4159840     339667                0           0        -0.035290
7333739     282513                0           0        -0.031063
...            ...              ...         ...              ...
9218771     155479                0           0        -0.033877
1320796     322805                0           0        -0.002491
4910594     293138                0           0        -0.019668
4356992      16425                0           0        -0.000062
216523      287329                0           0        -0.000406

[2445996 rows x 4 columns]
After numerical:
           customer  ind_recommended  activation  predicted_score
4581435     305086                0           0        -0.128839

# Standardization 

In [74]:
# downsampleing  
train_set_recommended = train_set[train_set['ind_recommended'] == 1]
train_set_not_recommended = train_set[train_set['ind_recommended'] == 0]

train_set_not_recommended_act = train_set_not_recommended[train_set_not_recommended['activation'] == 1].sample((train_set_recommended['activation'] == 1).sum(), random_state=seed)
train_set_not_recommended_sample = pd.concat([train_set_not_recommended_act, train_set_not_recommended[train_set_not_recommended['activation'] == 0]], axis=0)
train_set_recommended = down_sampling(train_set_recommended)
train_set_not_recommended = down_sampling(train_set_not_recommended_sample)
train_total = pd.concat([train_set_recommended,train_set_not_recommended],axis=0).sample(frac=1,random_state=seed)

del train_set_not_recommended_act
del train_set_not_recommended_sample
del train_set_recommended
del train_set_not_recommended 

In [42]:
# min max standardization 
def min_max_scaling(data, is_train = True, unchange_col= ['ind_recommended', 'activation']):
   df = data.copy()
   # Create a MinMaxScaler object
   scaler = MinMaxScaler()

   if is_train:
      # Drop the columns that should not be scaled
      df_to_scale = df.drop(unchange_col, axis=1)

      # Scale the DataFrame and create a new DataFrame with scaled values
      df_scaled = pd.DataFrame(scaler.fit_transform(df_to_scale), 
                              columns=df_to_scale.columns, 
                              index=df_to_scale.index)

      # Concatenate the unchanged columns and the scaled DataFrame
      df_scaled = pd.concat([df[unchange_col], df_scaled], axis=1)
   else:
      # Scale the entire DataFrame if not training
      df_scaled = pd.DataFrame(scaler.fit_transform(df), 
                              columns=df.columns)

   return df_scaled

In [45]:
print('min_max_scaling')
evaluation(min_max_scaling(train_total), min_max_scaling(validation_set, is_train = False)) 

min_max_scaling
original input_df:
          customer  ind_recommended  activation  predicted_score
0        0.659527              0.0         0.0        -0.079518
1        0.374261              0.0         0.0        -0.057959
2        0.581284              1.0         0.0        -0.131398
3        0.734283              0.0         0.0         0.000122
4        0.610729              0.0         0.0        -0.001650
...           ...              ...         ...              ...
2445991  0.336110              0.0         0.0         0.001001
2445992  0.697832              0.0         0.0        -0.000572
2445993  0.633698              0.0         0.0        -0.026975
2445994  0.035507              0.0         0.0        -0.000153
2445995  0.621140              0.0         0.0        -0.010703

[2445996 rows x 4 columns]
After numerical:
          customer  ind_recommended  activation  predicted_score
0        0.659527              0.0         0.0        -0.079518
1        0.374261     

0.0007800693920764446

In [49]:
# normalization 
def normalization(data, unchange_col=['ind_recommended', 'activation']):
    df = data.copy()
    scaler = StandardScaler()
    # Fit the scaler to the train set (excluding 'ind_recommended' and 'activation' columns) and transform it
    df_to_scale = df.drop(unchange_col, axis=1)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_to_scale), 
                                columns=df_to_scale.columns, 
                                index=df_to_scale.index)

    # Add back the 'ind_recommended' and 'activation' columns
    df_scaled = pd.concat([df[unchange_col], df_scaled], axis=1)


    return df_scaled

In [51]:
print('normalization')
evaluation(normalization(train_total), normalization(validation_set))

normalization
original input_df:
           customer  ind_recommended  activation  predicted_score
4581435   0.573878                0           0        -0.085568
924745   -0.409901                0           0        -0.025575
10078495  0.304045                1           0        -0.014475
4159840   0.831686                0           0        -0.027877
7333739   0.405592                0           0        -0.060609
...            ...              ...         ...              ...
9218771  -0.541470                0           0        -0.043039
1320796   0.705977                0           0        -0.009527
4910594   0.484804                0           0        -0.095166
4356992  -1.578144                0           0         0.056735
216523    0.441497                0           0        -0.135132

[2445996 rows x 4 columns]
After numerical:
           customer  ind_recommended  activation  predicted_score
4581435   0.573878                0           0        -0.085568
924745   -

0.0007506361697316886

In [None]:
# dimensional normlaization 

path = '/Users/marcus/Documents/ae hackathon/dataset/train/' 
customer_df_path = path + 'customer_df.csv'
merchant_df_path = path + 'merchant_df.csv'
customer_merchant_df_path = path + 'customer_merchant_df.csv'
customer_industry_df_mean_path = path + 'customer_industry_df_mean.csv'
customer_industry_df_std_path = path + 'customer_industry_df_std.csv'

customer_df = pd.read_csv(customer_df_path)
customer_industry_mean_df = pd.read_csv(customer_industry_df_mean_path)
customer_industry_std_df = pd.read_csv(customer_industry_df_std_path)
merchant_df = pd.read_csv(merchant_df_path)
customer_merchant_df = pd.read_csv(customer_merchant_df_path)

customer_industry_mean_df = customer_industry_mean_df.drop(customer_industry_mean_df.columns[0], axis=1)
customer_industry_std_df = customer_industry_std_df.drop(customer_industry_std_df.columns[0], axis=1)
