In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files stored there can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the case-study folder the working directory; every relative path used later is resolved against it.
cd /content/drive/My Drive/Applied\ AI\ course\ Assignments/Case\ Study\ 1

/content/drive/My Drive/Applied AI course Assignments/Case Study 1


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedKFold, KFold

from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import pickle

from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, accuracy_score, roc_auc_score, log_loss

from sklearn.metrics import mean_squared_error
from math import sqrt

import time
import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Each int16/int32/int64/float16/float32/float64 column is converted to the
    smallest integer (int8..int64) or float (float16..float64) type whose
    representable range contains the column's min and max. Range bounds are
    inclusive, so a column spanning exactly [-128, 127] becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    Only the value *range* is checked. Float columns that fit into float16 are
    stored with float16 precision (roughly 3 significant decimal digits), so
    values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. empty column whose min/max are NaN): leave as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Data Prep


In [None]:
def _load_features(csv_path):
    """Read a feature CSV (first column is the index) and downcast its numeric columns."""
    return reduce_mem_usage(pd.read_csv(csv_path, index_col=0))


train = _load_features('New_Data_processed/train_features_without_merchant.csv')
test = _load_features('New_Data_processed/test_features_without_merchant.csv')

Mem. usage decreased to 99.94 Mb (71.4% reduction)
Mem. usage decreased to 62.01 Mb (70.9% reduction)


In [None]:
# Treat +/-inf entries as missing: convert them to NaN in both frames, in place.
infinite_values = [np.inf, -np.inf]
train.replace(to_replace=infinite_values, value=np.nan, inplace=True)
test.replace(to_replace=infinite_values, value=np.nan, inplace=True)

In [None]:
# List the training columns that contain at least one NaN.
train.columns[train.isna().any()]

Index(['transactions_duration_skew', 'transactions_price_sum',
       'transactions_price_skew', 'new_transactions_duration_skew',
       'new_transactions_price_sum', 'new_transactions_price_mean',
       'new_transactions_price_min', 'new_transactions_price_max',
       'purchase_amount_ratio', 'installments_ratio'],
      dtype='object')

In [None]:
# List the test columns that contain at least one NaN.
test.columns[test.isna().any()] # 'target' shows up because it is NaN in the test set

Index(['target', 'transactions_duration_skew', 'transactions_price_sum',
       'transactions_price_skew', 'new_transactions_duration_skew',
       'new_transactions_price_sum', 'new_transactions_price_mean',
       'new_transactions_price_min', 'new_transactions_price_max',
       'new_purchase_date_diff_now', 'new_last_buy', 'installments_ratio'],
      dtype='object')

In [None]:
# Columns imputed below, each filled with its most frequent value.
MODE_IMPUTED_COLUMNS = [
    'new_purchase_date_diff_now',
    'new_last_buy',
    'transactions_duration_skew',
    'transactions_price_sum',
    'transactions_price_skew',
    'new_transactions_duration_skew',
    'new_transactions_price_sum',
    'new_transactions_price_mean',
    'new_transactions_price_min',
    'new_transactions_price_max',
    'installments_ratio',
    'purchase_amount_ratio',
]


def fill_na_with_mode(df, columns):
    """Fill NaNs in each of ``columns`` with that column's mode, modifying ``df`` in place.

    The column is reassigned (``df[col] = ...``) rather than calling
    ``df[col].fillna(..., inplace=True)``: the latter is chained assignment,
    which pandas deprecates and which does not update ``df`` under Copy-on-Write.
    When several values tie for the mode, the smallest one is used
    (``Series.mode()`` returns them sorted).
    """
    for col in columns:
        df[col] = df[col].fillna(df[col].mode()[0])


# NOTE(review): as in the original code, each frame is imputed with its OWN
# mode, so train and test may receive different fill values for the same column.
fill_na_with_mode(train, MODE_IMPUTED_COLUMNS)
fill_na_with_mode(test, MODE_IMPUTED_COLUMNS)

In [None]:
# Re-check after imputation: the result should list no training columns.
train.columns[train.isna().any()]

Index([], dtype='object')

In [None]:
# Re-check after imputation: only 'target' should still contain NaN in the test frame.
test.columns[test.isna().any()]

Index(['target'], dtype='object')

In [None]:
## Load the outputs saved by the Binary Classification Model

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Predicted labels for train and test.
train_labels = _read_pickle('Binary_Classification_predictions/predicted_labels_train.pkl')
test_labels = _read_pickle('Binary_Classification_predictions/predicted_labels_test.pkl')

# Matching predicted probabilities.
train_prob = _read_pickle('Binary_Classification_predictions/prob_labels_train.pkl')
test_prob = _read_pickle('Binary_Classification_predictions/prob_labels_test.pkl')


In [None]:
# Attach the binary classifier's outputs to each frame as two extra columns.
train = train.assign(outlier_prob=train_prob, outlier_pred=train_labels)
test = test.assign(outlier_prob=test_prob, outlier_pred=test_labels)

In [None]:
# Inspect shape, dtypes and memory of the train frame after adding the two classifier columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201917 entries, 0 to 201916
Columns: 228 entries, card_id to outlier_pred
dtypes: float16(172), float32(5), float64(11), int16(15), int32(2), int64(1), int8(21), object(1)
memory usage: 103.0+ MB


In [None]:
# Same inspection for the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123623 entries, 0 to 123622
Columns: 227 entries, card_id to outlier_pred
dtypes: float16(171), float32(8), float64(11), int16(12), int32(3), int64(1), int8(20), object(1)
memory usage: 63.9+ MB


In [None]:
# Read the pickled predictions of the full regression model for train and test.

with open('train_predictions_full_regression.pkl', mode='rb') as train_pickle:
    full_regression_train_preds = pickle.load(train_pickle)

with open('test_predictions_full_regression.pkl', mode='rb') as test_pickle:
    full_regression_test_preds = pickle.load(test_pickle)



In [None]:
# Join the full-regression predictions onto each frame by card_id.
# Later cells pair rows by position (train_pred vs. train_target, scores vs.
# card ids), so the join must not add or remove rows; fail loudly if it does.
n_train_rows, n_test_rows = len(train), len(test)

train = train.merge(full_regression_train_preds, on='card_id', how='left')
test = test.merge(full_regression_test_preds, on='card_id', how='left')

assert len(train) == n_train_rows, "merge changed the number of train rows (duplicate card_id in predictions?)"
assert len(test) == n_test_rows, "merge changed the number of test rows (duplicate card_id in predictions?)"

In [None]:
# Confirm the merge kept the row count and added the full_regression_prediction column.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201917 entries, 0 to 201916
Columns: 229 entries, card_id to full_regression_prediction
dtypes: float16(172), float32(5), float64(12), int16(15), int32(2), int64(1), int8(21), object(1)
memory usage: 104.6+ MB


In [None]:
# Same check for the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123623 entries, 0 to 123622
Columns: 228 entries, card_id to full_regression_prediction
dtypes: float16(171), float32(8), float64(12), int16(12), int32(3), int64(1), int8(20), object(1)
memory usage: 64.8+ MB


In [None]:
# Set the identifiers and the true target aside; `train` is reduced to feature columns later.
train_card_ids, train_target = train['card_id'], train['target']

In [None]:
# Keep the test identifiers, then remove the id and the (all-NaN) target from the test frame.
test_card_ids = test['card_id']
test = test.drop(columns=['card_id', 'target'])

# High Prob Model


**This would be a stacking model**

In [None]:
# Candidate pool for this model: training rows the binary classifier labelled 1.
flagged_as_outlier = train['outlier_pred'] == 1
high_prob_data = train.loc[flagged_as_outlier]
high_prob_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22831 entries, 5 to 201908
Columns: 229 entries, card_id to full_regression_prediction
dtypes: float16(172), float32(5), float64(12), int16(15), int32(2), int64(1), int8(21), object(1)
memory usage: 11.8+ MB


In [None]:
# NOTE(review): despite its name, `rare_outliers` holds the flagged rows whose
# rare_datapoints value is 0 - presumably the classifier's false positives; confirm.
has_zero_rare_flag = high_prob_data['rare_datapoints'] == 0
rare_outliers = high_prob_data.loc[has_zero_rare_flag]
rare_outliers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21242 entries, 5 to 201908
Columns: 229 entries, card_id to full_regression_prediction
dtypes: float16(172), float32(5), float64(12), int16(15), int32(2), int64(1), int8(21), object(1)
memory usage: 11.0+ MB


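So 1,589 of the 22,831 points predicted as outliers are actual outliers (`rare_datapoints == 1`); the remaining 21,242 were predicted as outliers incorrectly.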
<br>
 

In [None]:
# Keep every flagged row with rare_datapoints == 1 ...
high_prob_data = high_prob_data[high_prob_data['rare_datapoints'] == 1]
# ... and add 411 randomly chosen rows with rare_datapoints == 0
# (1589 + 411 = 2000 training rows in total).
# The draw is seeded so the training set, and every score derived from it,
# is reproducible across kernel restarts.
rare_outliers = rare_outliers.sample(411, random_state=4590)

In [None]:
# Stack the two subsets into one training frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
high_prob_data = pd.concat([high_prob_data, rare_outliers], ignore_index=True)
high_prob_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 229 entries, card_id to full_regression_prediction
dtypes: float16(172), float32(5), float64(12), int16(15), int32(2), int64(1), int8(21), object(1)
memory usage: 1.0+ MB


In [None]:
# Load the saved list of the top 150 features.
# Only the first 10 entries are kept for this model (see the slice below).

with open('top_150_features.pkl', 'rb') as f:
  top_features = pickle.load(f)

# Keep the first 10 feature names as a plain list so more names can be appended later.
top_features = list(top_features[:10])

In [None]:
# Add the outputs of the upstream models as extra (stacking) features.
# The membership check makes the cell safe to re-run: without it every re-run
# would append the names again and later column selections would contain
# duplicated columns.
for stacked_feature in ('full_regression_prediction', 'outlier_prob', 'outlier_pred'):
    if stacked_feature not in top_features:
        top_features.append(stacked_feature)

In [None]:
# Set the ids and the regression target aside, then strip the non-feature columns.
high_prob_card_ids = high_prob_data['card_id']
y_high_prob = high_prob_data['target']
non_feature_columns = ['card_id', 'target', 'rare_datapoints']
high_prob_data = high_prob_data.drop(columns=non_feature_columns)

In [None]:
# Restrict all three frames to the same feature columns, in the same order.
high_prob_data, train, test = (
    frame[top_features] for frame in (high_prob_data, train, test)
)

In [None]:
# K-fold stacking on the 2000-row high-probability sample:
#   * oof_high_pred  - out-of-fold predictions for the sample itself (used for the CV score)
#   * train_pred     - fold-averaged predictions for every training row
#   * pred_high_prob - fold-averaged predictions for every test row
# NOTE: train_pred is not out-of-fold; each fold's model is fitted on rows that
# also appear in `train`.
folds_stack = KFold(n_splits=5, shuffle=True, random_state=4590)
# Average over however many folds are configured instead of a hard-coded 5,
# so changing n_splits above cannot silently mis-scale the averages.
n_folds = folds_stack.get_n_splits()

oof_high_pred = np.zeros(high_prob_data.shape[0])
train_pred = np.zeros(train.shape[0])
pred_high_prob = np.zeros(test.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(high_prob_data, y_high_prob)):
    print("fold {}".format(fold_))
    trn_data, trn_y = high_prob_data.iloc[trn_idx], y_high_prob.iloc[trn_idx]
    val_data, val_y = high_prob_data.iloc[val_idx], y_high_prob.iloc[val_idx]

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_high_pred[val_idx] = clf_3.predict(val_data)
    train_pred += clf_3.predict(train) / n_folds
    pred_high_prob += clf_3.predict(test) / n_folds

# Out-of-fold RMSE on the sample (displayed as the cell's output).
np.sqrt(mean_squared_error(y_high_prob.values, oof_high_pred))

fold 0
fold 1
fold 2
fold 3
fold 4


13.29265833591198

In [None]:
# Out-of-fold RMSE on the high-probability sample.
oof_mse = mean_squared_error(y_high_prob.values, oof_high_pred)
cv_rmse = sqrt(oof_mse)
print("CV RMSE: {:2.5f}".format(cv_rmse))

CV RMSE: 13.29266


In [None]:
# RMSE of the fold-averaged predictions against the true target on every training row.
# Stored under its own name (this is not a cross-validation score) and called
# with the conventional (y_true, y_pred) argument order.
train_rmse = sqrt(mean_squared_error(train_target, train_pred))
print("Entire Train data RMSE: {:2.5f}".format(train_rmse))

Entire Train data RMSE: 24.78733


In [None]:
# Range of the train predictions. The ndarray methods are vectorised and
# propagate NaN, unlike the Python builtins, which loop element by element.
print(train_pred.max())
print(train_pred.min())

-21.87140376655802
-123.09560275003373


In [None]:
# Range of the test predictions. The ndarray methods are vectorised and
# propagate NaN, unlike the Python builtins, which loop element by element.
print(pred_high_prob.max())
print(pred_high_prob.min())

-21.8131266126191
-326.7762898232499


# Evaluate the models on entire train data:

In [None]:
# Pair every training card_id with its fold-averaged score.
high_prob_train = pd.DataFrame({
    'card_id': train_card_ids,
    'high_prob_score': train_pred,
})

In [None]:
# Preview the first rows of the train score table.
high_prob_train.head()

Unnamed: 0,card_id,high_prob_score
0,C_ID_92a2005557,-23.407989
1,C_ID_3d0044924f,-25.568459
2,C_ID_d639edf6cd,-25.956868
3,C_ID_186d6a6901,-24.2023
4,C_ID_cdbd2c0db2,-23.421759


In [None]:
# Pair every test card_id with its fold-averaged score.
high_prob_test = pd.DataFrame({
    'card_id': test_card_ids,
    'high_prob_score': pred_high_prob,
})

Clip scores below -33.218750 (optional; the clipping code in the next cell is currently commented out)

In [None]:
# DISABLED: would floor both score columns at -33.218750.
# The lines are commented out, so the scores evaluated and saved below are NOT clipped.
#high_prob_train['high_prob_score'] = high_prob_train['high_prob_score'].apply(lambda x: -33.218750 if x < -33.218750 else x)
#high_prob_test['high_prob_score'] = high_prob_test['high_prob_score'].apply(lambda x: -33.218750 if x < -33.218750 else x)

In [None]:
# NOTE(review): the label says "CV", but this is the RMSE of the stored scores
# over the entire training set, not an out-of-fold score.
score_mse = mean_squared_error(high_prob_train['high_prob_score'], train_target)
cv_rmse = sqrt(score_mse)
print("CV RMSE: {:2.5f}".format(cv_rmse))

CV RMSE: 24.78733


In [None]:
# Persist both score tables as pickles.
for score_table, out_path in (
    (high_prob_train, 'Low_and_High_prob_predictions/high_prob_train_predictions.pkl'),
    (high_prob_test, 'Low_and_High_prob_predictions/high_prob_test_predictions.pkl'),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(score_table, out_file)