# **Importing the Libraries**

In [1]:
pip install "dask[dataframe]" --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc 
import warnings
import dask.dataframe as dd
from matplotlib.pyplot import figure
sns.set()
warnings.filterwarnings("ignore")

**To optimize the memory usage**

In [3]:
def adjust_datatype(df):
    """
    Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Every column returned by ``df.select_dtypes(include=['int'])`` is converted
    to the narrowest of int8 / int16 / int32 whose range contains both the
    column minimum and maximum; columns that fit none of them are stored as
    int64.  Every column returned by ``df.select_dtypes(include=['float'])``
    is converted to float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.  Columns are reassigned on this object and nothing
        is returned.

    Example
    -------
    A column whose values all lie in the range -128..127 is stored as int8.

    Reference
    ---------
    https://www.kaggle.com/jeru666/did-you-think-of-these-features
    """
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        # Reduce the column once instead of once per candidate dtype.
        col_min = df[col].min()
        col_max = df[col].max()
        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            # Compare the reduced values with the bounds (the comparison must
            # happen after the reduction, not inside it).
            if limits.min <= col_min and col_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No narrower type fits (this is also the path taken when the
            # min/max comparisons are all false, e.g. for an empty column).
            df[col] = df[col].astype(np.int64)
    float_cols = list(df.select_dtypes(include=['float']).columns)
    for col in float_cols:
        df[col] = df[col].astype(np.float32)
        
        
        
        
        
def restart_kernel():
    """
    Ask the notebook front end to restart the kernel, which discards every
    variable currently held in memory.

    The request is made by emitting a ``<script>`` element that calls
    ``Jupyter.notebook.kernel.restart()``.
    NOTE(review): that JavaScript object is presumably only provided by the
    classic Jupyter Notebook UI - confirm it exists in the target front end.

    After the restart the notebook's import cell has to be run again: imports
    made inside this function would only bind function-local names, so they
    cannot repopulate the notebook namespace.

    Reference:
    https://stackoverflow.com/questions/37751120/restart-ipython-kernel-with-a-command-from-a-cell
    """
    from IPython.display import HTML, display
    import seaborn as sns
    import warnings

    # A bare HTML(...) expression inside a function body is never rendered;
    # it must be handed to display() for the script to reach the browser.
    display(HTML("<script>Jupyter.notebook.kernel.restart()</script>"))

    # Re-apply the plotting theme and warning filter used by the import cell.
    sns.set()
    warnings.filterwarnings("ignore")

In [4]:
# Read the combined dataset from Drive
combined = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KKbox_data/AM_Datasets/combined_AM_3.csv'
)

**Preparing the data for modeling**

In [5]:
import re
# Strip the '-' separators and store the date as an integer.
# Vectorised string ops replace the per-row Python lambda; the replacement is
# a literal one, so no regular expression is needed.
combined['transaction_date'] = (
    combined['transaction_date']
    .astype(str)
    .str.replace('-', '', regex=False)
    .astype('int64')
)

In [6]:
# Strip the '-' separators from both date columns and store them as integers.
# Vectorised string ops replace the per-row Python lambda; the replacement is
# a literal one, so no regular expression is needed.
for date_col in ('membership_expire_date', 'date'):
    combined[date_col] = (
        combined[date_col]
        .astype(str)
        .str.replace('-', '', regex=False)
        .astype('int64')
    )

In [7]:
# Keep only the first row seen for each msno
combined = combined.drop_duplicates(subset='msno', keep='first')

In [8]:
# Read the training labels from Drive
train_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/KKbox_data/train_v2.csv'
)

In [9]:
# Left-join the feature table onto the labels, then replace every missing value with 0
train_data = (
    train_data
    .merge(combined, on='msno', how='left')
    .fillna(0)
)

In [10]:
# Keep only the first row seen for each msno
train_data = train_data.drop_duplicates(subset='msno', keep='first')

In [11]:
# Every column except the target and the user id is used as a model feature
train_features = train_data.columns.tolist()
for non_feature in ('is_churn', 'msno'):
    train_features.remove(non_feature)
train_data.head()

Unnamed: 0,msno,is_churn,bd,registered_via,city_feature,registration_method_f,year,month_feature,year_feature,payment_method_id,...,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,total_secs_mean,active_days,inactive_days
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,28.0,3.0,0.0,0.0,2013.0,1.0,1.0,0.0,...,7.0,0.0,3.0,0.0,71.0,68.0,17599.893,17599.893,1.0,30.0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,20.0,3.0,1.0,0.0,2013.0,1.0,1.0,36.0,...,0.0,1.0,1.0,0.0,0.0,2.0,217.548,217.548,1.0,30.0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,18.0,3.0,1.0,0.0,2013.0,1.0,1.0,17.0,...,10.0,1.0,1.0,1.0,3.0,15.0,1249.3,1249.3,1.0,30.0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,28.0,7.0,1.0,1.0,2014.0,1.0,1.0,41.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,35.0,7.0,1.0,1.0,2014.0,1.0,1.0,41.0,...,1.0,0.0,1.0,0.0,36.0,36.0,8746.115,8746.115,1.0,30.0


**Preparing the data for balancing**

In [12]:
# Split the labelled rows by class
churned_user = train_data[train_data['is_churn'] == 1]
not_churned_user = train_data[train_data['is_churn'] == 0]

# Keep a random 70% of the non-churned rows; the seed makes the draw repeatable
# (110 is the seed used for SMOTE and the train/CV split in this notebook)
not_churned_user_sampled = not_churned_user.sample(frac=0.7, random_state=110)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
new_sampled_data = pd.concat(
    [not_churned_user_sampled, churned_user],
    ignore_index=True,
)
new_sampled_data.head()

Unnamed: 0,msno,is_churn,bd,registered_via,city_feature,registration_method_f,year,month_feature,year_feature,payment_method_id,...,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,total_secs_mean,active_days,inactive_days
0,CY2/uUP2AgQiHm19f0ihdttcdJF8M5XgETOzsRgCSpw=,0,24.0,7.0,0.0,1.0,2010.0,1.0,1.0,41.0,...,0.0,0.0,0.0,0.0,3.0,1.0,960.0,960.0,1.0,30.0
1,R2y2L2wkpN3vU7L/tzXjKdY0ORX5NvDP8TrHxbGVRvE=,0,28.0,3.0,1.0,0.0,2015.0,1.0,0.0,0.0,...,9.0,1.0,3.0,0.0,42.0,54.0,10566.729,10566.729,1.0,30.0
2,nnNRppdpzjaOAzJ0DiiFU+LFek0uN5r27OIWq9I8L/0=,0,28.0,7.0,1.0,1.0,2017.0,1.0,0.0,41.0,...,3.0,2.0,0.0,0.0,2.0,3.0,779.677,779.677,1.0,30.0
3,dMm0XLU6ZAj36cLGt50L0BADbGCs8tOO7VLwWPkNt3o=,0,33.0,9.0,1.0,1.0,2006.0,0.0,1.0,39.0,...,3.0,3.0,0.0,1.0,28.0,33.0,6878.527,6878.527,1.0,30.0
4,71Evoio1j44c9AKUIJcYcsVrW2RFj1OENk0ODJ9gYik=,0,24.0,4.0,1.0,0.0,2017.0,1.0,0.0,36.0,...,5.0,0.0,2.0,1.0,55.0,53.0,14204.166,14204.166,1.0,30.0


In [13]:
# Feature matrix and target vector taken from the sampled frame
X_train, y_train = new_sampled_data[train_features], new_sampled_data['is_churn']

In [14]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE
from sklearn.calibration import CalibratedClassifierCV
sns.set()
from matplotlib.pyplot import figure

**Balancing data using SMOTE**

In [15]:
# Use SMOTE to balance the class imbalance in the dataset.
# NOTE(review): newer imbalanced-learn releases appear to deprecate SMOTE's
# n_jobs argument - confirm against the installed version.
smote = SMOTE(
    random_state=110,
    n_jobs=-1,
)
X_bal, y_bal = smote.fit_resample(X_train, y_train)

In [16]:
import pickle
from sklearn.preprocessing import StandardScaler

# Per-feature means of the balanced training data, saved to disk and used
# further down to fill missing values in the test set.
impute = dict(X_bal.mean())
# The context manager closes the file even if pickle.dump raises.
with open('impute.pkl', 'wb') as output:
    pickle.dump(impute, output)

# Fit the scaler on the balanced data and standardise it.
# Note: X_bal is overwritten by its scaled version, so this cell must not be
# run twice without re-running the SMOTE cell first.
sc = StandardScaler()
sc.fit(X_bal)
X_bal = sc.transform(X_bal)
X_bal.shape

(1237082, 33)

In [17]:
# Persist the fitted scaler using the highest pickle protocol available
with open("sc.pkl", mode='wb') as output:
    pickle.dump(sc, output, protocol=pickle.HIGHEST_PROTOCOL)

print(X_bal.shape, y_bal.shape)

(1237082, 33) (1237082,)


**Loading test data**

In [18]:
# Read the submission template and left-join the feature table onto it
test_data = pd.read_csv(
    '/content/drive/MyDrive/KKbox_data/sample_submission_v2.csv'
).merge(combined, on='msno', how='left')

In [19]:
# Fill missing test values with the per-feature means stored in `impute`.
# NOTE(review): the training frame fills missing values with 0 instead -
# confirm the two strategies are meant to differ.
test_data = test_data.fillna(impute)

In [20]:
# Keep only the first row seen for each msno
test_data = test_data.drop_duplicates(subset='msno', keep='first')

In [21]:
# Select the training feature columns and apply the scaler fitted on the training data
X_test = sc.transform(test_data[train_features])

In [22]:
# Shapes of the balanced training matrix and the test matrix
print(X_bal.shape, X_test.shape)

(1237082, 33) (907471, 33)


**Splitting the data**

In [23]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 split of the balanced data into training and CV sets.
# NOTE(review): X_bal already contains SMOTE-generated rows and was scaled
# before this split, so the CV set is not independent of the training data -
# confirm this is intended.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_bal,
    y_bal,
    test_size=0.3,
    random_state=110,
    stratify=y_bal,
)

In [24]:
# Turn the label vectors into column arrays of shape (n, 1).
# np.asarray accepts both a pandas Series and an ndarray, so the cell can be
# re-run safely (`.values` fails once the name already holds an ndarray).
y_train = np.asarray(y_train).reshape(-1, 1)
y_cv = np.asarray(y_cv).reshape(-1, 1)
print(X_train.shape, y_train.shape)

(865957, 33) (865957, 1)


# **Advanced Modeling - Deep Learning Model - MLP**

**MLP Model**

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    BatchNormalization, SeparableConv2D, MaxPooling2D, Activation, Flatten, Dropout, Dense)

In [26]:
from tensorflow.keras.layers import BatchNormalization

In [27]:
from keras.models import Sequential
import keras
from keras.layers import Dense, Dropout
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [28]:
%%javascript
// Replace the output area's _should_scroll hook with one that always returns false.
// NOTE(review): presumably this stops long cell outputs from being placed in a
// scroll box, and it relies on the front end exposing IPython.OutputArea -
// confirm for the target UI.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [29]:
# loading library
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable

from sklearn.metrics import log_loss

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Flatten, Concatenate, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

**Defining Custom callback function**

In [30]:
import tensorflow as tf

# Custom Keras callback: after every epoch it records the training loss and
# the F1 score computed on the held-out CV split.
# (Early stopping is configured separately and monitors val_loss.)
class CustomCallback(tf.keras.callbacks.Callback):
    """
    Track the training loss and the CV-set F1 score after each epoch.

    Parameters
    ----------
    X_cv : features passed to ``self.model.predict`` at the end of each epoch.
    y_cv : true labels compared with the thresholded predictions.

    Attributes
    ----------
    history : dict with the keys 'loss' and 'F1_score', each a list holding
        one entry per finished epoch.  It is reset whenever training starts.
    """

    def __init__(self, X_cv, y_cv):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.x = X_cv
        self.y = y_cv

    def on_train_begin(self, logs=None):
        self.history = {'loss': [], 'F1_score': []}

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None; avoid a shared mutable default argument.
        logs = logs or {}
        self.history['loss'].append(logs.get('loss'))
        # Threshold the sigmoid outputs at 0.5 to obtain class labels.
        y_pred = (self.model.predict(self.x) > 0.5).astype("int32")
        # NOTE(review): with average='micro' on a two-class target this value
        # should coincide with plain accuracy - confirm 'micro' is intended.
        F1_score = f1_score(self.y, y_pred, average='micro')
        self.history['F1_score'].append(F1_score)
        # Report the score of the epoch that just finished (index 0 would
        # repeat the first epoch's score on every epoch).
        print("F1 Score  {}".format(self.history['F1_score'][-1]))
               
from tensorflow.keras.callbacks import EarlyStopping

# F1 tracker evaluated on the CV split
history_own = CustomCallback(X_cv=X_cv, y_cv=y_cv)

# Early stopping on val_loss with a patience of 3
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
)

callback_list = [
    history_own,
    early_stop,
]

In [31]:
# Show the installed TensorFlow version.
# The builtin print is used; tf.print is intended for printing tensors.
print(tf.__version__)

2.8.2


In [32]:
# Hyperparameter tuning gave 512 and 32 units as the best configuration.
# Note: tuning and selection are done separately to avoid running out of memory in Colab.
model = Sequential([
    # Hidden block 1: dense -> batch normalization -> dropout
    Dense(512, input_dim=int(X_train.shape[1]), activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.25),

    # Hidden block 2: dense -> batch normalization -> dropout
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.25),

    # Hidden block 3: dense -> dropout
    Dense(32, activation='relu'),
    Dropout(rate=0.1),

    # Output layer: a single sigmoid unit
    Dense(1, activation='sigmoid'),
])

# summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               17408     
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                16416     
                                                                 
 batch_normalization_1 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 32)                0

In [33]:
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [34]:
# Save the model to Drive only when val_loss reaches a new minimum
filepath = "/content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

In [35]:
# Callbacks passed to model.fit: F1 tracker, early stopping, best-model checkpoint
callback_list = [
    history_own,
    early_stop,
    checkpoint,
]

In [36]:
# Train the MLP, validating on the CV split after every epoch
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=2**13,
    validation_data=(X_cv, y_cv),
    verbose=1,
    callbacks=callback_list,
)

Epoch 1/50

Epoch 1: val_loss improved from inf to 0.45743, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 2/50

Epoch 2: val_loss improved from 0.45743 to 0.38665, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 3/50

Epoch 3: val_loss improved from 0.38665 to 0.35065, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 4/50

Epoch 4: val_loss improved from 0.35065 to 0.28341, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 5/50

Epoch 5: val_loss improved from 0.28341 to 0.20991, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 6/50

Epoch 6: val_loss improved from 0.20991 to 0.18666, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 7/50

Epoch 7: val_loss improved from 0.18666 to 0.17807, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_MLP.h5
Epoch 8/50

Epoch 8: val_loss i

<keras.callbacks.History at 0x7f0b8ae83a50>

# **Advanced Modeling - Deep Learning Model - CNN**

**CNN Model**

In [37]:
# Reshape to (rows, features, 1): Conv1D needs a trailing channel axis
X_train = np.reshape(X_train, X_train.shape[:2] + (1,))
X_cv = np.reshape(X_cv, X_cv.shape[:2] + (1,))

In [38]:
import tensorflow as tf
from sklearn.metrics import f1_score
# Custom Keras callback: after every epoch it records the training loss and
# the F1 score computed on the stratified CV split passed in by the caller.
# (Early stopping is configured separately and monitors val_loss.)
class CustomCallback(tf.keras.callbacks.Callback):
    """
    Track the training loss and the CV-set F1 score after each epoch.

    Parameters
    ----------
    X_cv : features passed to ``self.model.predict`` at the end of each epoch.
    y_cv : true labels compared with the thresholded predictions.

    Attributes
    ----------
    history : dict with the keys 'loss' and 'F1_score', each a list holding
        one entry per finished epoch.  It is reset whenever training starts.
    """

    def __init__(self, X_cv, y_cv):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.x = X_cv
        self.y = y_cv

    def on_train_begin(self, logs=None):
        self.history = {'loss': [], 'F1_score': []}

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None; avoid a shared mutable default argument.
        logs = logs or {}
        self.history['loss'].append(logs.get('loss'))
        # Threshold the sigmoid outputs at 0.5 to obtain class labels.
        y_pred = (self.model.predict(self.x) > 0.5).astype("int32")
        # NOTE(review): with average='micro' on a two-class target this value
        # should coincide with plain accuracy - confirm 'micro' is intended.
        F1_score = f1_score(self.y, y_pred, average='micro')
        self.history['F1_score'].append(F1_score)
        # Report the score of the epoch that just finished (index 0 would
        # repeat the first epoch's score on every epoch).
        print("F1 Score  {}".format(self.history['F1_score'][-1]))
               
from tensorflow.keras.callbacks import EarlyStopping

# F1 tracker evaluated on the (reshaped) CV split
history_own = CustomCallback(X_cv=X_cv, y_cv=y_cv)

# Early stopping on val_loss with a patience of 3
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
)

callback_list = [
    history_own,
    early_stop,
]

In [39]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPool1D

In [40]:
# Set the TF_FORCE_GPU_ALLOW_GROWTH environment variable for this process.
# NOTE(review): TensorFlow has already been imported and used by this point -
# confirm the setting still takes effect when applied this late.
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

After hyperparameter tuning, we get the best results with 32 filters in each of the two convolutional layers (32, 32).

In [41]:
# 1-D CNN: two convolutions, dropout, max-pooling, then a dense head
model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1:])),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Dropout(0.5),
    MaxPool1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    # Output layer: a single sigmoid unit
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 31, 32)            128       
                                                                 
 conv1d_1 (Conv1D)           (None, 29, 32)            3104      
                                                                 
 dropout_3 (Dropout)         (None, 29, 32)            0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 14, 32)           0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 448)               0         
                                                                 
 dense_4 (Dense)             (None, 100)               44900     
                                                      

In [42]:
# Save the model to Drive only when val_loss reaches a new minimum
filepath = "/content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

In [43]:
# Callbacks passed to model.fit: F1 tracker, early stopping, best-model checkpoint
callback_list = [
    history_own,
    early_stop,
    checkpoint,
]

In [44]:
# Train the CNN, validating on the CV split after every epoch
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=2**16,
    validation_data=(X_cv, y_cv),
    verbose=1,
    callbacks=callback_list,
)

Epoch 1/50




Epoch 1: val_loss improved from inf to 0.40716, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 2/50

Epoch 2: val_loss improved from 0.40716 to 0.33169, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 3/50

Epoch 3: val_loss improved from 0.33169 to 0.31315, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 4/50

Epoch 4: val_loss improved from 0.31315 to 0.28777, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 5/50

Epoch 5: val_loss improved from 0.28777 to 0.26766, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 6/50

Epoch 6: val_loss improved from 0.26766 to 0.24732, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 7/50

Epoch 7: val_loss improved from 0.24732 to 0.23316, saving model to /content/drive/MyDrive/KKbox_data/Models/best_model_CNN.h5
Epoch 8/50

Epoch 8: val_loss improved fro

<keras.callbacks.History at 0x7f0b5a074e50>