In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import re

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

import xgboost

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

import mlflow

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides warnings raised by the model
# fits below (e.g. solver convergence or deprecation notices) — consider
# narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Point MLflow at a SQLite tracking store; the URI names the file mlflow.db.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [3]:
def read_dataframe(filename):
    """Load a CSV and split it into a feature frame and a target Series.

    Columns whose name contains the substring 'date' are discarded first.
    The target is the last remaining column; the features are every
    remaining column except 'circle_id' and the last one.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    (X, y) : tuple of (DataFrame, Series)
        The shapes of both are printed before returning.
    """
    raw = pd.read_csv(filename)

    # Identify and remove every column that looks like a date field.
    date_like = [name for name in raw.columns if re.search('date', name)]
    trimmed = raw.drop(columns=date_like)

    # Features: everything but 'circle_id', minus the trailing target column.
    without_circle = trimmed.drop(columns=['circle_id'])
    X = without_circle.iloc[:, :-1]

    # Target: the final column of the date-stripped frame.
    y = trimmed.iloc[:, -1]

    print(f"X shape: {X.shape}\n y.shape : {y.shape}")
    return X, y

In [4]:
def drop_missing_cols(X_train,threshold):
    """Keep only the columns whose share of missing values is within a limit.

    Parameters
    ----------
    X_train : DataFrame to filter.
    threshold : maximum allowed percentage (0-100) of null values per
        column; a column exactly at the threshold is kept.

    Returns
    -------
    (filtered, kept) : tuple of (DataFrame, Index)
        The frame restricted to the surviving columns, and their labels.
    """
    null_counts = X_train.isna().sum()
    pct_missing = (null_counts / len(X_train)) * 100

    within_limit = pct_missing <= threshold
    new_vars = pct_missing.loc[within_limit].index

    return X_train[new_vars], new_vars

In [5]:
# Load the training CSV and split it into features (X) and target (y).
X, y = read_dataframe("data/train.csv")

X shape: (69999, 161)
 y.shape : (69999,)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): the split is not stratified. The LightGBM log further down
# reports roughly 10% positives, so stratify=y may be worth considering.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.2,
                                                        random_state=123)
    

In [7]:
# Drop columns with more than 40% missing values, judged on the training
# split only. `columns` holds the surviving labels and is reused below to tell
# the ColumnTransformer which columns to process.
# NOTE(review): the printed column list still includes 'id', so the row
# identifier is fed into scaling/PCA as a feature — confirm this is intended.
X_train, columns = drop_missing_cols(X_train, 40)
print(X_train.shape)
print(columns)

(55999, 134)
Index(['id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
       'arpu_7', 'arpu_8', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
       ...
       'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8', 'sachet_3g_6',
       'sachet_3g_7', 'sachet_3g_8', 'aon', 'aug_vbc_3g', 'jul_vbc_3g',
       'jun_vbc_3g'],
      dtype='object', length=134)


In [8]:
# Numeric preprocessing chain applied to the selected columns:
#   1. fill remaining missing values with the constant 0,
#   2. standardise each column,
#   3. project onto 100 principal components.
pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=100))

    ]
)

In [9]:
# Apply the pipeline to the columns kept by drop_missing_cols. No `remainder`
# is given, so ColumnTransformer's default applies (other columns are not
# passed through — the transformed frames below have exactly 100 columns).
preprocessor = ColumnTransformer([
    ('pipeline',pipeline,columns)
])

In [10]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the held-out split, wrapping both results as DataFrames named
# after the transformer's output features.
# NOTE(review): this overwrites X_train / X_test with their transformed
# versions, so the cell is not re-runnable without re-executing the split and
# column-filter cells first. The new frames also get a fresh default index
# while y_train / y_test presumably keep the index from the split — fine for
# positional use in fit/predict, but avoid index-aligned operations between them.
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), 
                       columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), 
                      columns=preprocessor.get_feature_names_out())

In [11]:
# Preview the first rows of the PCA-transformed training features.
X_train.head()

Unnamed: 0,pipeline__pca0,pipeline__pca1,pipeline__pca2,pipeline__pca3,pipeline__pca4,pipeline__pca5,pipeline__pca6,pipeline__pca7,pipeline__pca8,pipeline__pca9,...,pipeline__pca90,pipeline__pca91,pipeline__pca92,pipeline__pca93,pipeline__pca94,pipeline__pca95,pipeline__pca96,pipeline__pca97,pipeline__pca98,pipeline__pca99
0,-1.732532,0.679236,-0.460938,-0.821699,0.500221,-0.130133,0.286959,-0.132656,0.382695,0.480882,...,0.053303,0.030658,-0.030872,-0.089224,0.034232,0.001588,-0.105276,0.012353,-0.04347,0.066526
1,7.910175,1.015989,-1.256365,-6.464776,2.818275,-1.638189,-2.55167,6.311229,1.10356,3.568079,...,0.232034,-0.076065,-0.651288,-2.519527,0.489677,0.103411,0.119701,-0.347589,-0.420804,0.503403
2,-3.259124,0.133065,-0.089215,-0.239308,-1.732697,1.377537,0.723274,1.489048,-0.153142,-0.983056,...,-0.245586,-0.293025,0.010085,-0.069031,0.002803,-0.048569,0.022753,0.009962,-0.00154,0.033866
3,-0.774358,1.789541,-0.282734,-0.620913,0.346198,-0.743539,0.068641,-0.305215,-0.727888,-0.686296,...,-0.30721,-0.284599,0.213648,-0.088585,-0.255612,0.036695,0.229778,-0.012098,-0.219625,0.296402
4,-1.574759,-0.285531,0.44088,0.061489,1.766013,-0.636008,0.047681,0.063769,-0.357975,0.084829,...,0.213435,0.019155,0.073244,-0.066424,-0.015537,0.016626,-0.023449,-0.101641,-0.090696,0.026854


In [12]:
# Baseline: logistic regression with default hyperparameters on the
# transformed training features.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

In [13]:
# Score the baseline logistic regression on the held-out split.
y_pred = logistic_regression_model.predict(X_test)
# A cell only displays its final expression, so a bare accuracy_score(...) on
# its own line would be computed and thrown away. Emit both metrics together
# as an (accuracy, f1) tuple so neither is lost.
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

0.39073514602215503

In [14]:
def evaluate_model(y_true, y_pred):
    """Score predictions with four classification metrics.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1), each computed by the matching
        ``sklearn.metrics`` function called with its default arguments.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [19]:
# Turn on MLflow autologging before fitting, so the fits below are recorded
# as MLflow runs.
mlflow.autolog()

# Candidate classifiers, all with default hyperparameters.
models = {
    'LogisticRegression': LogisticRegression(),
    #'RandomForestRClassifier' : RandomForestClassifier(),
    'LightGBMClassifier' : LGBMClassifier(),
    'XGBoostClassifier' : XGBClassifier()
    }

# Parallel result lists: entry k of each list belongs to the k-th model.
model_list = []
trained_model_list = []
accuracy_list = []


for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Keep the fitted estimator so it can be reused without refitting.
    trained_model_list.append(model)

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: despite the label, these metrics are computed on the held-out
    # split (X_test / y_test), not on the training data.
    print('Model Training Performance')
    print("accuracy:",accuracy)
    print("precision:",precision)
    print("recall:",recall)
    print("f1:", f1)

    # Record the computed accuracy value (not the accuracy_score function).
    accuracy_list.append(accuracy)

    print('='*35)
    print('\n')

2023/09/20 18:12:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2023/09/20 18:12:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2023/09/20 18:12:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/09/20 18:12:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '890f4b2a0bb4417687ba4757d81a01d4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/09/20 18:12:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0d0f48764d624159abd1f58b8ad2d76d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


LogisticRegression
Model Training Performance
accuracy: 0.9135714285714286
precision: 0.701627486437613
recall: 0.27076064200976974
f1: 0.39073514602215503


[LightGBM] [Info] Number of positive: 5699, number of negative: 50300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 55999, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.101770 -> initscore=-2.177714
[LightGBM] [Info] Start training from score -2.177714


2023/09/20 18:12:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '914b8e721ec14892868a62003e8929f1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


LightGBMClassifier
Model Training Performance
accuracy: 0.9228571428571428
precision: 0.6695485110470701
recall: 0.48639218422889047
f1: 0.5634599838318514






XGBoostClassifier
Model Training Performance
accuracy: 0.9195714285714286
precision: 0.644674835061263
recall: 0.47732030704815076
f1: 0.5485164394546913




In [None]:
# Wrap the transformed splits as XGBoost DMatrix objects for the native
# xgboost.train API.
# NOTE(review): this cell has no execution count and `train` / `valid` are
# only referenced by the commented-out hyperopt objective below — it looks
# like a leftover from a disabled experiment.
train = xgboost.DMatrix(X_train, label=y_train)
valid = xgboost.DMatrix(X_test, label=y_test)


In [21]:
# Disabled experiment: hyperopt objective that trains an XGBoost booster and
# logs its accuracy to MLflow.
# NOTE(review): before re-enabling, check two things:
#   * with 'binary:logistic' the booster's predict() is expected to return
#     probabilities, so they need thresholding before accuracy_score;
#   * hyperopt's fmin minimises a 'loss' entry of the returned dict, which
#     this objective does not provide (e.g. 'loss': -accuracy).
# def objective(params):
#     with mlflow.start_run():
#         mlflow.set_tag("model", "xgboostclassifier")
#         mlflow.log_params(params)
#         booster = xgboost.train(
#             params=params,
#             dtrain=train,
#             num_boost_round=1000,
#             evals=[(valid, 'validation')],
#             early_stopping_rounds=50
#         )
#         y_pred = booster.predict(valid)
#         accuracy = accuracy_score(y_test, y_pred)
#         mlflow.log_metric("accuracy", accuracy)

#     return {'accuracy': accuracy, 'status': STATUS_OK}

In [20]:
# Disabled experiment: hyperopt search space and TPE driver (50 evaluations)
# for the commented-out XGBoost objective. Kept for reference only.
# search_space = {
#     'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
#     'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
#     'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
#     'objective': 'binary:logistic',
#     'seed': 42
# }

# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

In [25]:
# NOTE(review): ParameterGrid is imported here but not used in the visible
# cells; imports would also be easier to track in the top import cell.
from sklearn.model_selection import ParameterGrid

# LightGBM hyperparameter grid: 3 values for each of 4 parameters, i.e.
# 81 combinations.
# NOTE(review): max_depth limits how many leaves a tree can have, so some
# num_leaves values (e.g. 63 with max_depth=3) probably duplicate smaller
# settings — confirm against the LightGBM parameter docs.
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31, 63],
    'n_estimators': [50, 100, 200],
}

In [26]:
# Fixed LightGBM settings shared by every grid-search candidate.
# NOTE(review): 'device': 'gpu' is hard-coded; this presumably requires a
# GPU-enabled LightGBM build and will not run on CPU-only machines. No
# random_state is set either, so results may vary between runs.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'device': 'gpu',
}

lgb_model = LGBMClassifier(**params)

In [27]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over param_grid.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method (mean accuracy for classifiers).
# NOTE(review): n_jobs=-1 runs the fits in parallel worker processes while
# the estimator is configured for a single GPU — check for device contention.
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [28]:
# Run the grid search on the transformed training split. This is slow: the
# MLflow log lines in the recorded output are about 53 minutes apart.
grid_search.fit(X_train, y_train)

2023/09/20 18:17:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd3511a38adc64eba986456677a04e203', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
 generated.
.


[LightGBM] [Info] Number of positive: 4559, number of negative: 40240
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 44799, number of used features: 100
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 100 dense feature groups (4.27 MB) transferred to GPU in 0.025334 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.101766 -> initscore=-2.177758
[LightGBM] [Info] Start training from score -2.177758
[LightGBM] [Info] Number of positive: 4559, number of negative: 40240
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 44799, number of used features: 100
[LightGBM] [Info]

2023/09/20 19:10:39 INFO mlflow.sklearn.utils: Logging the 5 best runs, 76 runs will be omitted.


In [29]:
# Take the best estimator found by the grid search and report its score
# (mean accuracy, via the estimator's score method) on the held-out split.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)

print(f'Test accuracy: {test_score:.4f}')

Test accuracy: 0.9229
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 100 dense feature groups (4.27 MB) transferred to GPU in 0.045085 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.101766 -> initscore=-2.177758
[LightGBM] [Info] Start training from score -2.177758
[LightGBM] [Info] Number of positive: 4559, number of negative: 40240
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 44799, number of used features: 100
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bi

In [68]:
# Load the unlabeled test CSV as-is (no column dropping here) and preview it.
test_df = pd.read_csv("data/test.csv")
test_df.head()

Unnamed: 0,id,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,arpu_6,arpu_7,...,sachet_3g_6,sachet_3g_7,sachet_3g_8,fb_user_6,fb_user_7,fb_user_8,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g
0,69999,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,91.882,65.33,...,0,0,0,,,,1692,0.0,0.0,0.0
1,70000,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,414.168,515.568,...,0,0,0,,,,2533,0.0,0.0,0.0
2,70001,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,329.844,434.884,...,0,0,0,,,,277,525.61,758.41,241.84
3,70002,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,43.55,171.39,...,0,0,0,,,,1244,0.0,0.0,0.0
4,70003,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,306.854,406.289,...,0,0,0,,,,462,0.0,0.0,0.0


In [69]:
# Pull out the row identifiers of the test set.
# NOTE(review): `output` is not referenced again in the visible cells; the
# submission frame reads test_df['id'] directly.
output = test_df['id']
output.head()

0    69999
1    70000
2    70001
3    70002
4    70003
Name: id, dtype: int64

In [70]:
# Apply the already-fitted preprocessor to the raw test frame. The
# ColumnTransformer was configured with the column labels in `columns`, so
# presumably the extra raw columns still present in test_df (dates,
# circle_id, ...) are simply not selected — the result has 100 columns.
test_df1 = pd.DataFrame(preprocessor.transform(test_df), 
                      columns=preprocessor.get_feature_names_out())

In [71]:
# Sanity check: the transformed test set and the held-out split should have
# the same number of feature columns.
test_df1.shape, X_test.shape

((30000, 100), (14000, 100))

In [72]:
# Predict class labels for the test set and store them next to the features.
# The prediction column is excluded from the model input so the cell can be
# re-run: otherwise a second run would pass 101 columns (100 features plus
# the previous predictions) to a model trained on 100. On the first run the
# column does not exist yet, hence errors='ignore'.
# NOTE: predict() returns class labels, not probabilities, despite the
# column name.
test_df1['churn_probability'] = best_model.predict(
    test_df1.drop(columns='churn_probability', errors='ignore')
)


In [49]:
# Preview the transformed test features together with the new prediction
# column.
# NOTE(review): the execution count here is lower than the preceding cell's,
# so this cell was run out of order; re-run top to bottom before sharing.
test_df1.head()

Unnamed: 0,pipeline__pca0,pipeline__pca1,pipeline__pca2,pipeline__pca3,pipeline__pca4,pipeline__pca5,pipeline__pca6,pipeline__pca7,pipeline__pca8,pipeline__pca9,...,pipeline__pca91,pipeline__pca92,pipeline__pca93,pipeline__pca94,pipeline__pca95,pipeline__pca96,pipeline__pca97,pipeline__pca98,pipeline__pca99,churn_probability
0,-3.378789,-0.345335,-0.520182,-0.478663,0.447776,-0.072198,0.096713,0.020709,-0.018922,0.699081,...,0.026612,-0.007356,-0.054149,-0.080206,0.012509,0.009586,0.010022,-0.014246,0.004424,0
1,2.996234,1.929899,-0.201901,-2.069628,2.451942,0.49604,-0.654911,1.507858,0.097903,1.916352,...,0.03764,0.075686,0.740335,-0.168845,-0.07232,0.09543,0.008893,-0.011573,-0.134552,0
2,-0.771199,-0.654463,3.873529,0.027625,1.4595,-0.378581,0.868152,-0.441638,1.339792,-0.787206,...,0.459607,0.109304,-0.251431,-0.431004,-0.357806,0.606423,0.031094,0.023073,0.233261,0
3,-0.496549,3.319883,-1.898492,0.314848,-0.161977,1.108573,0.225528,-2.063739,0.474433,0.196699,...,0.034109,0.265646,0.35967,-0.133417,0.021826,0.049685,-0.043128,0.126099,-0.080814,0
4,1.007505,-4.169972,-0.790315,1.497918,-0.249116,-2.938412,1.708554,-0.981573,-1.147744,-0.205662,...,-0.0057,-0.125553,0.099031,0.1322,-0.016067,0.09914,-0.188553,-0.006576,-0.107108,0


In [51]:
# Preview the transformed held-out features, for comparison with test_df1.
X_test.head()

Unnamed: 0,pipeline__pca0,pipeline__pca1,pipeline__pca2,pipeline__pca3,pipeline__pca4,pipeline__pca5,pipeline__pca6,pipeline__pca7,pipeline__pca8,pipeline__pca9,...,pipeline__pca90,pipeline__pca91,pipeline__pca92,pipeline__pca93,pipeline__pca94,pipeline__pca95,pipeline__pca96,pipeline__pca97,pipeline__pca98,pipeline__pca99
0,-3.713613,-0.219286,-0.037236,0.033306,0.94101,-0.290773,0.51921,-0.112511,0.180377,0.497417,...,-0.135052,0.012305,-0.062267,-0.001631,0.131164,-0.016707,0.00605,-0.049991,0.190062,-0.060819
1,1.177094,3.219637,-1.975299,-0.342552,0.101206,1.343753,-0.149618,-0.860529,0.10641,0.221337,...,0.191441,-0.019506,-0.058472,0.197966,-0.056271,0.028726,0.022035,-0.076512,0.027201,-0.03606
2,5.143509,7.147074,5.259315,0.64586,-1.395741,-0.923625,-2.195386,-5.423882,-2.723639,-1.403866,...,-0.240946,-0.066652,-0.213236,-0.390528,0.289544,-0.492994,-0.015498,-0.337112,-0.25654,0.152212
3,9.470537,-9.645156,-3.141588,-1.407513,-3.09857,-5.148052,3.373347,-4.168058,-3.086079,-1.11175,...,-0.013379,-0.091849,-0.344061,0.37184,0.153175,-0.031224,-0.081303,0.061221,-0.123892,-0.045203
4,-1.92113,-1.536462,-0.710984,0.065647,0.469037,0.8075,-0.66122,-0.019358,0.526045,1.047961,...,0.025914,-0.009315,-0.287191,0.237049,0.187153,-0.073054,-0.068052,0.023888,-0.078836,-0.054685


In [52]:
# Preview the raw test frame again (same output as when it was loaded).
test_df.head()

Unnamed: 0,id,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,arpu_6,arpu_7,...,sachet_3g_6,sachet_3g_7,sachet_3g_8,fb_user_6,fb_user_7,fb_user_8,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g
0,69999,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,91.882,65.33,...,0,0,0,,,,1692,0.0,0.0,0.0
1,70000,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,414.168,515.568,...,0,0,0,,,,2533,0.0,0.0,0.0
2,70001,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,329.844,434.884,...,0,0,0,,,,277,525.61,758.41,241.84
3,70002,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,43.55,171.39,...,0,0,0,,,,1244,0.0,0.0,0.0
4,70003,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,306.854,406.289,...,0,0,0,,,,462,0.0,0.0,0.0


In [88]:
# Assemble the submission: one row per test id with its predicted label.
# The two Series are combined by index; both frames presumably carry the
# default positional index, so rows line up one-to-one.
submission_file = pd.DataFrame({
    'id' : test_df['id'],
    'churn_probability':test_df1['churn_probability']
})

In [89]:
# Preview the submission frame before writing it out.
submission_file.head()

Unnamed: 0,id,churn_probability
0,69999,0
1,70000,0
2,70001,0
3,70002,0
4,70003,0


In [90]:
# Write the submission to CSV without the DataFrame index.
# NOTE(review): the filename contains a literal '!' — possibly a typo for
# '1'; confirm before relying on the name.
submission_file.to_csv('submission_lightgbm_!.csv',index=False)