In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import re

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import xgboost

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

import mlflow

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [89]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [19]:
def read_dataframe(filename):
    df = pd.read_csv(filename)
    date_cols = [i for i in list(df.columns) if re.search('date',i)]
    df= df.drop(columns=date_cols, axis=1)
    X = df.drop(['circle_id'], axis=1).iloc[:,:-1]
    y = df.iloc[:,-1]
    print(f"X shape: {X.shape}\n y.shape : {y.shape}")
    return X, y 

In [33]:
def drop_missing_cols(X_train,threshold):
    missing_percent_cols = (X_train.isnull().sum()/len(X_train))*100
    new_vars = missing_percent_cols[missing_percent_cols.le(threshold)].index
    X_train_filtered = X_train[new_vars]
    return X_train_filtered, new_vars

In [24]:
X, y = read_dataframe("data/train.csv")

X shape: (69999, 161)
 y.shape : (69999,)


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.2,
                                                        random_state=123)
    

In [36]:
X_train, columns = drop_missing_cols(X_train, 40)
print(X_train.shape)
print(columns)

(55999, 134)
Index(['id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
       'arpu_7', 'arpu_8', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
       ...
       'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8', 'sachet_3g_6',
       'sachet_3g_7', 'sachet_3g_8', 'aon', 'aug_vbc_3g', 'jul_vbc_3g',
       'jun_vbc_3g'],
      dtype='object', length=134)


In [40]:
pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=100))

    ]
)

In [42]:
preprocessor = ColumnTransformer([
    ('pipeline',pipeline,columns)
])

In [43]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), 
                       columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), 
                      columns=preprocessor.get_feature_names_out())

In [44]:
X_train.head()

Unnamed: 0,pipeline__pca0,pipeline__pca1,pipeline__pca2,pipeline__pca3,pipeline__pca4,pipeline__pca5,pipeline__pca6,pipeline__pca7,pipeline__pca8,pipeline__pca9,...,pipeline__pca90,pipeline__pca91,pipeline__pca92,pipeline__pca93,pipeline__pca94,pipeline__pca95,pipeline__pca96,pipeline__pca97,pipeline__pca98,pipeline__pca99
0,-1.732532,0.679236,-0.460938,-0.821699,0.500221,-0.130133,0.286959,-0.132656,0.382695,0.480882,...,0.053303,0.030658,-0.030872,-0.089224,0.034232,0.001588,-0.105276,0.012353,-0.04347,0.066526
1,7.910175,1.015989,-1.256365,-6.464776,2.818275,-1.638189,-2.55167,6.311229,1.10356,3.568079,...,0.232034,-0.076065,-0.651288,-2.519527,0.489677,0.103411,0.119701,-0.347589,-0.420804,0.503403
2,-3.259124,0.133065,-0.089215,-0.239308,-1.732697,1.377537,0.723274,1.489048,-0.153142,-0.983056,...,-0.245586,-0.293025,0.010085,-0.069031,0.002803,-0.048569,0.022753,0.009962,-0.00154,0.033866
3,-0.774358,1.789541,-0.282734,-0.620913,0.346198,-0.743539,0.068641,-0.305215,-0.727888,-0.686296,...,-0.30721,-0.284599,0.213648,-0.088585,-0.255612,0.036695,0.229778,-0.012098,-0.219625,0.296402
4,-1.574759,-0.285531,0.44088,0.061489,1.766013,-0.636008,0.047681,0.063769,-0.357975,0.084829,...,0.213435,0.019155,0.073244,-0.066424,-0.015537,0.016626,-0.023449,-0.101641,-0.090696,0.026854


In [45]:
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

In [74]:
y_pred = logistic_regression_model.predict(X_test)
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)

0.39073514602215503

In [75]:
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

In [76]:
mlflow.autolog()
models = {
    'LogisticRegression': LogisticRegression(),
    #'RandomForestRClassifier' : RandomForestClassifier(),
    'LightGBMClassifier' : LGBMClassifier(),
    'XGBoostClassifier' : XGBClassifier()
}

model_list = []
trained_model_list = []
accuracy_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model_name = list(models.keys())[i]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("accuracy:",accuracy)
    print("precision:",precision)
    print("recall:",recall)
    print("f1:", f1)

    accuracy_list.append(accuracy_score)
    
    
    print('='*35)
    print('\n')

2023/09/18 21:22:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2023/09/18 21:22:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2023/09/18 21:22:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


LogisticRegression
Model Training Performance
accuracy: 0.9135714285714286
precision: 0.701627486437613
recall: 0.27076064200976974
f1: 0.39073514602215503


[LightGBM] [Info] Number of positive: 5699, number of negative: 50300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 55999, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.101770 -> initscore=-2.177714
[LightGBM] [Info] Start training from score -2.177714




LightGBMClassifier
Model Training Performance
accuracy: 0.9228571428571428
precision: 0.6695485110470701
recall: 0.48639218422889047
f1: 0.5634599838318514






XGBoostClassifier
Model Training Performance
accuracy: 0.9195714285714286
precision: 0.644674835061263
recall: 0.47732030704815076
f1: 0.5485164394546913




In [83]:
train = xgboost.DMatrix(X_train, label=y_train)
valid = xgboost.DMatrix(X_test, label=y_test)


In [113]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboostclassifier")
        mlflow.log_params(params)
        booster = xgboost.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

    return {'accuracy': accuracy, 'status': STATUS_OK}

In [114]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.29365                                                  
[1]	validation-logloss:0.26801                                                  
[2]	validation-logloss:0.25244                                                  
[3]	validation-logloss:0.24167                                                  
[4]	validation-logloss:0.23546                                                  
[5]	validation-logloss:0.23267                                                  
[6]	validation-logloss:0.23279                                                  
[7]	validation-logloss:0.23287                                                  
[8]	validation-logloss:0.23465                                                  
[9]	validation-logloss:0.23691                                                  
[10]	validation-logloss:0.23905                                                 
[11]	validation-logloss:0.24221                                                 
[12]	validation-logloss:0.24


ERROR [hyperopt.fmin] job exception: Classification metrics can't handle a mix of binary and continuous targets


  0%|                                    | 0/50 [00:11<?, ?trial/s, best loss=?]


ValueError: Classification metrics can't handle a mix of binary and continuous targets