In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test tables from CSV files in the working directory.
print('Loading data...', end='')
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))
print('Done!')

Loading data...Done!


### Data preprocessing

In [3]:
print('Preprocessing data...', flush=True)

def prep(df_enc, df_enc_test):
    """Encode, prune and impute a train/test pair of frames.

    Both frames are modified IN PLACE (later cells inspect them afterwards):

    * ``tenure`` is replaced by the rank of each level in the sorted list of
      levels present in the training frame; levels that only occur in the
      test frame are left unchanged.
    * ``mrg``, ``top_pack`` and ``user_id`` are dropped.
    * The numeric columns listed in ``cols`` have their missing values
      replaced by the column mean of the frame they belong to.
    * ``zone1`` / ``zone2`` have their missing values replaced by -100.

    Parameters
    ----------
    df_enc : pandas.DataFrame
        Training frame; must contain a ``churn`` column.
    df_enc_test : pandas.DataFrame
        Test frame with the same feature columns.

    Returns
    -------
    X_train : pandas.DataFrame
        ``df_enc`` without the ``churn`` column.
    y_train : pandas.Series
        The ``churn`` column of ``df_enc``.
    df_enc_test : pandas.DataFrame
        The (mutated) test frame that was passed in.
    """
    # Ordinal encoding: build the level -> rank table once and apply it in a
    # single pass, so an already-written rank can never be re-matched by a
    # later level and no int is written cell-by-cell into a string column.
    tenure_codes = {}
    for code, level in enumerate(np.sort(df_enc['tenure'].unique())):
        tenure_codes[level] = code

    def encode_tenure(level):
        # Levels unseen in training are passed through untouched.
        return tenure_codes.get(level, level)

    df_enc['tenure'] = df_enc['tenure'].map(encode_tenure)
    df_enc_test['tenure'] = df_enc_test['tenure'].map(encode_tenure)

    # Drop cols
    unused = ['mrg', 'top_pack', 'user_id']
    df_enc.drop(columns=unused, inplace=True)
    df_enc_test.drop(columns=unused, inplace=True)

    # Fillna
    cols = ['montant', 'frequence_rech', 'revenue', 'arpu_segment',
            'frequence', 'data_volume', 'on_net', 'orange', 'tigo',
            'regularity', 'freq_top_pack']

    for frame in (df_enc, df_enc_test):
        # Assign the filled column back instead of `frame[c].fillna(...,
        # inplace=True)`: the chained in-place form is deprecated and does
        # not update `frame` when pandas Copy-on-Write is active.
        # NOTE(review): each frame is imputed with its OWN mean, i.e. the
        # test frame does not reuse training statistics - confirm intended.
        for col in cols:
            frame[col] = frame[col].fillna(frame[col].mean())
        for col in ('zone1', 'zone2'):
            frame[col] = frame[col].fillna(-100)

    # Creating X, y train
    y_train = df_enc.churn
    X_train = df_enc.drop(columns='churn')

    return X_train, y_train, df_enc_test

Preprocessing data...


### Model import

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import optuna

In [5]:
# Lower
df_train.columns = map(str.lower, df_train.columns)
df_test.columns = map(str.lower, df_test.columns)


# One hot
df_enc = pd.get_dummies(df_train, columns=['region'])
df_enc_test = pd.get_dummies(df_test, columns=['region'])


# Applying preprocessing function
X_train, y_train, df_enc_test = prep(df_enc, df_enc_test)


# Applying logreg
clf = make_pipeline(StandardScaler(), 
                    LogisticRegression(C=0.001, 
                                       penalty='elasticnet', 
                                       solver='saga', 
                                       l1_ratio=0.0))

print('\tFitting logreg...', end='', flush=True)
clf.fit(X_train, y_train)
print('Done!')


# Forming ordinal encoding
coefs = np.abs(clf.steps[1][1].coef_[0][14:]).reshape(14, 1)
regions = np.array(X_train.columns[14:]).reshape(14, 1)

reg_ordinal = np.concatenate((coefs, regions), axis=1)
reg_ordinal = reg_ordinal[reg_ordinal[:, 0].argsort()]

for i in range(reg_ordinal.shape[0]):
    reg_ordinal[i, 0] = i
    

# Creating final df
df_ord = df_train.copy(deep=True)
df_ord_test = df_test.copy(deep=True)

for i, j in reg_ordinal:
    index = j.find('_') + 1
    df_ord.loc[df_ord['region'] == j[index:], 'region'] = i
    df_ord_test.loc[df_ord_test['region'] == j[index:], 'region'] = i
    
df_ord['region'].fillna(-1, inplace=True)
df_ord_test['region'].fillna(-1, inplace=True)


X_ord_train, y_ord_train, df_ord_test = prep(df_ord, df_ord_test)
print('Done!')

	Fitting logreg...Done!
Done!


In [7]:
# Preview the first rows of df_ord, which prep() modified in place
# (columns dropped, missing values filled) when it was called on it.
df_ord.head()

Unnamed: 0,region,tenure,montant,frequence_rech,revenue,arpu_segment,frequence,data_volume,on_net,orange,tigo,zone1,zone2,regularity,freq_top_pack,churn
0,4,7,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,8.0,0
1,-1,5,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,95.418711,23.109253,-100.0,-100.0,4,9.272461,1
2,-1,7,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,46.0,7.0,-100.0,-100.0,17,1.0,0
3,13,7,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,-100.0,-100.0,62,11.0,0
4,13,7,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,24.0,23.109253,-100.0,-100.0,11,2.0,0


### LogisticRegression

In [6]:
# def objective(trial):
#     l1_ratio = trial.suggest_float('l1_ratio', 0, 1, step=0.1)
#     C_var = trial.suggest_float('log_C', -3, 5, step=1)
    
#     clf = make_pipeline(StandardScaler(), 
#                         LogisticRegression(C=10**C_var, penalty='elasticnet', solver='saga', l1_ratio=l1_ratio))
    
#     return cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=5).mean()

In [7]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=5)

In [8]:
# study.best_params

In [9]:
# Best params after gridsearch: (kaggle)
# {'logisticregression__C': 0.001, 'logisticregression__l1_ratio': 0.0}

# clf = make_pipeline(StandardScaler(), 
#                     LogisticRegression(C=0.001, penalty='elasticnet', solver='saga', l1_ratio=0.0))
# cross_val_score(clf, X_train, y_train, cv=5, verbose=1, scoring='roc_auc', n_jobs=5)

In [10]:
# clf.fit(X_train, y_train)

### RandomForestClassifier

In [11]:
# from sklearn.ensemble import RandomForestClassifier

In [12]:
# clf_forest = RandomForestClassifier(n_estimators=400, criterion='entropy', min_samples_leaf=20)

# %time cross_val_score(clf_forest, X_ord_train, y_ord_train, cv=5, n_jobs=5, scoring='roc_auc').mean()

In [13]:
# clf_forest.fit(X_ord_train, y_ord_train)

In [14]:
# res = pd.DataFrame()
# res['user_id'] = df_test.user_id
# res['CHURN'] = clf_forest.predict_proba(df_ord_test)[:, 1]

# res.to_csv('submit_final_forest.csv', index=False)
# res.head()

### CatBoost

In [27]:
import catboost
from catboost import CatBoostClassifier

In [28]:
def objective(trial):
    """Optuna objective: cross-validated ROC AUC of a CatBoost pipeline.

    Samples one CatBoost hyperparameter configuration from ``trial``, wraps
    the classifier in a ``StandardScaler`` pipeline and scores it on
    ``X_ord_train`` / ``y_ord_train`` (notebook globals).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Mean ``roc_auc`` over 5 cross-validation folds.
    """
    # Parameter name fixed from the misspelled 'iteratons', so that
    # study.best_params reports the key under the real CatBoost name.
    iterations = trial.suggest_int('iterations', 300, 500, step=10)
    rsm = trial.suggest_float('rsm', 0.4, 1, step=0.1)
    subsample = trial.suggest_float('subsample', 0.4, 1, step=0.1)
    l2 = trial.suggest_float('l2', 0.4, 9.2, step=0.4)
    depth = trial.suggest_int('depth', 4, 16, step=1)
    lr = trial.suggest_categorical('lr', [0.001, 0.005, 0.01, 0.02, 0.03, 0.05, 0.1])

    clf = make_pipeline(StandardScaler(),
                        CatBoostClassifier(iterations=iterations, depth=depth, rsm=rsm,
                                           subsample=subsample, custom_metric='AUC',
                                           auto_class_weights='Balanced', l2_leaf_reg=l2,
                                           learning_rate=lr, verbose=False))

    fold_scores = cross_val_score(clf, X_ord_train, y_ord_train, cv=5,
                                  scoring='roc_auc', n_jobs=5)
    return fold_scores.mean()

In [29]:
study_name = "cat_study"  # Unique identifier of the study.
storage_name = f"sqlite:///{study_name}.db"
# load_if_exists=True lets this cell be re-run: the SQLite file persists
# between kernel sessions, and without the flag a second run fails because a
# study with this name is already stored there.
study = optuna.create_study(study_name=study_name, storage=storage_name,
                            direction='maximize', load_if_exists=True)

[32m[I 2021-09-13 02:21:51,528][0m A new study created in RDB with name: cat_study[0m


In [None]:
# Run four batches of 10 trials each. After the first batch the study is
# re-opened from storage before every further batch.
print('Starting optuna!', flush=True)
study.optimize(objective, n_trials=10)

for _ in range(3):
    study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True)
    study.optimize(objective, n_trials=10)

print(study.best_params)

In [None]:
# Append the best hyperparameters found so far to a text log.
# The previous code called methods on an undefined name `write`; a context
# manager opens the file and guarantees it is closed again.
with open('best_results.txt', 'a', encoding='utf-8') as results_file:
    # One line per run, so appended results stay separated.
    results_file.write(str(study.best_params) + '\n')
print('Done!')

In [19]:
# Final CatBoost model with fixed hyperparameters, fitted on the whole
# ordinal-encoded training set.
cat_params = dict(
    iterations=470,
    depth=12,
    rsm=0.8,
    subsample=0.98,
    custom_metric='AUC',
    auto_class_weights='Balanced',
    l2_leaf_reg=8.4,
    learning_rate=0.02,
    verbose=False,
)
clf_cat_opt = CatBoostClassifier(**cat_params)

clf_cat_opt.fit(X_ord_train, y_ord_train)

<catboost.core.CatBoostClassifier at 0x7fe76047e850>

In [22]:
# Build the submission: one churn probability (positive class) per test user.
# The model used is `clf_cat_opt`, the classifier fitted in the cell above;
# the previous name `clf_cat_opt1` is not defined anywhere in this notebook.
res = pd.DataFrame()
res['user_id'] = df_test.user_id
res['CHURN'] = clf_cat_opt.predict_proba(df_ord_test)[:, 1]

res.to_csv('submit_cat.csv', index=False)
res.head()

Unnamed: 0,user_id,CHURN
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.008401
1,000055d41c8a62052dd426592e8a4a3342bf565d,0.232929
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.003443
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.743532
4,0000bae5480628cf8fe51ad84bcb39772fc79224,0.727977
