In [1]:
import numpy as np
import pandas as pd
import math

In [36]:
print('Loading data...', end='')
df_train = pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')
print('Done!')

Loading data...Done!


In [37]:
print('Preprocessing data...', flush=True)

def prep(df_enc, df_enc_test):
    # Ordinal encoding
    n = 0
    for i in np.sort(df_enc.tenure.unique()):
        df_enc.loc[df_enc['tenure'] == i, 'tenure'] = n
        df_enc_test.loc[df_enc_test['tenure'] == i, 'tenure'] = n
        n += 1

    # Drop cols
    df_enc.drop(columns=['mrg', 'user_id'], inplace=True) # Changed row
    df_enc_test.drop(columns=['mrg', 'user_id'], inplace=True) # Changed row

    # Fillna
    cols = ['montant', 'frequence_rech', 'revenue', 'arpu_segment', 
            'frequence', 'data_volume', 'on_net', 'orange', 'tigo', 
            'regularity', 'freq_top_pack']

    for i in cols:
        df_enc[i].fillna(df_enc[i].mean(), inplace=True)
        df_enc_test[i].fillna(df_enc_test[i].mean(), inplace=True)

    df_enc['zone1'].fillna(-100, inplace=True)
    df_enc['zone2'].fillna(-100, inplace=True)
    df_enc_test['zone1'].fillna(-100, inplace=True)
    df_enc_test['zone2'].fillna(-100, inplace=True)

    # Creating X, y train
    y_train = df_enc.churn
    X_train = df_enc.drop(columns='churn')
    
    return X_train, y_train, df_enc_test

Preprocessing data...


In [38]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import optuna

In [39]:
df_train.loc[:, 'TOP_PACK'].fillna(-1, inplace=True)
df_train.loc[df_train.TOP_PACK != -1, 'TOP_PACK'] = 1

df_test.loc[:, 'TOP_PACK'].fillna(-1, inplace=True)
df_test.loc[df_test.TOP_PACK != -1, 'TOP_PACK'] = 1

In [40]:
# Lower
df_train.columns = map(str.lower, df_train.columns)
df_test.columns = map(str.lower, df_test.columns)


# One hot
df_enc = pd.get_dummies(df_train, columns=['region'])
df_enc_test = pd.get_dummies(df_test, columns=['region'])


# Applying preprocessing function
X_train, y_train, df_enc_test = prep(df_enc, df_enc_test)


# Applying logreg
clf = make_pipeline(StandardScaler(), 
                    LogisticRegression(C=0.001, 
                                       penalty='elasticnet', 
                                       solver='saga', 
                                       l1_ratio=0.0))

print('\tFitting logreg...', end='', flush=True)
clf.fit(X_train, y_train)
print('Done!')


# Forming ordinal encoding
coefs = np.abs(clf.steps[1][1].coef_[0][14:]).reshape(15, 1)
regions = np.array(X_train.columns[14:]).reshape(15, 1)

reg_ordinal = np.concatenate((coefs, regions), axis=1)
reg_ordinal = reg_ordinal[reg_ordinal[:, 0].argsort()]

for i in range(reg_ordinal.shape[0]):
    reg_ordinal[i, 0] = i
    

# Creating final df
df_ord = df_train.copy(deep=True)
df_ord_test = df_test.copy(deep=True)

for i, j in reg_ordinal:
    index = j.find('_') + 1
    df_ord.loc[df_ord['region'] == j[index:], 'region'] = i
    df_ord_test.loc[df_ord_test['region'] == j[index:], 'region'] = i
    
df_ord['region'].fillna(-1, inplace=True)
df_ord_test['region'].fillna(-1, inplace=True)


X_ord_train, y_ord_train, df_ord_test = prep(df_ord, df_ord_test)
print('Done!')

	Fitting logreg...Done!
Done!


In [27]:
df_ord.head()

Unnamed: 0,region,tenure,montant,frequence_rech,revenue,arpu_segment,frequence,data_volume,on_net,orange,tigo,zone1,zone2,regularity,top_pack,freq_top_pack,churn
0,5,7,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,1,8.0,0
1,-1,5,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,95.418711,23.109253,-100.0,-100.0,4,-1,9.272461,1
2,-1,7,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,46.0,7.0,-100.0,-100.0,17,1,1.0,0
3,14,7,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,-100.0,-100.0,62,1,11.0,0
4,14,7,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,24.0,23.109253,-100.0,-100.0,11,1,2.0,0


In [None]:
df_train.head()

In [122]:
df_ord.head()

Unnamed: 0,region,tenure,montant,frequence_rech,revenue,arpu_segment,frequence,data_volume,on_net,orange,tigo,zone1,zone2,regularity,freq_top_pack,churn
0,4,7,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,8.0,0
1,-1,5,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,95.418711,23.109253,-100.0,-100.0,4,9.272461,1
2,-1,7,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,46.0,7.0,-100.0,-100.0,17,1.0,0
3,13,7,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,-100.0,-100.0,62,11.0,0
4,13,7,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,24.0,23.109253,-100.0,-100.0,11,2.0,0


In [28]:
(df_train.loc[df_train.top_pack.isna(), 'churn'] == 1).value_counts(), df_train.churn.value_counts()

(Series([], Name: churn, dtype: int64),
 0    1750062
 1     403986
 Name: churn, dtype: int64)

In [142]:
df_train.head()

Unnamed: 0,user_id,region,tenure,montant,frequence_rech,revenue,arpu_segment,frequence,data_volume,on_net,orange,tigo,zone1,zone2,mrg,regularity,top_pack,freq_top_pack,churn
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,1,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,-1,I 18-21 month,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,NO,4,-1,-1.0,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-1,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,-1.0,90.0,46.0,7.0,-1.0,-1.0,NO,17,1,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,-1.0,-1.0,NO,62,1,11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,-1.0,39.0,24.0,-1.0,-1.0,-1.0,NO,11,1,2.0,0


In [41]:
import catboost
from catboost import CatBoostClassifier

In [42]:
clf_cat_opt = CatBoostClassifier(iterations=470, depth=12, rsm=0.8, subsample=0.98, custom_metric='AUC',
                             auto_class_weights='Balanced', l2_leaf_reg=8.4, learning_rate=0.02, verbose=False)

clf_cat_opt.fit(X_ord_train, y_ord_train)

<catboost.core.CatBoostClassifier at 0x7f873dc39790>

In [43]:
res = pd.DataFrame()
res['user_id'] = df_test.user_id
res['CHURN'] = clf_cat_opt.predict_proba(df_ord_test)[:, 1]

res.to_csv('submit_reord_cat.csv', index=False)
res.head()

Unnamed: 0,user_id,CHURN
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.006658
1,000055d41c8a62052dd426592e8a4a3342bf565d,0.245767
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.002501
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.75394
4,0000bae5480628cf8fe51ad84bcb39772fc79224,0.744348


In [8]:
data = pd.read_csv('untitled.txt')
data.drop('Unnamed: 1', axis=1, inplace=True)
data

Unnamed: 0,Variable Definitions,Unnamed: 2
0,,English
1,,The churn dataset includes 19 variables includ...
2,user_id,
3,REGION,the location of each client
4,TENURE,duration in the network
5,MONTANT,top-up amount
6,FREQUENCE_RECH,Â number of times the customer refilled
7,REVENUE,monthly income of each client
8,ARPU_SEGMENT,income over 90 days / 3
9,FREQUENCE,number of times the client has made an income
