# Import Library & Datasets

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
 
import lightgbm as lgb
from sklearn.metrics import log_loss 
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder,RobustScaler
import category_encoders as ce

In [28]:
train=pd.read_csv('Train.csv')
to_drop=['user_id','MRG']
train=train.drop(to_drop,axis=1)

In [29]:
test=pd.read_csv('Test.csv')
to_drop=['user_id','MRG']
test=test.drop(to_drop,axis=1)

In [30]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   REGION          242480 non-null  object 
 1   TENURE          400000 non-null  object 
 2   MONTANT         259723 non-null  float64
 3   FREQUENCE_RECH  259723 non-null  float64
 4   REVENUE         265337 non-null  float64
 5   ARPU_SEGMENT    265337 non-null  float64
 6   FREQUENCE       265337 non-null  float64
 7   DATA_VOLUME     203146 non-null  float64
 8   ON_NET          254181 non-null  float64
 9   ORANGE          233683 non-null  float64
 10  TIGO            160614 non-null  float64
 11  ZONE1           31690 non-null   float64
 12  ZONE2           25513 non-null   float64
 13  REGULARITY      400000 non-null  int64  
 14  TOP_PACK        232671 non-null  object 
 15  FREQ_TOP_PACK   232671 non-null  float64
 16  CHURN           400000 non-null  int64  
dtypes: float64

In [31]:
train.isnull().sum()

REGION            157520
TENURE                 0
MONTANT           140277
FREQUENCE_RECH    140277
REVENUE           134663
ARPU_SEGMENT      134663
FREQUENCE         134663
DATA_VOLUME       196854
ON_NET            145819
ORANGE            166317
TIGO              239386
ZONE1             368310
ZONE2             374487
REGULARITY             0
TOP_PACK          167329
FREQ_TOP_PACK     167329
CHURN                  0
dtype: int64

lag_features=['REVENUE','ARPU_SEGMENT']

for feat in lag_features:
    for i in range(3):
        train[feat+'_lag'+str(i+1)] = train[feat].shift(i+1)
        test[feat+'_lag'+str(i+1)] = test[feat].shift(i+1)

In [32]:
train['TOP_PACK']=train['TOP_PACK'].fillna('None')
train['REGION']=train['REGION'].fillna('None')

test['TOP_PACK']=test['TOP_PACK'].fillna('None')
test['REGION']=test['REGION'].fillna('None')

In [34]:
for i in ['MONTANT','FREQUENCE_RECH','REVENUE',
          'ARPU_SEGMENT','FREQUENCE','DATA_VOLUME',
         'ON_NET','ORANGE','TIGO','ZONE1','ZONE2','FREQ_TOP_PACK']:
    train[i]=train[i].fillna(-9999)
    test[i]=test[i].fillna(-9999)

In [35]:
le=LabelEncoder()
cat_cols = train.select_dtypes(include='object').columns
cat_cols

catt_cols = test.select_dtypes(include='object').columns
catt_cols

Index(['REGION', 'TENURE', 'TOP_PACK'], dtype='object')

In [36]:
for i in cat_cols:
    train[i]=le.fit_transform(train[i])
for i in catt_cols:
    test[i]=le.fit_transform(test[i])

In [37]:
train.isnull().sum()

REGION            0
TENURE            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
REGULARITY        0
TOP_PACK          0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64

In [38]:
train.head()

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,10,7,17000.0,32.0,18000.0,6000.0,34.0,0.0,97.0,355.0,6.0,0.0,0.0,62,15,35.0,0
1,9,7,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,0.0,2.0,40,25,22.0,0
2,12,7,1500.0,3.0,1500.0,500.0,3.0,0.0,30.0,30.0,0.0,0.0,0.0,32,15,3.0,0
3,2,7,1500.0,3.0,2497.0,832.0,4.0,0.0,159.0,45.0,19.0,0.0,0.0,18,85,3.0,0
4,2,7,0.0,0.0,498.0,166.0,3.0,1.0,1.0,3.0,0.0,0.0,0.0,50,83,0.0,0


In [39]:
x=np.array(train.drop(['CHURN'],1))
y=np.array(train['CHURN'])

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=1)

In [41]:
#Feature Scaling
rs=RobustScaler()
x_train=rs.fit_transform(x_train)
x_test=rs.fit_transform(x_test)

In [42]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((300000, 16), (300000,), (100000, 16), (100000,))

In [43]:
Xtest=test

In [45]:
errlgb = []
y_pred_totlgb = []

fold = StratifiedKFold(n_splits=10, shuffle=False, random_state=42)

for train_index,test_index in fold.split(x,y):
    x_train,x_test=x[train_index], x[test_index]
    y_train,y_test=y[train_index], y[test_index]
    
    estimator=lgb.LGBMClassifier(learning_rate=0.150, metric='l1', 
                                early_stopping_rounds =200, 
                                eval_metric ='binary_logloss',max_depth = -1,
                                n_estimators =100)
    
    estimator.fit(x_train,y_train, eval_set=[(x_train,y_train), (x_test,y_test)])
    
    y_pred = estimator.predict_proba(x_test)[:,1] 

    print("logloss: ",log_loss(y_test, y_pred))
    
    errlgb.append(log_loss(y_test, y_pred))
    
    preds=estimator.predict_proba(Xtest)[:,1]
    y_pred_totlgb.append(preds)

[1]	training's l1: 0.283379	valid_1's l1: 0.283106
Training until validation scores don't improve for 200 rounds
[2]	training's l1: 0.266186	valid_1's l1: 0.265722
[3]	training's l1: 0.251641	valid_1's l1: 0.251047
[4]	training's l1: 0.239246	valid_1's l1: 0.23852
[5]	training's l1: 0.228646	valid_1's l1: 0.227792
[6]	training's l1: 0.219581	valid_1's l1: 0.218633
[7]	training's l1: 0.211817	valid_1's l1: 0.210788
[8]	training's l1: 0.205152	valid_1's l1: 0.204073
[9]	training's l1: 0.199437	valid_1's l1: 0.198301
[10]	training's l1: 0.194517	valid_1's l1: 0.193326
[11]	training's l1: 0.190305	valid_1's l1: 0.189078
[12]	training's l1: 0.186692	valid_1's l1: 0.185433
[13]	training's l1: 0.183565	valid_1's l1: 0.182284
[14]	training's l1: 0.180917	valid_1's l1: 0.179631
[15]	training's l1: 0.178634	valid_1's l1: 0.177341
[16]	training's l1: 0.176716	valid_1's l1: 0.17543
[17]	training's l1: 0.175019	valid_1's l1: 0.17373
[18]	training's l1: 0.173572	valid_1's l1: 0.172278
[19]	training'

[54]	training's l1: 0.163835	valid_1's l1: 0.165046
[55]	training's l1: 0.163685	valid_1's l1: 0.164917
[56]	training's l1: 0.16372	valid_1's l1: 0.164914
[57]	training's l1: 0.163598	valid_1's l1: 0.164875
[58]	training's l1: 0.163625	valid_1's l1: 0.164945
[59]	training's l1: 0.163744	valid_1's l1: 0.165078
[60]	training's l1: 0.163724	valid_1's l1: 0.165012
[61]	training's l1: 0.163561	valid_1's l1: 0.164931
[62]	training's l1: 0.163926	valid_1's l1: 0.165249
[63]	training's l1: 0.163707	valid_1's l1: 0.165085
[64]	training's l1: 0.163595	valid_1's l1: 0.165028
[65]	training's l1: 0.163541	valid_1's l1: 0.165007
[66]	training's l1: 0.163504	valid_1's l1: 0.164991
[67]	training's l1: 0.16377	valid_1's l1: 0.16554
[68]	training's l1: 0.163469	valid_1's l1: 0.165039
[69]	training's l1: 0.163512	valid_1's l1: 0.165114
[70]	training's l1: 0.163475	valid_1's l1: 0.165105
[71]	training's l1: 0.163452	valid_1's l1: 0.165045
[72]	training's l1: 0.163362	valid_1's l1: 0.164947
[73]	training's

[6]	training's l1: 0.219483	valid_1's l1: 0.218983
[7]	training's l1: 0.211711	valid_1's l1: 0.211174
[8]	training's l1: 0.205043	valid_1's l1: 0.204465
[9]	training's l1: 0.199317	valid_1's l1: 0.198717
[10]	training's l1: 0.1944	valid_1's l1: 0.193801
[11]	training's l1: 0.190198	valid_1's l1: 0.189601
[12]	training's l1: 0.18657	valid_1's l1: 0.18597
[13]	training's l1: 0.18347	valid_1's l1: 0.182879
[14]	training's l1: 0.18081	valid_1's l1: 0.180209
[15]	training's l1: 0.178519	valid_1's l1: 0.177924
[16]	training's l1: 0.176543	valid_1's l1: 0.175955
[17]	training's l1: 0.174837	valid_1's l1: 0.174262
[18]	training's l1: 0.173381	valid_1's l1: 0.17282
[19]	training's l1: 0.172102	valid_1's l1: 0.171552
[20]	training's l1: 0.171034	valid_1's l1: 0.1705
[21]	training's l1: 0.170091	valid_1's l1: 0.169568
[22]	training's l1: 0.169302	valid_1's l1: 0.168797
[23]	training's l1: 0.168634	valid_1's l1: 0.168143
[24]	training's l1: 0.168056	valid_1's l1: 0.167572
[25]	training's l1: 0.167

[61]	training's l1: 0.16376	valid_1's l1: 0.164472
[62]	training's l1: 0.163831	valid_1's l1: 0.164542
[63]	training's l1: 0.163746	valid_1's l1: 0.164479
[64]	training's l1: 0.163734	valid_1's l1: 0.164453
[65]	training's l1: 0.163686	valid_1's l1: 0.164422
[66]	training's l1: 0.163605	valid_1's l1: 0.164355
[67]	training's l1: 0.163568	valid_1's l1: 0.164338
[68]	training's l1: 0.163518	valid_1's l1: 0.164301
[69]	training's l1: 0.163485	valid_1's l1: 0.164292
[70]	training's l1: 0.16351	valid_1's l1: 0.164395
[71]	training's l1: 0.163658	valid_1's l1: 0.164506
[72]	training's l1: 0.163438	valid_1's l1: 0.16429
[73]	training's l1: 0.163486	valid_1's l1: 0.164463
[74]	training's l1: 0.163384	valid_1's l1: 0.164264
[75]	training's l1: 0.163366	valid_1's l1: 0.164239
[76]	training's l1: 0.163332	valid_1's l1: 0.164216
[77]	training's l1: 0.163285	valid_1's l1: 0.164199
[78]	training's l1: 0.163388	valid_1's l1: 0.164266
[79]	training's l1: 0.163353	valid_1's l1: 0.164463
[80]	training's

[13]	training's l1: 0.183284	valid_1's l1: 0.183799
[14]	training's l1: 0.180614	valid_1's l1: 0.181162
[15]	training's l1: 0.178329	valid_1's l1: 0.178894
[16]	training's l1: 0.176346	valid_1's l1: 0.176927
[17]	training's l1: 0.174625	valid_1's l1: 0.175231
[18]	training's l1: 0.173175	valid_1's l1: 0.173796
[19]	training's l1: 0.171893	valid_1's l1: 0.172525
[20]	training's l1: 0.17082	valid_1's l1: 0.171468
[21]	training's l1: 0.169912	valid_1's l1: 0.170581
[22]	training's l1: 0.169058	valid_1's l1: 0.169742
[23]	training's l1: 0.168396	valid_1's l1: 0.169093
[24]	training's l1: 0.167786	valid_1's l1: 0.168506
[25]	training's l1: 0.167303	valid_1's l1: 0.168032
[26]	training's l1: 0.166886	valid_1's l1: 0.167634
[27]	training's l1: 0.166477	valid_1's l1: 0.167235
[28]	training's l1: 0.166113	valid_1's l1: 0.166898
[29]	training's l1: 0.165898	valid_1's l1: 0.166699
[30]	training's l1: 0.16564	valid_1's l1: 0.166459
[31]	training's l1: 0.165412	valid_1's l1: 0.166259
[32]	training'

[69]	training's l1: 0.163505	valid_1's l1: 0.165704
[70]	training's l1: 0.163418	valid_1's l1: 0.165646
[71]	training's l1: 0.163654	valid_1's l1: 0.166034
[72]	training's l1: 0.16395	valid_1's l1: 0.166489
[73]	training's l1: 0.163521	valid_1's l1: 0.165924
[74]	training's l1: 0.163811	valid_1's l1: 0.166163
[75]	training's l1: 0.163549	valid_1's l1: 0.16602
[76]	training's l1: 0.163475	valid_1's l1: 0.16602
[77]	training's l1: 0.163476	valid_1's l1: 0.166044
[78]	training's l1: 0.163379	valid_1's l1: 0.165931
[79]	training's l1: 0.163534	valid_1's l1: 0.166063
[80]	training's l1: 0.163353	valid_1's l1: 0.165941
[81]	training's l1: 0.163407	valid_1's l1: 0.165995
[82]	training's l1: 0.163301	valid_1's l1: 0.165926
[83]	training's l1: 0.163614	valid_1's l1: 0.166078
[84]	training's l1: 0.163243	valid_1's l1: 0.165877
[85]	training's l1: 0.163266	valid_1's l1: 0.165915
[86]	training's l1: 0.163201	valid_1's l1: 0.165873
[87]	training's l1: 0.163588	valid_1's l1: 0.166258
[88]	training's

[21]	training's l1: 0.170049	valid_1's l1: 0.169582
[22]	training's l1: 0.169213	valid_1's l1: 0.168765
[23]	training's l1: 0.16853	valid_1's l1: 0.168093
[24]	training's l1: 0.167934	valid_1's l1: 0.167523
[25]	training's l1: 0.167433	valid_1's l1: 0.16704
[26]	training's l1: 0.166979	valid_1's l1: 0.166598
[27]	training's l1: 0.166625	valid_1's l1: 0.166252
[28]	training's l1: 0.16631	valid_1's l1: 0.16595
[29]	training's l1: 0.166037	valid_1's l1: 0.165688
[30]	training's l1: 0.165828	valid_1's l1: 0.165489
[31]	training's l1: 0.1656	valid_1's l1: 0.165291
[32]	training's l1: 0.165413	valid_1's l1: 0.165118
[33]	training's l1: 0.165228	valid_1's l1: 0.164939
[34]	training's l1: 0.165089	valid_1's l1: 0.164828
[35]	training's l1: 0.165021	valid_1's l1: 0.164763
[36]	training's l1: 0.164898	valid_1's l1: 0.164657
[37]	training's l1: 0.164815	valid_1's l1: 0.16459
[38]	training's l1: 0.164711	valid_1's l1: 0.164494
[39]	training's l1: 0.164632	valid_1's l1: 0.164433
[40]	training's l1:

In [46]:
y_pred=np.mean(y_pred_totlgb,0)
y_pred

array([0.77955824, 0.67123499, 0.10704709, ..., 0.03661029, 0.00782681,
       0.04354306])

In [47]:
sub=pd.read_csv('sample_submission.csv')

In [48]:
sub.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0
1,5335efd940280b82143272275637d1e65d37eadb,0
2,a581f4fa08677c26f83f643248c667e241043086,0
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0


In [49]:
sub['CHURN'] = y_pred
sub.head()

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.7795582
1,5335efd940280b82143272275637d1e65d37eadb,0.671235
2,a581f4fa08677c26f83f643248c667e241043086,0.1070471
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.001003956
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,8.22638e-07


In [50]:
sub.to_csv('SUBBBB5.csv', index=False)

In [51]:
loglosss=np.mean(errlgb,0)
loglosss

0.2657105763519974