In [81]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [82]:
# Load the churn dataset from an Excel workbook into a DataFrame.
# NOTE(review): the path is absolute and looks like a Kaggle input mount, so
# this cell presumably only runs where that dataset is attached — confirm.
df=pd.read_excel("/kaggle/input/cust-churn-task/customer_churn_large_dataset.xlsx")

In [83]:
# Preview the first rows to sanity-check the load and see the raw columns.
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [84]:
# Count null cells per column (isna is the same check as isnull).
df.isna().sum()
# Result: every column reports 0, so no imputation is needed.

CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64

In [85]:
# Share of each target class (normalize=True returns fractions, not counts).
df['Churn'].value_counts(normalize=True)
# Result: the two classes are split almost exactly 50/50, so no rebalancing.

0    0.50221
1    0.49779
Name: Churn, dtype: float64

In [86]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()
#stats

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


In [87]:
# Check for outliers via the summary statistics of the numeric columns.
df.describe() 
# No obvious outliers: the min and max of each column look plausible.

Unnamed: 0,CustomerID,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,50000.5,44.02702,12.4901,65.053197,274.39365,0.49779
std,28867.657797,15.280283,6.926461,20.230696,130.463063,0.499998
min,1.0,18.0,1.0,30.0,50.0,0.0
25%,25000.75,31.0,6.0,47.54,161.0,0.0
50%,50000.5,44.0,12.0,65.01,274.0,0.0
75%,75000.25,57.0,19.0,82.64,387.0,1.0
max,100000.0,70.0,24.0,100.0,500.0,1.0


In [88]:
# CustomerID and Name are per-row identifiers, so they are removed before
# modelling.
# drop(..., errors='ignore') makes the cell safe to re-run: the original
# `del df[...]` raised KeyError on a second execution because the columns
# were already gone. Trade-off: a misspelled column name is silently ignored.
df = df.drop(columns=['CustomerID', 'Name'], errors='ignore')

In [89]:
# Confirm the identifier columns are gone.
df.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los Angeles,17,73.36,236,0
1,62,Female,New York,1,48.76,172,0
2,24,Female,Los Angeles,5,85.47,460,0
3,36,Female,Miami,3,97.94,297,1
4,46,Female,Miami,19,58.14,266,0


In [90]:
# Frequency of each subscription length (in months).
df['Subscription_Length_Months'].value_counts()

20    4303
22    4267
1     4247
16    4229
2     4228
14    4213
7     4211
11    4200
6     4184
5     4171
18    4171
12    4155
21    4154
13    4154
10    4151
3     4136
9     4134
15    4122
24    4113
8     4106
19    4106
4     4098
23    4083
17    4064
Name: Subscription_Length_Months, dtype: int64

In [91]:
# Frequency of each Location category.
df['Location'].value_counts()

Houston        20157
Los Angeles    20041
Miami          20031
Chicago        19958
New York       19813
Name: Location, dtype: int64

In [92]:
# Frequency of each Gender category.
df['Gender'].value_counts()

Female    50216
Male      49784
Name: Gender, dtype: int64

In [93]:
# Two independent label encoders; each one remembers only the classes of the
# last column it was fitted on.
le1, le2 = LabelEncoder(), LabelEncoder()

In [94]:
# Integer-encode the two categorical columns, one encoder per column.
# A LabelEncoder keeps only the classes_ from its most recent fit, so fitting
# le1 on Gender and then again on Location (as this cell used to do) threw
# away the Gender mapping: le1 could no longer transform or inverse-transform
# Gender values. The resulting integer codes in df are unchanged by this fix.
df['Gender'] = le1.fit_transform(df['Gender'])
df['Location'] = le2.fit_transform(df['Location'])

In [95]:
# Display the frame after encoding; Gender and Location are now integer codes.
df

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,1,2,17,73.36,236,0
1,62,0,4,1,48.76,172,0
2,24,0,2,5,85.47,460,0
3,36,0,3,3,97.94,297,1
4,46,0,3,19,58.14,266,0
...,...,...,...,...,...,...,...
99995,33,1,1,23,55.13,226,1
99996,62,0,4,19,61.65,351,0
99997,64,1,0,17,96.11,251,1
99998,51,0,4,20,49.25,434,1


In [96]:
# Summary statistics again, now including the encoded Gender and Location.
df.describe()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,44.02702,0.49784,1.99584,12.4901,65.053197,274.39365,0.49779
std,15.280283,0.499998,1.411638,6.926461,20.230696,130.463063,0.499998
min,18.0,0.0,0.0,1.0,30.0,50.0,0.0
25%,31.0,0.0,1.0,6.0,47.54,161.0,0.0
50%,44.0,0.0,2.0,12.0,65.01,274.0,0.0
75%,57.0,1.0,3.0,19.0,82.64,387.0,1.0
max,70.0,1.0,4.0,24.0,100.0,500.0,1.0


In [97]:
# Pairwise correlation between all columns, including the Churn target.
df.corr()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
Age,1.0,0.000832,-0.005889,0.003382,0.00111,0.001927,0.001559
Gender,0.000832,1.0,-0.003314,0.00032,0.002239,-0.001385,0.002121
Location,-0.005889,-0.003314,1.0,-0.001768,0.003716,0.002834,0.006405
Subscription_Length_Months,0.003382,0.00032,-0.001768,1.0,-0.005294,-0.002203,0.002328
Monthly_Bill,0.00111,0.002239,0.003716,-0.005294,1.0,0.003187,-0.000211
Total_Usage_GB,0.001927,-0.001385,0.002834,-0.002203,0.003187,1.0,-0.002842
Churn,0.001559,0.002121,0.006405,0.002328,-0.000211,-0.002842,1.0


In [98]:
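# No correlated features: every off-diagonal |r| in the table above is below 0.01, and that includes each feature's correlation with Churn.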

In [99]:
#splitting the dataset into train and validation
y_df = df.pop('Churn')
# x_train_df, x_valid_df,y_train_df,y_val_df = train_test_split(x_df,y_df,test_size =0.2,random_state=42,stratify =y_df)

In [100]:
# Inspect the integer-encoded Location column.
df['Location']

0        2
1        4
2        2
3        3
4        3
        ..
99995    1
99996    4
99997    0
99998    4
99999    2
Name: Location, Length: 100000, dtype: int64

In [101]:
# Feature frame after the target column has been removed.
df.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,1,2,17,73.36,236
1,62,0,4,1,48.76,172
2,24,0,2,5,85.47,460
3,36,0,3,3,97.94,297
4,46,0,3,19,58.14,266


In [102]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible. No stratify= is passed, so the class proportions
# of the two parts are left to chance.
x_train, x_cv, y_train, y_cv = train_test_split(df,y_df, test_size = 0.2, random_state=42)

In [103]:
# Training features before scaling (still a DataFrame with the original index).
x_train

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
75220,54,0,4,5,84.50,205
48955,28,1,4,24,82.06,239
44966,57,1,0,12,52.29,62
13568,19,1,1,19,32.57,173
92727,56,0,3,8,33.52,314
...,...,...,...,...,...,...
6265,35,1,3,21,67.33,235
54886,56,1,0,13,85.40,347
76820,69,1,1,2,76.24,321
860,55,1,0,12,89.19,315


In [104]:
# Standardiser with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [105]:
# Fit on the training split only, so validation statistics do not leak in.
# All six columns are passed, so the integer-coded Gender and Location
# columns are standardised along with the numeric features.
scaler.fit(x_train)

In [106]:
# Standardise the training features. transform() returns a NumPy array, so
# the column names are lost and columns are addressed by position from here on.
# NOTE(review): x_train is rebound to its own scaled version, so re-running
# this cell scales the data a second time.
x_train = scaler.transform(x_train)

In [107]:
# Sanity check: rows x feature columns of the scaled training array.
x_train.shape

(80000, 6)

In [108]:
def dfr(arr):
    """Wrap ``arr`` in a pandas DataFrame so it renders as a table.

    Args:
        arr: Any input accepted by the ``pd.DataFrame`` constructor
            (e.g. a 2-D NumPy array, a list of rows, or a DataFrame).

    Returns:
        The ``pd.DataFrame`` built from ``arr``.
    """
    frame = pd.DataFrame(arr)
    return frame

In [109]:
# View the scaled training array as a table; column labels are positional.
dfr(x_train)

Unnamed: 0,0,1,2,3,4,5
0,0.653447,-0.995535,1.417035,-1.082728,0.960255,-0.533774
1,-1.048276,1.004485,1.417035,1.663882,0.839624,-0.273257
2,0.849799,1.004485,-1.418063,-0.070819,-0.632174,-1.629476
3,-1.637334,1.004485,-0.709288,0.941090,-1.607109,-0.778966
4,0.784348,-0.995535,0.708261,-0.649053,-1.560142,0.301412
...,...,...,...,...,...,...
79995,-0.590120,1.004485,0.708261,1.230207,0.111388,-0.303906
79996,0.784348,1.004485,-1.418063,0.073739,1.004750,0.554266
79997,1.635210,1.004485,-0.709288,-1.516404,0.551889,0.355047
79998,0.718898,1.004485,-1.418063,-0.070819,1.192123,0.309074


In [110]:
# Columns 1 (Gender) and 2 (Location) hold integer category codes that the
# scaler standardised together with the numeric features. Undo the scaling
# for just these two columns using the fitted scaler's own statistics
# (x = z * scale_ + mean_); np.rint removes floating-point residue so the
# values are exactly the original codes again.
#
# This replaces fitting a LabelEncoder on the scaled floats. That approach
# derived the codes from whichever distinct values happened to be present in
# the array, and it refitted le2 twice in a row so only the second fit
# survived. Inverting the scaler depends on nothing but the scaler itself.
#
# Must run exactly once after the scaling step: applying the inverse a
# second time would shift the already-restored codes.
cat_cols = [1, 2]
x_train[:, cat_cols] = np.rint(
    x_train[:, cat_cols] * scaler.scale_[cat_cols] + scaler.mean_[cat_cols]
)

In [111]:
# Columns 1 and 2 are whole-number category codes again; the rest stay scaled.
dfr(x_train)

Unnamed: 0,0,1,2,3,4,5
0,0.653447,0.0,4.0,-1.082728,0.960255,-0.533774
1,-1.048276,1.0,4.0,1.663882,0.839624,-0.273257
2,0.849799,1.0,0.0,-0.070819,-0.632174,-1.629476
3,-1.637334,1.0,1.0,0.941090,-1.607109,-0.778966
4,0.784348,0.0,3.0,-0.649053,-1.560142,0.301412
...,...,...,...,...,...,...
79995,-0.590120,1.0,3.0,1.230207,0.111388,-0.303906
79996,0.784348,1.0,0.0,0.073739,1.004750,0.554266
79997,1.635210,1.0,1.0,-1.516404,0.551889,0.355047
79998,0.718898,1.0,0.0,-0.070819,1.192123,0.309074


In [112]:
# Validation features before scaling (x_cv is still a DataFrame at this point).
dfr(x_cv)

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
75721,48,0,1,11,88.48,492
80184,49,1,4,13,40.61,423
19864,31,0,2,5,33.01,276
76699,53,1,4,4,94.66,339
92991,23,0,2,24,82.21,304
...,...,...,...,...,...,...
32595,38,1,3,20,79.70,118
29313,53,1,2,12,96.75,363
37862,68,1,3,13,39.33,137
53421,34,1,3,13,95.14,498


In [113]:
# Apply the scaler that was fitted on the training split, then restore the
# Gender (column 1) and Location (column 2) category codes by inverting the
# scaling for those two columns (x = z * scale_ + mean_, rounded to remove
# floating-point residue).
#
# The previous version called LabelEncoder.fit_transform on the validation
# data. That refits the encoder on the validation split, so the codes depend
# on which categories happen to appear there; a split missing one category
# would be numbered differently from the training data. Inverting the scaler
# gives the same codes for any subset of rows, including a single row.
#
# Must run exactly once: x_cv is rebound to its scaled version here.
x_cv = scaler.transform(x_cv)
cat_cols = [1, 2]
x_cv[:, cat_cols] = np.rint(
    x_cv[:, cat_cols] * scaler.scale_[cat_cols] + scaler.mean_[cat_cols]
)

In [114]:
# Validation features after scaling, with columns 1 and 2 back as codes.
dfr(x_cv)

Unnamed: 0,0,1,2,3,4,5
0,0.260742,0.0,1.0,-0.215378,1.157022,1.665292
1,0.326192,1.0,4.0,0.073739,-1.209620,1.136597
2,-0.851923,0.0,2.0,-1.082728,-1.585356,0.010246
3,0.587996,1.0,4.0,-1.227287,1.462554,0.492968
4,-1.375530,0.0,2.0,1.663882,0.847039,0.224789
...,...,...,...,...,...,...
19995,-0.393767,1.0,3.0,1.085648,0.722948,-1.200390
19996,0.587996,1.0,2.0,-0.070819,1.565882,0.676862
19997,1.569759,1.0,3.0,0.073739,-1.272902,-1.054807
19998,-0.655571,1.0,3.0,0.073739,1.486285,1.711266


In [115]:
# Gradient-boosted tree classifier: 500 boosting rounds, fixed seed for
# reproducibility, all other hyperparameters left at library defaults.
xgb = XGBClassifier(n_estimators=500,random_state=42)

In [116]:
# Train on the prepared training features and labels.
xgb.fit(x_train,y_train)

In [117]:
# Re-display the validation features that are about to be scored.
dfr(x_cv)

Unnamed: 0,0,1,2,3,4,5
0,0.260742,0.0,1.0,-0.215378,1.157022,1.665292
1,0.326192,1.0,4.0,0.073739,-1.209620,1.136597
2,-0.851923,0.0,2.0,-1.082728,-1.585356,0.010246
3,0.587996,1.0,4.0,-1.227287,1.462554,0.492968
4,-1.375530,0.0,2.0,1.663882,0.847039,0.224789
...,...,...,...,...,...,...
19995,-0.393767,1.0,3.0,1.085648,0.722948,-1.200390
19996,0.587996,1.0,2.0,-0.070819,1.565882,0.676862
19997,1.569759,1.0,3.0,0.073739,-1.272902,-1.054807
19998,-0.655571,1.0,3.0,0.073739,1.486285,1.711266


In [118]:
# Hard class predictions for the training split and the held-out split,
# computed in that order.
y_pred_train, y_pred_cv = xgb.predict(x_train), xgb.predict(x_cv)

In [119]:
# Spot-check a single prediction: predict() returns class labels.
y_pred_train[0]

1

In [120]:
# Accuracy on the data the model was trained on. This is optimistic by
# construction; it only means something next to the validation score.
accuracy_score(y_train, y_pred_train)

0.8074

In [121]:
# Accuracy on the held-out split. Compare it with ~0.50, the share of the
# larger class reported by value_counts earlier: a score near that level
# means the model has learned nothing that generalises, and a large gap to
# the training accuracy indicates overfitting.
accuracy_score(y_cv, y_pred_cv)

0.4993

In [122]:
# Serialise the trained model to model.json in the current working directory.
xgb.save_model('model.json')

In [123]:
# Shell escape: list installed package versions for this environment.
# The output is long; consider clearing it before sharing the notebook.
!pip list

Package                                  Version         Editable project location
---------------------------------------- --------------- -------------------------
absl-py                                  1.4.0
accelerate                               0.20.3
access                                   1.1.9
affine                                   2.4.0
aiobotocore                              2.5.2
aiofiles                                 22.1.0
aiohttp                                  3.8.4
aiohttp-cors                             0.7.0
aioitertools                             0.11.0
aiorwlock                                1.3.0
aiosignal                                1.3.1
aiosqlite                                0.19.0
albumentations                           1.3.1
alembic                                  1.11.1
altair                                   5.0.1
annoy                                    1.17.3
ansiwrap                                 0.8.4
anyio                        

In [124]:
import pickle

In [125]:
# Persist the fitted scaler to scaler.pkl. The context manager closes the
# file as soon as the dump finishes; the previous bare open() left the handle
# open until garbage collection, which can leave the file incompletely flushed.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [126]:
# Persist both label encoders, one file each. Each file is opened in a
# context manager so its handle is closed as soon as the dump finishes; the
# previous bare open() calls never closed their handles.
for encoder, filename in ((le1, 'le1.pkl'), (le2, 'le2.pkl')):
    with open(filename, 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)