In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import VarianceThreshold


In [2]:
# Read the modelling dataset from the notebook's working directory.
model_data = pd.read_csv(filepath_or_buffer='model_data_m1.csv')

In [6]:
# Remove the listed columns from model_data, mutating the frame in place.
model_data.drop(
    labels=[
        'AccountID',
        'Marital_Status',
        'CC_Agent_Score',
        'Payment',
        'Login_device',
        'Gender',
        'coupon_used_for_payment',
        'CC_Contacted_LY',
        'rev_growth_yoy',
        'rev_per_month',
    ],
    axis=1,
    inplace=True,
)

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Fit an ordinal encoder on account_segment and overwrite the column with its codes.
oe_segment = OrdinalEncoder()
encoded_segment = oe_segment.fit_transform(model_data[['account_segment']])
model_data[['account_segment']] = encoded_segment


In [9]:
# Show the current contents of model_data.
model_data

Unnamed: 0,Churn,Tenure,City_Tier,Service_Score,Account_user_count,account_segment,Complain_ly,Day_Since_CC_connect,cashback
0,1,4.0,3.0,3.0,3.0,4.0,1.0,5.0,160.00
1,1,0.0,1.0,3.0,4.0,3.0,1.0,0.0,121.00
2,1,0.0,1.0,2.0,4.0,3.0,1.0,3.0,196.24
3,1,0.0,3.0,2.0,4.0,4.0,0.0,3.0,134.00
4,1,0.0,1.0,2.0,3.0,3.0,0.0,3.0,130.00
...,...,...,...,...,...,...,...,...,...
11255,0,10.0,1.0,3.0,2.0,4.0,0.0,4.0,154.00
11256,0,13.0,1.0,3.0,5.0,0.0,0.0,8.0,227.00
11257,0,1.0,1.0,3.0,2.0,4.0,1.0,4.0,191.00
11258,0,23.0,3.0,4.0,5.0,4.0,0.0,9.0,180.00


In [3]:
# Split model_data into the target series (Churn) and the remaining predictor columns.
Y = model_data['Churn']
X = model_data.drop(labels=['Churn'], axis=1)

In [10]:
# Seeded 80/20 split of the whole frame (target column included).
train_data, test_data = train_test_split(
    model_data,
    random_state=42,
    test_size=0.2,
)


In [13]:
# Persist both splits (without the index) to the deployment folder.
# The directory is written as a raw string: "\M" and "\d" are not valid escape
# sequences, so the non-raw literal triggers a SyntaxWarning on recent Python
# versions. The resulting path text is unchanged.
deploy_dir = r'C:\ML Project\deploy-sagemaker'
train_data.to_csv(deploy_dir + '/train_data.csv', index=False)
test_data.to_csv(deploy_dir + '/test_data.csv', index=False)

In [20]:
# Seeded 80/20 split of predictors and target into train/test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Remove the AccountID column from both splits in place.
# NOTE(review): this requires X to still contain AccountID, i.e. X must have been
# built before the earlier in-place drop on model_data — confirm the run order.
for split in (X_train, X_test):
    split.drop(labels=['AccountID'], axis=1, inplace=True)

In [22]:
# Show the current contents of the training predictors.
X_train

Unnamed: 0,Tenure,City_Tier,CC_Contacted_LY,Payment,Gender,Service_Score,Account_user_count,account_segment,CC_Agent_Score,Marital_Status,rev_per_month,Complain_ly,rev_growth_yoy,coupon_used_for_payment,Day_Since_CC_connect,cashback,Login_device
5333,10.0,1.0,12.0,Debit Card,Female,3.0,4.0,HNI,5.0,Married,,0.0,13.0,2,3.0,205.0,Mobile
11194,16.0,1.0,37.0,Debit Card,Male,3.0,5.0,Super,3.0,Single,,0.0,13.0,4,8.0,171.0,Computer
9834,4.0,1.0,14.0,Credit Card,Female,3.0,4.0,Super,1.0,Single,6.0,0.0,25.0,2,8.0,166.0,Computer
6466,8.0,1.0,15.0,Debit Card,Male,2.0,3.0,Super,3.0,Single,2.0,1.0,11.0,1,1.0,148.0,Computer
7917,9.0,1.0,21.0,Cash on Delivery,Female,3.0,3.0,HNI,3.0,Married,6.0,0.0,14.0,1,7.0,172.0,Mobile
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,1.0,28.0,Credit Card,Male,2.0,4.0,Regular Plus,2.0,Single,6.0,1.0,14.0,0,1.0,125.0,Mobile
5191,10.0,3.0,16.0,E wallet,Male,3.0,4.0,HNI,1.0,Married,6.0,0.0,12.0,5,4.0,226.0,Computer
5390,1.0,1.0,36.0,Credit Card,Male,4.0,4.0,Regular Plus,5.0,Married,3.0,0.0,14.0,2,3.0,164.0,Mobile
860,0.0,1.0,28.0,Credit Card,Male,3.0,3.0,Super,3.0,Divorced,2.0,1.0,15.0,3,7.0,162.0,Mobile


In [23]:
from sklearn.preprocessing import LabelEncoder

# Learn the Marital_Status label codes on the training split, then reuse the
# fitted encoder for the test split.
le_marital = LabelEncoder()
marital_train_codes = le_marital.fit_transform(X_train['Marital_Status'])
marital_test_codes = le_marital.transform(X_test['Marital_Status'])
X_train['Marital_Status'] = marital_train_codes
X_test['Marital_Status'] = marital_test_codes


In [24]:

from sklearn.preprocessing import LabelEncoder

# Encode Payment with its own encoder, fitted on the training split and reused
# on the test split. Rebinding le_marital here would discard the encoder fitted
# on Marital_Status, leaving no way to inverse-transform or re-apply that
# mapping later.
le_payment = LabelEncoder()
X_train['Payment'] = le_payment.fit_transform(X_train['Payment'])
X_test['Payment'] = le_payment.transform(X_test['Payment'])


In [25]:
from sklearn.preprocessing import OrdinalEncoder

# Fit the ordinal encoder on the training account_segment values and apply the
# same fitted mapping to the test split.
oe_segment = OrdinalEncoder()
segment_train_codes = oe_segment.fit_transform(X_train[['account_segment']])
segment_test_codes = oe_segment.transform(X_test[['account_segment']])
X_train[['account_segment']] = segment_train_codes
X_test[['account_segment']] = segment_test_codes



In [26]:
# Fixed 0/1 codes for the two binary categorical columns.
device_map = {'Computer': 1, 'Mobile': 0}
device_map_gender = {'Male':1, 'Female': 0}

# Apply both lookups to each split; values missing from a map become NaN.
for frame in (X_train, X_test):
    frame['Login_device'] = frame['Login_device'].map(device_map)
    frame['Gender'] = frame['Gender'].map(device_map_gender)


In [33]:
# Disabled: the rev_per_month drop is currently commented out.
# X_train.drop(columns=['rev_per_month'], inplace=True)
# X_test.drop(columns=['rev_per_month'], inplace=True)

# Remove rev_growth_yoy from both splits in place.
for split in (X_train, X_test):
    split.drop(labels=['rev_growth_yoy'], axis=1, inplace=True)

In [34]:
# Count missing values per training column.
X_train.isna().sum()

Tenure                     0
City_Tier                  0
CC_Contacted_LY            0
Payment                    0
Gender                     0
Service_Score              0
Account_user_count         0
account_segment            0
CC_Agent_Score             0
Marital_Status             0
Complain_ly                0
coupon_used_for_payment    0
Day_Since_CC_connect       0
cashback                   0
Login_device               0
dtype: int64

In [35]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Seeded mutual-information estimate between every training feature and the target.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

# Tabulate the scores next to the feature names, highest score first.
mi_df = pd.DataFrame({'Features': X_train.columns, 'MI_Score': mi_scores})
mi_df = mi_df.sort_values(by='MI_Score', ascending=False)



In [36]:
# Columns excluded from the reduced ("_mi") feature set.
# An earlier, now disabled, variant of this cell also excluded
# 'rev_per_month' and 'rev_growth_yoy'.
mi_excluded_cols = [
    'Marital_Status',
    'CC_Agent_Score',
    'Payment',
    'Login_device',
    'Gender',
    'coupon_used_for_payment',
    'CC_Contacted_LY',
]

# Build the reduced copies; X_train / X_test themselves are left untouched.
X_train_mi = X_train.drop(columns=mi_excluded_cols)
X_test_mi = X_test.drop(columns=mi_excluded_cols)



In [37]:
# Show the mutual-information table (sorted by MI_Score, descending).
mi_df

Unnamed: 0,Features,MI_Score
0,Tenure,0.123853
7,account_segment,0.02747
10,Complain_ly,0.024964
12,Day_Since_CC_connect,0.022768
13,cashback,0.021041
9,Marital_Status,0.017409
6,Account_user_count,0.011558
8,CC_Agent_Score,0.009044
3,Payment,0.00664
4,Gender,0.005444


In [71]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 200-tree forest on the training split (fit returns the estimator itself).
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Label each importance with its column name and display them, largest first.
importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False)


Tenure                     0.254264
cashback                   0.078101
CC_Contacted_LY            0.071229
Day_Since_CC_connect       0.070195
Complain_ly                0.061698
rev_growth_yoy             0.061407
CC_Agent_Score             0.058204
rev_per_month              0.056878
Payment                    0.050076
account_segment            0.041230
Account_user_count         0.037556
Marital_Status             0.036366
coupon_used_for_payment    0.032241
City_Tier                  0.028496
Gender                     0.021207
Service_Score              0.020691
Login_device               0.020161
dtype: float64

In [None]:
from sklearn.metrics import accuracy_score

# Hyper-parameter grid explored for the decision tree.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10, 20],
    'class_weight': [None, 'balanced'],
}

model = DecisionTreeClassifier(random_state=42)

# Exhaustive 10-fold search scored on accuracy, run in a single process.
cv = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10, n_jobs=1)
cv.fit(X_train, y_train)
print("Best Parameters:", cv.best_params_)

# Evaluate the refitted best tree on the held-out split.
best_model_tree = cv.best_estimator_
y_pred = best_model_tree.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)





In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Hyper-parameter grid explored for the decision tree on the reduced feature set.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 6, 9, 12, 15, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10, 20],
    'class_weight': [None, 'balanced'],
}

model = DecisionTreeClassifier(random_state=42)

# Exhaustive 10-fold search scored on accuracy, run in a single process.
cv_mi = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10, n_jobs=1)
cv_mi.fit(X_train_mi, y_train)
print("Best Parameters:", cv_mi.best_params_)

# Predict on both splits with the refitted best tree.
best_model_tree = cv_mi.best_estimator_
y_pred = best_model_tree.predict(X_test_mi)
y_pred_train = best_model_tree.predict(X_train_mi)

# Report test and train accuracy, then the test confusion matrix.
acc = accuracy_score(y_test, y_pred)
acc_tr = accuracy_score(y_train, y_pred_train)
print("Test Accuracy:", acc)
print()
print('Train Accuracy: ', acc_tr)
print()
print(confusion_matrix(y_test, y_pred))


Best Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2}
Test Accuracy: 0.9436056838365897

Train Accuracy:  0.9946714031971581

[[1796   60]
 [  67  329]]


In [39]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the reduced training split.
# random_state is pinned so the synthetic rows (and every score computed from
# them downstream) are reproducible across kernel restarts.
# NOTE(review): this rebinds X_train_mi / y_train to the resampled data, so
# y_train no longer lines up row-for-row with X_train after this cell runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_mi, y_train = smote.fit_resample(X_train_mi, y_train)

# Show the class counts after resampling.
print(y_train.value_counts())


Churn
0    7508
1    7508
Name: count, dtype: int64


In [41]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored for the forest on the reduced feature set.
param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive 10-fold search scored on accuracy, using all available cores.
cv_rf_mi = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
cv_rf_mi.fit(X_train_mi, y_train)
print("Best Parameters:", cv_rf_mi.best_params_)

# Predict on both splits with the refitted best forest.
best_model_rf = cv_rf_mi.best_estimator_
y_pred_train = best_model_rf.predict(X_train_mi)
y_pred_test = best_model_rf.predict(X_test_mi)

# Report train and test accuracy.
acc_mi_train = accuracy_score(y_train, y_pred_train)
acc_mi_test = accuracy_score(y_test, y_pred_test)
print('Train Accuracy: ', acc_mi_train)
print()
print("Test Accuracy:", acc_mi_test)


Best Parameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Train Accuracy:  0.9891449120937666

Test Accuracy: 0.9400532859680284


In [42]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-split predictions.
test_confusion = confusion_matrix(y_test, y_pred_test)
print(test_confusion)

[[1802   54]
 [  81  315]]


In [43]:
# Recall of the test-split predictions.
test_recall = recall_score(y_test, y_pred_test)
print(test_recall)

0.7954545454545454


In [None]:
# Show the current contents of the reduced training feature set.
X_train_mi

Unnamed: 0,Tenure,City_Tier,Service_Score,Account_user_count,account_segment,Complain_ly,Day_Since_CC_connect,cashback
0,10.000000,1.000000,3.000000,4.000000,0.000000,0.0,3.000000,205.000000
1,16.000000,1.000000,3.000000,5.000000,4.000000,0.0,8.000000,171.000000
2,4.000000,1.000000,3.000000,4.000000,4.000000,0.0,8.000000,166.000000
3,8.000000,1.000000,2.000000,3.000000,4.000000,1.0,1.000000,148.000000
4,9.000000,1.000000,3.000000,3.000000,0.000000,0.0,7.000000,172.000000
...,...,...,...,...,...,...,...,...
15011,0.000000,1.602517,2.000000,4.000000,3.000000,0.0,0.000000,126.000000
15012,0.000000,1.000000,2.000000,3.000000,3.000000,1.0,4.000000,127.000000
15013,1.000000,1.000000,3.898871,4.000000,4.000000,0.0,1.898871,173.898871
15014,1.000000,1.000000,4.000000,5.000000,3.824960,1.0,1.000000,149.000000


In [143]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over a 5-fold cross-validation.
# NOTE(review): this cross-validates `rf` (not the tuned best_model_rf) and does
# so on the test split rather than the training split — confirm that is intended.
fold_scores = cross_val_score(rf, X_test_mi, y_test, cv=5)
cv_score = fold_scores.mean()
print("CV Accuracy:", cv_score)


CV Accuracy: 0.901429908844543


In [144]:
# Load the training artifact from its absolute path.
df = pd.read_csv(filepath_or_buffer='C:\\ML Project\\artifacts\\train.csv')

In [8]:
from sklearn.feature_selection import VarianceThreshold

num_cols = ['Tenure', 'rev_per_month', 'rev_growth_yoy', 'cashback']

# Fit the selector on the training split only, then drop whichever numeric
# columns it rejects from both splits. Assigning the transformed array back
# into X_train[num_cols] would raise a length-mismatch error as soon as the
# selector removed a column, because the array would then have fewer columns
# than num_cols.
variance_selector = VarianceThreshold(threshold=0.3)
variance_selector.fit(X_train[num_cols])
low_variance_cols = [
    col for col, keep in zip(num_cols, variance_selector.get_support()) if not keep
]
X_train = X_train.drop(columns=low_variance_cols)
X_test = X_test.drop(columns=low_variance_cols)

In [9]:
# Boolean mask over num_cols: True where the selector kept the column.
mask = variance_selector.get_support(indices=False)
print(mask)


[ True  True  True  True]


In [10]:
# Remove the four listed columns from both splits in place.
for split in (X_train, X_test):
    split.drop(
        labels=['Login_device', 'Gender', 'Service_Score', 'City_Tier'],
        axis=1,
        inplace=True,
    )

In [11]:
# Show the (rows, columns) dimensions of the training predictors.
X_train.shape

(9233, 13)

In [12]:
# Show the current contents of the training predictors.
X_train

Unnamed: 0,Tenure,CC_Contacted_LY,Payment,Account_user_count,account_segment,CC_Agent_Score,Marital_Status,rev_per_month,Complain_ly,rev_growth_yoy,coupon_used_for_payment,Day_Since_CC_connect,cashback
10741,15.0,21.0,1,4.0,4.0,4.0,2,5.0,0.0,15.0,4,10.0,187.0
7768,1.0,23.0,2,3.0,3.0,3.0,1,5.0,0.0,18.0,0,2.0,122.0
5589,13.0,31.0,2,4.0,4.0,1.0,2,9.0,0.0,14.0,1,8.0,166.0
6280,16.0,33.0,1,4.0,0.0,2.0,1,6.0,0.0,13.0,1,0.0,239.0
7602,25.0,36.0,1,4.0,6.0,1.0,1,5.0,0.0,23.0,0,0.0,243.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,28.0,1,4.0,3.0,2.0,2,6.0,1.0,14.0,0,1.0,125.0
5191,10.0,16.0,3,4.0,0.0,1.0,1,6.0,0.0,12.0,5,4.0,226.0
5390,1.0,36.0,1,4.0,3.0,5.0,1,3.0,0.0,14.0,2,3.0,164.0
860,0.0,28.0,1,3.0,4.0,3.0,0,2.0,1.0,15.0,3,7.0,162.0


In [102]:
from sklearn.metrics import accuracy_score

# Hyper-parameter grid explored for the decision tree.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10, 20],
    'class_weight': [None, 'balanced'],
}

model = DecisionTreeClassifier(random_state=42)

# Exhaustive 10-fold search scored on accuracy, run in a single process.
cv = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=10, n_jobs=1)
cv.fit(X_train, y_train)
print("Best Parameters:", cv.best_params_)

# Evaluate the refitted best tree on the held-out split.
best_model_tree = cv.best_estimator_
y_pred = best_model_tree.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)





Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 30, 'max_features': None, 'min_samples_split': 2}
Test Accuracy: 0.9363591514553528


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test accuracy for k = 1..30 neighbours, collected in order of k.
accuracy = []

for i in range(1, 31):
    # fit returns the estimator itself, so construction and fitting can be chained.
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy += [acc]

In [16]:
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored for the forest.
param_grid = {
    'n_estimators': [100, 200, 400, 500, 650, 700],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive 10-fold search scored on accuracy, using all available cores.
cv_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
cv_rf.fit(X_train, y_train)
print("Best Parameters:", cv_rf.best_params_)

# Predict on both splits with the refitted best forest.
best_model_rf = cv_rf.best_estimator_
y_pred_train = best_model_rf.predict(X_train)
y_pred_test = best_model_rf.predict(X_test)

# Report train and test accuracy.
acc_rf_train = accuracy_score(y_train, y_pred_train)
acc_rf_test = accuracy_score(y_test, y_pred_test)
print('Train Accuracy: ', acc_rf_train)
print()
print("Test Accuracy:", acc_rf_test)


Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 700}
Train Accuracy:  1.0

Test Accuracy: 0.9649728663048841


In [161]:
# Reload the training artifact and count missing values per column.
df = pd.read_csv(filepath_or_buffer='C:\\ML Project\\artifacts\\train.csv')
df.isna().sum()

AccountID                    0
Churn                        0
Tenure                     175
City_Tier                   74
CC_Contacted_LY             84
Payment                      0
Gender                      88
Service_Score               74
Account_user_count          87
account_segment             82
CC_Agent_Score              95
Marital_Status             168
rev_per_month              627
Complain_ly                273
rev_growth_yoy               2
coupon_used_for_payment      0
Day_Since_CC_connect       291
cashback                   375
Login_device                 0
dtype: int64

In [168]:
# Remove the listed columns from df, mutating the frame in place.
df.drop(
    labels=[
        'Marital_Status',
        'CC_Agent_Score',
        'Payment',
        'Login_device',
        'Gender',
        'coupon_used_for_payment',
        'CC_Contacted_LY',
        'rev_per_month',
        'rev_growth_yoy',
    ],
    axis=1,
    inplace=True,
)

In [169]:
# Count missing values per remaining column of df.
df.isna().sum()

AccountID                 0
Churn                     0
Tenure                  175
City_Tier                74
Service_Score            74
Account_user_count       87
account_segment          82
Complain_ly             273
Day_Since_CC_connect    291
cashback                375
dtype: int64

In [175]:
# Show the current contents of df.
df

Unnamed: 0,AccountID,Churn,Tenure,City_Tier,Service_Score,Account_user_count,account_segment,Complain_ly,Day_Since_CC_connect,cashback
0,25333,0,10.0,1.0,3.0,4,HNI,,3.0,205.0
1,31194,0,16.0,1.0,3.0,5,Super,,8.0,171.0
2,29834,0,4.0,1.0,3.0,4,Super,0.0,8.0,166.0
3,26466,0,8.0,1.0,2.0,3,Super,1.0,1.0,148.0
4,27917,0,9.0,1.0,3.0,3,HNI,0.0,7.0,172.0
...,...,...,...,...,...,...,...,...,...,...
9003,25734,1,0.0,1.0,2.0,4,Regular Plus,1.0,1.0,125.0
9004,25191,1,10.0,3.0,3.0,4,HNI,0.0,4.0,226.0
9005,25390,0,1.0,1.0,4.0,4,Regular Plus,0.0,3.0,164.0
9006,20860,1,0.0,1.0,3.0,3,Super,1.0,7.0,162.0


In [178]:
# Per column: does any cell equal the literal '@'?
X_train_mi.eq('@').any()

Tenure                  False
City_Tier               False
Service_Score           False
Account_user_count      False
account_segment         False
Complain_ly             False
Day_Since_CC_connect    False
cashback                False
dtype: bool