In [None]:
import glob
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

plt.rc("font", size=14)
# Display every column when a wide DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Render floats with three decimal places in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Make the project folder the working directory (relative paths, e.g. the
# pickled model, resolve against it).
# NOTE(review): absolute, environment-specific path - confirm it exists where
# this notebook is run.
os.chdir('/Sandbox/Churn_30_days_prepaid')

In [None]:
# Load the raw 30-day churn extract into the working DataFrame.
raw_csv_path = '/Sandbox/Churn_30_days_prepaid/datasets/Churn_30_days_20230424.csv'
df01 = pd.read_csv(raw_csv_path)

In [None]:
# Handling categorical variables: collapse each category to a 0/1 flag.
# Series.map leaves NaN for any value that is not a key of the mapping.
binary_encodings = {
    'handset_category': {'3g':1, 'no_smartphone':0, '4g':1,'0':0, '5g':1, 'unknown':0},
    'site_type': {'2G+4G':1, '2G':0, '4G':1,'unknown':0},
}
for column_name, codes in binary_encodings.items():
    df01[column_name] = df01[column_name].map(codes)

In [None]:
# Dropping attributes that are not used by the model.
unused_columns = ['inc', 'LAST_CALL_DATE', 'CUSTYPE', 'activation_date']
df01 = df01.drop(columns=unused_columns)

In [None]:
# Adding column 'is_multisim': 0 for a single SIM, 1 for two to four SIMs.
# NOTE(review): any sim_number outside {1, 2, 3, 4} maps to NaN here (and is
# later filled with 0 by the null-handling cell) - confirm that is intended.
sim_count_to_flag = {1.000: 0, 2.000: 1, 3.000: 1, 4.000: 1}
df01['is_multisim'] = df01['sim_number'].map(sim_count_to_flag)

In [None]:
# Handling null values: one replacement value per column (constants for the
# flags/recency fields, the column mean for Days_of_silence and AON).
null_replacements = {
    'is_multisim': 0,
    'days_since_last_recharge': 91,
    'Days_of_silence': df01['Days_of_silence'].mean(),
    'days_of_rec': 0,
    'AON': df01['AON'].mean(),
}
for column_name, replacement in null_replacements.items():
    df01[column_name] = df01[column_name].fillna(replacement)

In [None]:
# Rank every attribute by the absolute value of its correlation with churn_flag.
# NOTE(review): DataFrame.corr() may raise on recent pandas if a non-numeric
# column is still present - confirm all remaining columns are numeric.
rr = df01.corr()
churn_corr_strength = rr['churn_flag'].abs()
sorted_corrs = churn_corr_strength.sort_values(ascending=False)
print(sorted_corrs)

In [None]:
# Dropping attributes with low correlation to churn_flag.
low_correlation_columns = [
    'sim_number',
    'sms_revenue',
    'site_type',
    'REV_SMS_A2A_RENT',
    'REV_SMS_A2A_APP',
    'REV_SMS_A2A',
    'device_change_count',
    'REV_SMS_A2O_IDD',
    'REV_SMS_A2O',
    'REV_SMS_A2O_RENT',
    'IDD_IC_MoU',
    'IDD_OG_MoU',
]
df01 = df01.drop(columns=low_correlation_columns)

In [None]:
# Dependent variable (target) and independent variables (features).
# The subscriber identifier and the target itself are excluded from X.
y = df01['churn_flag']
X = df01.drop(columns=['msisdn', 'churn_flag'])

In [None]:
# Checking variance inflation factor
def calc_VIF(X):
    """Return a two-column frame ('variables', 'VIF') with one row per column of X.

    Each VIF is computed by statsmodels' ``variance_inflation_factor`` on the
    raw values of ``X``.
    NOTE(review): no intercept column is added before the computation -
    confirm whether X should carry a constant for these VIFs.
    """
    design_matrix = X.values
    vif_scores = [
        variance_inflation_factor(design_matrix, column_pos)
        for column_pos in range(X.shape[1])
    ]
    return pd.DataFrame({'variables': X.columns, 'VIF': vif_scores})
calc_VIF(X)

In [None]:
# Dropping attributes with high VIF.
high_vif_columns = ['rc_value', 'tot_rev', 'account_state', 'days_of_rec']
X = X.drop(columns=high_vif_columns)

In [None]:
# Checking the variance inflation factor again, now that the high-VIF
# attributes have been dropped. calc_VIF is defined once, in the earlier VIF
# cell; it is reused here instead of being redefined.
calc_VIF(X)

In [None]:
# Class distribution of the target (churn_flag) before any resampling.
y.value_counts()

In [None]:
# Splitting training and hold-out data (X_test1, y_test1 are used for validation).
# 20% of the rows are held out, stratified on the target. random_state is
# fixed so the split - and every metric computed from it - is reproducible
# across kernel restarts.
X_1, X_test1, y_1, y_test1 = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Upsampling the training part with SMOTE (the hold-out part is left untouched).
# random_state is fixed so the synthetic samples are the same on every run.
oversample = SMOTE(random_state=42)
X_up, y_up = oversample.fit_resample(X_1, y_1)

In [None]:
# Checking whether the classes are balanced after SMOTE upsampling.
y_up.value_counts()

In [None]:
# Splitting the upsampled dataset into model-training and test parts (80/20, stratified).
# NOTE(review): this split happens after SMOTE, so X_test_up contains
# synthetic rows; scores on it are likely optimistic compared with the
# untouched hold-out (X_test1, y_test1).
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(
    X_up,
    y_up,
    test_size=0.2,
    stratify=y_up,
    random_state=42,
)

In [None]:
# XGBoost: fit on the upsampled training part, report on the upsampled test part.
# NOTE(review): both `seed` and `random_state` are supplied; which one takes
# effect may depend on the installed xgboost version - confirm and keep one.
xgb_params = dict(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
    random_state=42,
)
clf = XGBClassifier(**xgb_params)

# fit() result is kept under its own name so later cells can keep using the
# XGBoost model after `clf` is rebound.
xgb = clf.fit(X_train_up, y_train_up)

prediction_xgb = xgb.predict(X_test_up)
xgb_upsampled_report = metrics.classification_report(y_test_up, prediction_xgb)
print(xgb_upsampled_report)

In [None]:
# Evaluate the XGBoost model on the untouched hold-out set.
xgb_prediction03 = xgb.predict(X_test1)
xgb_holdout_report = metrics.classification_report(y_test1, xgb_prediction03)
print(xgb_holdout_report)

In [None]:
#dt=DecisionTreeClassifier(min_samples_split = 10,min_samples_leaf=15,max_features='sqrt',
#                                max_depth=45,random_state=45)    

# rf=RandomForestClassifier(n_estimators=100,max_depth=26,max_features='sqrt',min_samples_split =20
#                                      ,min_samples_leaf=20,bootstrap=True,random_state=145)
#lr=LogisticRegression(random_state=seed)
#sgd = SGDClassifier()

In [None]:
# Training the Random Forest model on the upsampled training part.
# RandomForestClassifier is imported with the other dependencies at the top
# of the notebook rather than in the middle of the analysis.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=45,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=True,
    random_state=45,
)
rf2 = clf.fit(X_train_up, y_train_up)

# Classification report on the upsampled test part.
prediction = rf2.predict(X_test_up)
print(metrics.classification_report(y_test_up, prediction))



In [None]:
# Evaluate the Random Forest model on the untouched hold-out set.
rf_prediction03 = rf2.predict(X_test1)
rf_holdout_report = metrics.classification_report(y_test1, rf_prediction03)
print(rf_holdout_report)

In [None]:
# Re-score the hold-out set with the Random Forest using a custom decision
# threshold instead of the 0.5 implied by predict().
# The threshold applied is 0.53 (an earlier comment claimed 0.65, which did
# not match the code); change CHURN_THRESHOLD to experiment.
CHURN_THRESHOLD = 0.53

# Column 1 of predict_proba is the probability of the second class in
# rf2.classes_ (churn_flag == 1, assuming 0/1 labels).
prediction02 = rf2.predict_proba(X_test1)[:, 1]
prediction = [0 if val < CHURN_THRESHOLD else 1 for val in prediction02]
print(metrics.classification_report(y_test1, prediction))

In [None]:
# ROC curve for the XGBoost model on the untouched hold-out set.
# roc_curve needs continuous scores so it can sweep the decision threshold;
# feeding it the hard 0/1 labels from predict() collapses the curve to a
# single operating point. Use the predicted probability of class 1 instead.
xgb_scores03 = xgb.predict_proba(X_test1)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test1, xgb_scores03)


def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, as returned by
        ``metrics.roc_curve``.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if label is not None:
        plt.legend(loc='lower right')


plot_roc_curve(fpr, tpr, label='XGBoost (AUC = %.3f)' % metrics.auc(fpr, tpr))
plt.show()

In [None]:
# Score the full feature matrix X with the fitted XGBoost model.
# dp1 holds the predicted class labels, dp the probability taken from
# column 1 of predict_proba (a single column named 0 in each frame).
AB = xgb.predict(X)
dp1 = pd.DataFrame(AB)

h = xgb.predict_proba(X)[:, 1]
dp = pd.DataFrame(h)

In [None]:
# Generating the final table: bucket each churn probability two ways.
# Both binnings start at -0.1 so that a probability of exactly 0 falls into
# the first (right-closed) interval.
churn_scores = dp[0]

# Five risk groups, each 0.2 wide.
group_edges = [-0.1, 0.2, 0.4, 0.6, 0.8, 1]
group_names = ["V Low", "Low", "Medium", "High", "V High"]

# Twenty equal-width (0.05) buckets labelled '1'..'20'; despite the column
# name these are fixed-width bins, not data quantiles.
bucket_edges = [-0.1] + [step / 20 for step in range(1, 21)]
bucket_names = [str(step) for step in range(1, 21)]

churn_df = pd.DataFrame({
    'Churn_Group': pd.cut(churn_scores, bins=group_edges, labels=group_names),
    'Quantile_cut': pd.cut(churn_scores, bins=bucket_edges, labels=bucket_names),
})

In [None]:
# Bar chart of how many rows fall into each of the 20 probability buckets.
s = churn_df['Quantile_cut'].value_counts().sort_index()
plt.figure(figsize=(15,8))
ax = s.plot.bar(width=.9)

plt.xticks(rotation=0)
# Annotate each bar with its count. The counts are read straight from the
# Series values: the column names produced by value_counts().reset_index()
# differ between pandas versions, so looking the count up by the name
# 'Quantile_cut' can return the bucket label instead and fail on `+ 0.2`.
for bar_pos, count in enumerate(s.values):
    ax.text(bar_pos, count + 0.2, str(count), color='black')

In [None]:
# Feature importance of the fitted XGBoost model, plotted for the 20 features
# with the highest importance.
df_f = pd.DataFrame(xgb.feature_importances_, columns=["Importance"]).assign(Labels=X.columns)
df_f = df_f.sort_values("Importance", ascending=True)

top_features = (
    df_f.set_index('Labels')
    .sort_values(by='Importance', ascending=True)
    .tail(20)
)
top_features.plot(kind='barh', figsize=(18,9), width=0.95)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(linewidth=.2, alpha=0.3, color='lightgrey')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows fall into each of the five churn-risk groups.
s = churn_df['Churn_Group'].value_counts().sort_index()

plt.figure(figsize=(10,8))
ax = s.plot.bar(width=.9)
#ax = sns.countplot(x="class", data=data)
plt.xticks(rotation=0)
# Annotate each bar with its count. The counts are read straight from the
# Series values: the column names produced by value_counts().reset_index()
# differ between pandas versions, so looking the count up by the name
# 'Churn_Group' can return the group label instead and fail on `+ 0.2`.
for bar_pos, count in enumerate(s.values):
    ax.text(bar_pos, count + 0.2, str(count), color='black')

In [None]:
# Save the fitted XGBoost model to a file in the current working directory.
# `pickle` is imported with the other dependencies at the top of the notebook.
# NOTE(review): only unpickle this file from a trusted location; loading a
# pickle executes arbitrary code.
Pkl_Filename = "Model_churn_30_days.pkl"

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(xgb, file)

In [None]:
# Summary (columns, non-null counts, dtypes) of the final feature matrix.
X.info()