In [1]:
# Mount Google Drive inside the Colab VM; the CSV files read later in this
# notebook live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Data Handling and Exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # Import for 3D plotting
import seaborn as sns
from pylab import rcParams
sns.set_style('darkgrid')
rcParams['figure.figsize'] = 8,8
import plotly.express as px
import os


# Data Preprocessing
from sklearn import preprocessing
#from feature_engine import imputation
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
from scipy.stats import skew


# Exploratory Data Analysis
#from pandas_profiling import ProfileReport


# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier



# Model Evaluation and Metrics
from sklearn.metrics import f1_score, accuracy_score, classification_report, roc_auc_score, roc_curve
#from yellowbrick.classifier import ClassificationReport, ROCAUC
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE


# Deployment and Monitoring
#import docker

# Saving Model
from joblib import dump
import pickle


#Other libraries
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
# Load the training and test CSVs from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/cap/Train.csv',
        '/content/drive/MyDrive/Colab Notebooks/cap/Test.csv',
    )
)

In [4]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,7ee9e11e342e27c70455960acc80d3f91c1286d1,DAKAR,K > 24 month,20000.0,47.0,21602.0,7201.0,52.0,8835.0,3391.0,396.0,185.0,,,NO,62,On net 200F=Unlimited _call24H,30.0,0
1,50443f42bdc92b10388fc56e520e4421a5fa655c,,K > 24 month,,,,,,,,,,,,NO,3,,,0
2,da90b5c1a9b204c186079f89969aa01cb03c91b2,,K > 24 month,,,,,,,,,,,,NO,1,,,0
3,364ec1b424cdc64c25441a444a16930289a0051e,SAINT-LOUIS,K > 24 month,7900.0,19.0,7896.0,2632.0,25.0,9385.0,27.0,46.0,20.0,,2.0,NO,61,"Data:490F=1GB,7d",7.0,0
4,d5a5247005bc6d41d3d99f4ef312ebb5f640f2cb,DAKAR,K > 24 month,12350.0,21.0,12351.0,4117.0,29.0,9360.0,66.0,102.0,34.0,,,NO,56,All-net 500F=2000F;5d,11.0,0


In [5]:
# Column dtypes and non-null counts: shows which columns contain missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1077024 entries, 0 to 1077023
Data columns (total 19 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   user_id         1077024 non-null  object 
 1   REGION          652687 non-null   object 
 2   TENURE          1077024 non-null  object 
 3   MONTANT         699139 non-null   float64
 4   FREQUENCE_RECH  699139 non-null   float64
 5   REVENUE         714669 non-null   float64
 6   ARPU_SEGMENT    714669 non-null   float64
 7   FREQUENCE       714669 non-null   float64
 8   DATA_VOLUME     547261 non-null   float64
 9   ON_NET          683850 non-null   float64
 10  ORANGE          629880 non-null   float64
 11  TIGO            432250 non-null   float64
 12  ZONE1           84898 non-null    float64
 13  ZONE2           68794 non-null    float64
 14  MRG             1077024 non-null  object 
 15  REGULARITY      1077024 non-null  int64  
 16  TOP_PACK        626129 non-null   ob

In [6]:
# Count missing values per column.
train_df.isna().sum()

user_id                 0
REGION             424337
TENURE                  0
MONTANT            377885
FREQUENCE_RECH     377885
REVENUE            362355
ARPU_SEGMENT       362355
FREQUENCE          362355
DATA_VOLUME        529763
ON_NET             393174
ORANGE             447144
TIGO               644774
ZONE1              992126
ZONE2             1008230
MRG                     0
REGULARITY              0
TOP_PACK           450895
FREQ_TOP_PACK      450895
CHURN                   0
dtype: int64

In [7]:
# Print the number of distinct (non-null) values in every column.
for col_name, n_unique in train_df.nunique().items():
    print(col_name, ':', n_unique)

user_id : 1077024
REGION : 14
TENURE : 8
MONTANT : 4357
FREQUENCE_RECH : 119
REVENUE : 31810
ARPU_SEGMENT : 14062
FREQUENCE : 91
DATA_VOLUME : 32459
ON_NET : 8202
ORANGE : 2674
TIGO : 1105
ZONE1 : 482
ZONE2 : 394
MRG : 1
REGULARITY : 62
TOP_PACK : 126
FREQ_TOP_PACK : 206
CHURN : 2


In [8]:
# (rows, columns) of the training frame as loaded, before any column is dropped.
train_df.shape

(1077024, 19)

In [9]:
# user_id has one distinct value per row, i.e. it is a row identifier rather
# than a feature, so it is removed before imputation and analysis.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
train_df = train_df.drop(columns='user_id', errors='ignore')
train_df

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,DAKAR,K > 24 month,20000.0,47.0,21602.0,7201.0,52.0,8835.0,3391.0,396.0,185.0,,,NO,62,On net 200F=Unlimited _call24H,30.0,0
1,,K > 24 month,,,,,,,,,,,,NO,3,,,0
2,,K > 24 month,,,,,,,,,,,,NO,1,,,0
3,SAINT-LOUIS,K > 24 month,7900.0,19.0,7896.0,2632.0,25.0,9385.0,27.0,46.0,20.0,,2.0,NO,61,"Data:490F=1GB,7d",7.0,0
4,DAKAR,K > 24 month,12350.0,21.0,12351.0,4117.0,29.0,9360.0,66.0,102.0,34.0,,,NO,56,All-net 500F=2000F;5d,11.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077019,,K > 24 month,,,,,,,,,,,,NO,16,,,0
1077020,TAMBACOUNDA,K > 24 month,2500.0,5.0,2500.0,833.0,5.0,0.0,15.0,77.0,,,,NO,34,All-net 500F=2000F;5d,2.0,0
1077021,,K > 24 month,,,,,,,,,,,,NO,3,,,1
1077022,,K > 24 month,600.0,1.0,600.0,200.0,1.0,591.0,11.0,37.0,5.0,1.0,,NO,16,All-net 600F= 3000F ;5d,1.0,0


In [10]:
# Impute missing values in train_df:
#   - numeric columns are filled with their own column median;
#   - categorical columns are filled with their most frequent value (mode).
# NOTE(review): ZONE1 and ZONE2 are missing in over 90% of rows, so median
# filling makes them almost constant -- confirm these columns are worth keeping.
median_fill_columns = [
    'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
    'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
    'FREQ_TOP_PACK',
]
mode_fill_columns = ['TOP_PACK', 'REGION', 'TENURE']

for col in median_fill_columns:
    train_df[col] = train_df[col].fillna(train_df[col].median())

# fillna targets real missing values (NaN) for every categorical column,
# including TENURE; replacing the literal string "nan" would never match a
# missing value.
for col in mode_fill_columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

# Confirm that no column has missing values left.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1077024 entries, 0 to 1077023
Data columns (total 18 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   REGION          1077024 non-null  object 
 1   TENURE          1077024 non-null  object 
 2   MONTANT         1077024 non-null  float64
 3   FREQUENCE_RECH  1077024 non-null  float64
 4   REVENUE         1077024 non-null  float64
 5   ARPU_SEGMENT    1077024 non-null  float64
 6   FREQUENCE       1077024 non-null  float64
 7   DATA_VOLUME     1077024 non-null  float64
 8   ON_NET          1077024 non-null  float64
 9   ORANGE          1077024 non-null  float64
 10  TIGO            1077024 non-null  float64
 11  ZONE1           1077024 non-null  float64
 12  ZONE2           1077024 non-null  float64
 13  MRG             1077024 non-null  object 
 14  REGULARITY      1077024 non-null  int64  
 15  TOP_PACK        1077024 non-null  object 
 16  FREQ_TOP_PACK   1077024 non-null  fl

In [None]:
# Hypothesis 1: Customers with higher average revenue per user (ARPU) are less likely to churn.
# Welch's t-test (equal_var=False) comparing ARPU_SEGMENT between churned
# (CHURN == 1) and retained (CHURN == 0) customers in train_df.
# NOTE(review): the test is two-sided, so a small p-value only shows that the
# group means differ; compare the group means (or the sign of t_stat) to
# confirm the direction claimed in the message below.
arpu_churned = train_df[train_df['CHURN'] == 1]['ARPU_SEGMENT']
arpu_retained = train_df[train_df['CHURN'] == 0]['ARPU_SEGMENT']
t_stat, p_value_arpu = stats.ttest_ind(arpu_churned, arpu_retained, equal_var=False)

# Set significance level
alpha = 0.05

# Print results for each hypothesis
print("Hypothesis 1 Results:")
if p_value_arpu < alpha:
    print("Reject Null Hypothesis: Customers with higher ARPU are less likely to churn.")
else:
    print("Fail to Reject Null Hypothesis: No significant difference in ARPU between churned and retained customers.")

In [None]:
# Hypothesis 2: Customers with higher REGULARITY are less likely to churn.
# Welch's t-test (equal_var=False) comparing REGULARITY between churned
# (CHURN == 1) and retained (CHURN == 0) customers in train_df.
# NOTE(review): time with the company is the separate TENURE column; this
# cell tests REGULARITY only -- confirm that is the intended variable.
# NOTE(review): the test is two-sided, so a small p-value only shows that the
# group means differ; check the sign of t_stat to confirm the direction.
regularity_churned = train_df[train_df['CHURN'] == 1]['REGULARITY']
regularity_retained = train_df[train_df['CHURN'] == 0]['REGULARITY']
t_stat, p_value_regularity = stats.ttest_ind(regularity_churned, regularity_retained, equal_var=False)

# Set significance level
alpha = 0.05

print("Hypothesis 2 Results:")
if p_value_regularity < alpha:
    print("Reject Null Hypothesis: Customers with higher regularity are less likely to churn.")
else:
    print("Fail to Reject Null Hypothesis: No significant difference in regularity between churned and retained customers.")
print()


In [None]:
# Hypothesis 3: Customers who frequently recharge (higher FREQUENCE_RECH) are less likely to churn.
# Welch's t-test (equal_var=False) comparing FREQUENCE_RECH between churned
# (CHURN == 1) and retained (CHURN == 0) customers in train_df.
# NOTE(review): the test is two-sided, so a small p-value only shows that the
# group means differ; check the sign of t_stat to confirm the direction.
freq_rech_churned = train_df[train_df['CHURN'] == 1]['FREQUENCE_RECH']
freq_rech_retained = train_df[train_df['CHURN'] == 0]['FREQUENCE_RECH']
t_stat, p_value_freq_rech = stats.ttest_ind(freq_rech_churned, freq_rech_retained, equal_var=False)

# Set significance level
alpha = 0.05

print("Hypothesis 3 Results:")
if p_value_freq_rech < alpha:
    print("Reject Null Hypothesis: Customers with higher FREQUENCE_RECH are less likely to churn.")
else:
    print("Fail to Reject Null Hypothesis: No significant difference in FREQUENCE_RECH between churned and retained customers.")
print()


In [None]:
# Hypothesis 4: Customers who have a top service package (TOP_PACK) are less likely to churn.
# Chi-square test of independence on the TOP_PACK x CHURN contingency table
# built from train_df.
# NOTE(review): a chi-square test only detects an association between package
# and churn; it does not show which packages churn less.
contingency_table = pd.crosstab(train_df['TOP_PACK'], train_df['CHURN'])
chi2_stat, p_value_top_pack, _, _ = stats.chi2_contingency(contingency_table)

# Set significance level
alpha = 0.05

print("Hypothesis 4 Results:")
if p_value_top_pack < alpha:
    print("Reject Null Hypothesis: Customers with top service packages are less likely to churn.")
else:
    print("Fail to Reject Null Hypothesis: No significant difference in churn based on service packages.")
print()

In [None]:
# Hypothesis 5: Customers with higher data usage (DATA_VOLUME) are less likely to churn.
# Welch's t-test (equal_var=False) comparing DATA_VOLUME between churned
# (CHURN == 1) and retained (CHURN == 0) customers in train_df.
# NOTE(review): the test is two-sided, so a small p-value only shows that the
# group means differ; check the sign of t_stat to confirm the direction.
data_volume_churned = train_df[train_df['CHURN'] == 1]['DATA_VOLUME']
data_volume_retained = train_df[train_df['CHURN'] == 0]['DATA_VOLUME']
t_stat, p_value_data_volume = stats.ttest_ind(data_volume_churned, data_volume_retained, equal_var=False)

# Set significance level
alpha = 0.05

print("Hypothesis 5 Results:")
if p_value_data_volume < alpha:
    print("Reject Null Hypothesis: Customers with higher DATA_VOLUME are less likely to churn.")
else:
    print("Fail to Reject Null Hypothesis: No significant difference in DATA_VOLUME between churned and retained customers.")
print()

