In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / future warnings
# raised by the libraries used below, so API changes will go unnoticed.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw churn export into a DataFrame and preview its first five rows.
raw_csv_path = './Expresso_churn_dataset.csv'
df = pd.read_csv(filepath_or_buffer=raw_csv_path)
df.head(5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0


In [3]:
# Summarise the frame: number of rows, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         object 
 1   REGION          object 
 2   TENURE          object 
 3   MONTANT         float64
 4   FREQUENCE_RECH  float64
 5   REVENUE         float64
 6   ARPU_SEGMENT    float64
 7   FREQUENCE       float64
 8   DATA_VOLUME     float64
 9   ON_NET          float64
 10  ORANGE          float64
 11  TIGO            float64
 12  ZONE1           float64
 13  ZONE2           float64
 14  MRG             object 
 15  REGULARITY      int64  
 16  TOP_PACK        object 
 17  FREQ_TOP_PACK   float64
 18  CHURN           int64  
dtypes: float64(12), int64(2), object(5)
memory usage: 312.2+ MB


In [4]:
### DATA CLEANING
# Remove the columns that are not kept as model inputs (modifies `df` in place).
columns_to_drop = ['user_id', 'TENURE', "TOP_PACK"]
df.drop(columns_to_drop, axis=1, inplace=True)

In [5]:
# Impute missing values in the categorical (object-dtype) columns with each
# column's most frequent value.
categorical_columns = df.select_dtypes(include=['object']).columns

for col in categorical_columns:
    # Assign the filled column back to the frame instead of calling
    # `df[col].fillna(..., inplace=True)`: that form mutates an intermediate
    # Series (chained assignment), which recent pandas versions warn about and
    # which no longer updates `df` once Copy-on-Write is active.
    most_frequent = df[col].mode()[0]  # first entry when several values tie
    df[col] = df[col].fillna(most_frequent)



In [6]:
# Impute missing values in the numerical columns with each column's median.
# The int64 selection also matches columns that cannot hold NaN (including the
# CHURN target); for those the fill is a no-op.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_columns:
    # Assign the result back rather than using `inplace=True` on `df[col]`:
    # the in-place call acts on an intermediate Series (chained assignment) and
    # is not guaranteed to reach `df` under pandas Copy-on-Write.
    df[col] = df[col].fillna(df[col].median())

In [7]:
# Count the missing values left in each column; all zeros means imputation is complete.
df.isna().sum()

REGION            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
MRG               0
REGULARITY        0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64

In [8]:
# Write the imputed frame to disk, leaving the row index out of the file.
cleaned_csv_path = 'cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)

In [9]:
# Encode every categorical column with its OWN LabelEncoder and keep each fitted
# encoder, keyed by column name. Re-fitting one shared encoder inside the loop
# throws away the mapping of every column except the last one, so the encoder
# saved afterwards could not translate the earlier columns at inference time.
label_encoders = {}

for col in categorical_columns:
    column_encoder = LabelEncoder()
    df[col] = column_encoder.fit_transform(df[col])
    label_encoders[col] = column_encoder

    # Show the mapping of original labels to encoded numbers. The code of a
    # label is its position in `classes_`, so enumerate() yields the pairs.
    print(f"Encoding for column '{col}':")
    for number, label in enumerate(column_encoder.classes_):
        print(f"  '{label}' : {number}")
    print("-" * 40)

# `le` is the name that gets pickled further down in this notebook. Bind it to
# the complete {column name: fitted encoder} dict so that the saved artifact
# carries the mapping for every encoded column, not just the last one.
# Usage after loading: le['REGION'].transform([...])
le = label_encoders


Encoding for column 'REGION':
  'DAKAR' : 0
  'DIOURBEL' : 1
  'FATICK' : 2
  'KAFFRINE' : 3
  'KAOLACK' : 4
  'KEDOUGOU' : 5
  'KOLDA' : 6
  'LOUGA' : 7
  'MATAM' : 8
  'SAINT-LOUIS' : 9
  'SEDHIOU' : 10
  'TAMBACOUNDA' : 11
  'THIES' : 12
  'ZIGUINCHOR' : 13
----------------------------------------
Encoding for column 'MRG':
  'NO' : 0
----------------------------------------


In [10]:
### Split the data into features and target variable
# The target is the CHURN column; every other remaining column is a predictor.
target_column = 'CHURN'
labels = df[target_column]
features = df.drop(target_column, axis=1)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [12]:
### Train the Random Forest Classifier
# Build a 5-tree forest with a fixed seed and fit it on the training split in
# a single expression (`fit` returns the estimator itself).
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=5).fit(X_train, y_train)
# Predict the label of every row in the held-out split.
y_pred = rf_classifier.predict(X_test)

In [13]:
# Quick sanity check on the model output before computing metrics.
y_pred[:10] # Check the first 10 predictions

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [14]:
# Report per-class precision, recall and F1 on the held-out split.
evaluation_report = classification_report(y_test, y_pred)
print(evaluation_report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.92    349773
           1       0.65      0.62      0.63     81037

    accuracy                           0.87    430810
   macro avg       0.78      0.77      0.78    430810
weighted avg       0.86      0.87      0.86    430810



In [15]:
# Persist the label-encoding object (`le`) and the trained classifier with pickle.
import pickle

# Each artifact goes to its own file, written in binary mode; the files are
# produced in the order listed.
artifacts_to_save = (
    ('label_encoder.pkl', le),
    ('rf_classifier.pkl', rf_classifier),
)
for artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)
