In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505206 entries, 0 to 505206
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Age                    505206 non-null  float64
 1   Gender                 505206 non-null  object 
 2   Tenure                 505206 non-null  float64
 3   Usage Frequency        505206 non-null  float64
 4   Support Calls          505206 non-null  float64
 5   Payment Delay          505206 non-null  float64
 6   Total Spend            505206 non-null  float64
 7   Last Interaction       505206 non-null  float64
 8   Churn                  505206 non-null  float64
 9   Subscription_Contract  505206 non-null  int64  
dtypes: float64(8), int64(1), object(1)
memory usage: 42.4+ MB


In [3]:
from sklearn.preprocessing import ( StandardScaler,
                                    OneHotEncoder, OrdinalEncoder
                                    )

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import(   
                                Pipeline,
                                make_pipeline
                            )

from sklearn.ensemble import RandomForestClassifier     

from sklearn.model_selection import (   train_test_split,
                                        StratifiedKFold
                                    )



In [4]:
df= pd.concat([
    pd.read_csv('customer_churn_dataset-training-master.csv'),
    pd.read_csv('customer_churn_dataset-testing-master.csv')
],
    axis=0)

df.reset_index(drop= True, inplace= True)
df= df.dropna()

In [5]:
subscription_map = {
    'Basic': 3,
    'Premium': 1,
    'Standard': 2
}

contract_map = {
    'Annual': 2,
    'Monthly': 3,
    'Quarterly': 1
}
df['Subscription Type']=df['Subscription Type'].map(subscription_map)
df['Contract Length']=df['Contract Length'].map(contract_map)

df['Subscription_Contract'] = (df['Subscription Type']  + (df['Contract Length']-1)**2) 

In [6]:
Irr_cols= ['CustomerID', 'Subscription Type', 'Contract Length']
df=df.drop(columns= Irr_cols, axis=1)

In [7]:
numerical_features = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction', 'Subscription_Contract']
categorical_features_ohe = ['Gender']

num_pipeline= Pipeline(steps= [
    ('scale' , StandardScaler())
])

ohe_pipeline= Pipeline(steps= [
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output= False))
])

column_transformer= ColumnTransformer(transformers= [
    ('numeric_pipeline', num_pipeline, numerical_features),
    ('one_hot_pipeline', ohe_pipeline, categorical_features_ohe)
    ],
    remainder= 'drop',
    n_jobs= -1
)

In [8]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=42,  stratify=y) 

In [10]:
rf = RandomForestClassifier(n_estimators=500, bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=10)

In [11]:
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', rf)
])

In [12]:
pipeline.fit(X_train, y_train)

In [13]:
pipeline.score(X_test,y_test)

0.9366698996456919

In [14]:
y_pred= pipeline.predict(X_test)

In [22]:
y_pred_proba= pipeline.predict_proba(X_test)[:,1]

In [24]:
results_dict= {
    'y_true' : y_test,
    'y_pred' : y_pred,
    'y_pred_proba' : y_pred_proba
}

In [25]:
y_test.shape

(101042,)

In [26]:
y_pred.shape

(101042,)

In [27]:
y_pred_proba.shape

(101042,)

In [37]:
resultsdf= pd.DataFrame(results_dict)

In [38]:
resultsdf.head()

Unnamed: 0,y_true,y_pred,y_pred_proba
173749,1.0,1.0,0.72947
473413,0.0,1.0,0.862705
345623,0.0,0.0,0.0
189094,1.0,1.0,0.99527
35854,1.0,1.0,0.846613


In [39]:
resultsdf=resultsdf.reset_index()

In [41]:
resultsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101042 entries, 0 to 101041
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         101042 non-null  int64  
 1   y_true        101042 non-null  float64
 2   y_pred        101042 non-null  float64
 3   y_pred_proba  101042 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 3.1 MB


In [42]:
resultsdf= resultsdf.drop(columns=['index'], axis=1)

In [43]:
resultsdf.to_csv('FINAL_RF_PREDICTIONS.csv', index= False)

In [44]:
import pickle

In [45]:
with open('RF_MODEL_FINALE.pkl', 'wb') as rf_finale:
    pickle.dump(pipeline, rf_finale)