In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# DATA PREPROCESSING

In [3]:
# Read the testing and training splits of the customer-churn data set from
# CSV files given as relative paths.
test_csv, train_csv = (
    "customer_churn_dataset-testing-master.csv",
    "customer_churn_dataset-training-master.csv",
)
df_test, df_train = (pd.read_csv(csv_path) for csv_path in (test_csv, train_csv))

# Preview the first ten rows of the testing split.
df_test.head(n=10)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0
5,6,30,Male,41,14,10,10,Premium,Monthly,500,29,0
6,7,47,Female,37,15,9,28,Basic,Quarterly,574,14,1
7,8,54,Female,36,11,0,18,Standard,Monthly,323,16,0
8,9,36,Male,20,5,10,8,Basic,Monthly,687,8,0
9,10,65,Male,8,4,2,23,Basic,Annual,995,10,0


In [4]:
# Preview the first ten rows of the training split.
df_train.head(n=10)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2,30,Female,39,14,5,18,Standard,Annual,932.0,17.0,1.0
1,3,65,Female,49,1,10,8,Basic,Monthly,557.0,6.0,1.0
2,4,55,Female,14,4,6,18,Basic,Quarterly,185.0,3.0,1.0
3,5,58,Male,38,21,7,7,Standard,Monthly,396.0,29.0,1.0
4,6,23,Male,32,20,5,8,Basic,Monthly,617.0,20.0,1.0
5,8,51,Male,33,25,9,26,Premium,Annual,129.0,8.0,1.0
6,9,58,Female,49,12,3,16,Standard,Quarterly,821.0,24.0,1.0
7,10,55,Female,37,8,4,15,Premium,Annual,445.0,30.0,1.0
8,11,39,Male,12,5,7,4,Standard,Quarterly,969.0,13.0,1.0
9,12,64,Female,3,25,2,11,Standard,Quarterly,415.0,29.0,1.0


In [5]:
# Report the (rows, columns) shape of each split, one line per frame.
shape_report = (
    ('test data shape: ', df_test),
    ("train data shape: ", df_train),
)
for label, frame in shape_report:
    print(label, frame.shape)

test data shape:  (64374, 12)
train data shape:  (122979, 12)


In [6]:
# Summarise the training frame: column names, non-null counts and dtypes.
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122979 entries, 0 to 122978
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         122979 non-null  int64  
 1   Age                122979 non-null  int64  
 2   Gender             122979 non-null  object 
 3   Tenure             122979 non-null  int64  
 4   Usage Frequency    122979 non-null  int64  
 5   Support Calls      122979 non-null  int64  
 6   Payment Delay      122979 non-null  int64  
 7   Subscription Type  122978 non-null  object 
 8   Contract Length    122978 non-null  object 
 9   Total Spend        122978 non-null  float64
 10  Last Interaction   122978 non-null  float64
 11  Churn              122978 non-null  float64
dtypes: float64(3), int64(6), object(3)
memory usage: 11.3+ MB
None


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training frame.
# Left as the cell's last expression so the notebook renders it as a table;
# print() would fall back to a plain-text repr that wraps wide frames into
# several column chunks.
df_train.describe()

          CustomerID            Age         Tenure  Usage Frequency  \
count  122979.000000  122979.000000  122979.000000    122979.000000   
mean    62961.060604      41.646915      30.466998        15.472568   
std     36786.858884      13.943301      17.349414         8.673194   
min         2.000000      18.000000       1.000000         1.000000   
25%     30753.500000      29.000000      15.000000         8.000000   
50%     62788.000000      42.000000      30.000000        15.000000   
75%     94879.500000      54.000000      46.000000        23.000000   
max    126924.000000      65.000000      60.000000        30.000000   

       Support Calls  Payment Delay    Total Spend  Last Interaction  \
count  122979.000000  122979.000000  122978.000000     122978.000000   
mean        5.108701      15.157246     544.078746         15.559539   
std         3.151556       8.966403     259.592167          8.656301   
min         0.000000       0.000000     100.000000          1.000000   


In [8]:
# Checking for null values: count the missing entries in each column.
missing_per_column = df_train.isna().sum()
print(missing_per_column)

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64


In [9]:
# Eliminate null values: drop every row that has at least one missing entry.
df_train_clean = df_train.dropna(axis=0, how="any")

# Verify the result by re-counting the missing entries per column.
print(df_train_clean.isna().sum())

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64


In [10]:
# Outlier detection
# Numeric features that are screened for extreme values.
numerical_columns = [
    'Age',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Total Spend',
    'Last Interaction',
]

# A value is treated as an outlier when its Z-score lies more than
# `threshold` standard deviations from zero, in either direction.
threshold = 3

# Z-scores of the selected numeric columns of the null-free training data.
z_scores = stats.zscore(df_train_clean[numerical_columns])

# Element-wise boolean flags: True wherever |z| exceeds the threshold.
outliers = np.abs(z_scores) > threshold

In [11]:
# Remove outliers from the dataset: a row is dropped as soon as any of its
# screened numeric features was flagged.
row_has_outlier = outliers.any(axis=1)
df_clean = df_train_clean.loc[~row_has_outlier]

# Preview the first rows of the filtered training data.
df_clean.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2,30,Female,39,14,5,18,Standard,Annual,932.0,17.0,1.0
1,3,65,Female,49,1,10,8,Basic,Monthly,557.0,6.0,1.0
2,4,55,Female,14,4,6,18,Basic,Quarterly,185.0,3.0,1.0
3,5,58,Male,38,21,7,7,Standard,Monthly,396.0,29.0,1.0
4,6,23,Male,32,20,5,8,Basic,Monthly,617.0,20.0,1.0
