In [18]:
# ---------------------------------------------------------------------------
# ACS WIL Data Analytics Project - Customer Churn Analysis
# ---------------------------------------------------------------------------

# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the preprocessed churn dataset from its CSV file into a DataFrame.
# A comma is read_csv's default delimiter, so it does not need to be passed.
dataFile = './Preprocessed_Dataset/Dataset.csv'
dataFrame = pd.read_csv(dataFile)

# Sanity check: report how many instances (rows) and attributes (cols) were loaded.
instanceCount, attributeCount = dataFrame.shape
print('Number of Instances =', instanceCount)
print('Number of Attributes =', attributeCount)

Number of Instances = 7043
Number of Attributes = 10


In [19]:
# Understanding the dataset: column names, non-null counts and dtypes.
# DataFrame.info() prints its report directly and returns None, so there is
# nothing useful to store from the call.
dataFrame.info()
print("-----------------------------------------------------------------")
dataFrame.head(10)  # first 10 rows of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           7043 non-null   object 
 1   SeniorCitizen    7043 non-null   int64  
 2   Dependents       7043 non-null   object 
 3   tenure           7043 non-null   int64  
 4   PhoneService     7043 non-null   object 
 5   MultipleLines    7043 non-null   object 
 6   InternetService  7043 non-null   object 
 7   Contract         7043 non-null   object 
 8   MonthlyCharges   7043 non-null   float64
 9   Churn            7043 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 550.4+ KB
-----------------------------------------------------------------


Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,29.85,No
1,Male,0,No,34,Yes,No,DSL,One year,56.95,No
2,Male,0,No,2,Yes,No,DSL,Month-to-month,53.85,Yes
3,Male,0,No,45,No,No,DSL,One year,42.3,No
4,Female,0,No,2,Yes,No,Fiber optic,Month-to-month,70.7,Yes
5,Female,0,No,8,Yes,Yes,Fiber optic,Month-to-month,99.65,Yes
6,Male,0,Yes,22,Yes,Yes,Fiber optic,Month-to-month,89.1,No
7,Female,0,No,10,No,No,DSL,Month-to-month,29.75,No
8,Female,0,No,28,Yes,Yes,Fiber optic,Month-to-month,104.8,Yes
9,Male,0,Yes,62,Yes,No,DSL,One year,56.15,No


In [20]:
# check if there are any missing data points

attributes = dataFrame.columns

# Total number of missing values across ALL attributes.
# isnull() flags every missing cell, the first sum() counts them per column and
# the second sum() adds those per-column counts together. (Assigning the
# per-column count inside a loop would keep only the last column's count.)
totalMissingValues = dataFrame[attributes].isnull().sum().sum()

print("Total Missing Data:", totalMissingValues)

Total Missing Data: 0


In [21]:
# Per-column breakdown of missing data points: flag every missing cell,
# then count the flagged cells down each column.
missingMask = dataFrame.isna()
missing_values = missingMask.sum(axis=0)
missing_values

gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
dtype: int64

In [22]:
# Look at the distinct values held by each column,
# e.g. dataFrame['Dependents'].unique() for a single column.
for att, columnValues in dataFrame[attributes].items():
    print("Column:", att, "-->", columnValues.unique())
        


Column: gender --> ['Female' 'Male']
Column: SeniorCitizen --> [0 1]
Column: Dependents --> ['No' 'Yes']
Column: tenure --> [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Column: PhoneService --> ['No' 'Yes']
Column: MultipleLines --> ['No' 'Yes']
Column: InternetService --> ['DSL' 'Fiber optic']
Column: Contract --> ['Month-to-month' 'One year' 'Two year']
Column: MonthlyCharges --> [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
Column: Churn --> ['No' 'Yes']


In [23]:
# encoding categorical variables

# binary mapping (binary categorical): label -> 0/1 code for each two-valued column
binaryMappings = {
    'gender': {'Female': 0, 'Male': 1},
    'Dependents': {'No': 0, 'Yes': 1},
    'PhoneService': {'No': 0, 'Yes': 1},
    'MultipleLines': {'No': 0, 'Yes': 1},
    'InternetService': {'DSL': 0, 'Fiber optic': 1},
    'Churn': {'No': 0, 'Yes': 1},
}

for column, mapping in binaryMappings.items():
    # Series.map turns every value that is not a key of `mapping` into NaN, so
    # mapping a column that is already encoded (e.g. when this cell is run a
    # second time) would wipe it out. Skip columns that already contain only
    # the target codes, which makes the cell safe to re-run.
    if dataFrame[column].isin(list(mapping.values())).all():
        continue
    dataFrame[column] = dataFrame[column].map(mapping)

# one-hot encoding (multi-class categorical variables)
# dtype=int keeps the indicator columns as 0/1 integers; newer pandas versions
# otherwise return boolean (True/False) indicator columns by default.
encoded = pd.get_dummies(dataFrame, columns = ['Contract'], dtype = int)

# displaying the final encoded version of the dataset
encoded


Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,MonthlyCharges,Churn,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,0,0,1,0,0,0,29.85,0,1,0,0
1,1,0,0,34,1,0,0,56.95,0,0,1,0
2,1,0,0,2,1,0,0,53.85,1,1,0,0
3,1,0,0,45,0,0,0,42.30,0,0,1,0
4,0,0,0,2,1,0,1,70.70,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,24,1,1,0,84.80,0,0,1,0
7039,0,0,1,72,1,1,1,103.20,0,0,1,0
7040,0,0,1,11,0,0,0,29.60,0,1,0,0
7041,1,1,0,4,1,1,1,74.40,1,1,0,0


In [24]:
# TODO: apply an appropriate scaling technique (e.g. the MinMaxScaler imported above) to normalise the data and enhance model performance.



In [25]:
# TODO: split the dataset into distinct training and testing sets
# (possibly for cross-validation).
# Open question: should the training and testing sets be saved into their own folders?

