<a href="https://colab.research.google.com/github/Njeri-Gitome/Telco_Customer_Churn_Model/blob/main/Data_Preprocessing_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Pre-processing and Feature Engineering

In [1]:
#Libraries
import pandas as pd

# Mount Google Drive so files under /content/drive are readable (Colab only)
from google.colab import drive
drive.mount('/content/drive')

# Load the Telco customer churn CSV from the mounted drive into `data`
data = pd.read_csv('/content/drive/MyDrive/Datasets/Telco_Churn/Telco-Customer-Churn.csv')

Mounted at /content/drive


As observed during data cleaning, the data is already clean (see the EDA file).

In [2]:
# Preview the first two rows of the loaded frame
data.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [10]:
import numpy as np
# Converting TotalCharges into a numerical column
'''
The TotalCharges column is numerical hence the need to convert the data type from object to numerical
'''
def convert_total_charges_to_float(data):
  """Cast the 'TotalCharges' column of ``data`` to a numeric dtype, in place.

  Empty strings are first swapped for NaN; any other value that cannot be
  parsed as a number is coerced to NaN as well.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding a 'TotalCharges' column. It is modified in place.

  Returns
  -------
  None
  """
  column = 'TotalCharges'

  # Blank entries become missing values before the numeric cast
  without_blanks = data[column].replace('', np.nan)

  # errors='coerce' turns anything still unparseable into NaN instead of raising
  data[column] = pd.to_numeric(without_blanks, errors='coerce')


# Function call: the helper edits `data` in place and returns nothing
convert_total_charges_to_float(data)

## Handling categorical variables

In [22]:
# Change the data type of the SeniorCitizen column, as it is a categorical column
data['SeniorCitizen'] =data['SeniorCitizen'].astype('object')

In [12]:
# Categorical candidates: names of the columns whose dtype is object
data.select_dtypes(include =['object']).columns


Index(['customerID', 'MultipleLines', 'InternetService', 'Contract',
       'PaymentMethod'],
      dtype='object')

As observed, most of the data is nominal, hence the need to perform label encoding on these columns.

In [5]:
# encoding gender: Male -> 0, Female -> 1
gender_map = {'Male': 0, 'Female': 1}

# Let the codes map to themselves so that re-running this cell is a no-op.
# A plain "0 if x == 'Male' else 1" would compare the already-encoded integers
# with 'Male' and turn every row into 1 on a second run.
gender_map.update({code: code for code in list(gender_map.values())})

# An explicit lookup leaves missing or unexpected labels as NaN instead of
# silently counting them as Female.
data['gender'] = data['gender'].map(gender_map)

In [7]:
# Binary encoding "yes-no" columns
yes_no_columns = ['Partner', 'Dependents', 'PhoneService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

# Explicit label -> code table.
# - 'No internet service': in the standard Telco churn data the internet add-on
#   columns (OnlineSecurity ... StreamingMovies) take this value for customers
#   without internet (TODO confirm against this CSV). Such a customer does not
#   have the add-on, so it is encoded as 0 rather than falling through to 1
#   alongside 'Yes'.
# - 0 and 1 map to themselves, so re-running the cell leaves already-encoded
#   columns unchanged instead of turning every row into 1.
yes_no_map = {'No': 0, 'Yes': 1, 'No internet service': 0, 0: 0, 1: 1}

for col in yes_no_columns:
   # Labels missing from the table become NaN, which keeps them visible
   # instead of counting them as "Yes".
   data[col] = data[col].map(yes_no_map)

In [8]:
# Inspect the first rows of `data` after the binary encoding above
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,1,0,1,0,No phone service,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,No,DSL,1,...,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,No,DSL,1,...,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,No phone service,DSL,1,...,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,1,0,0,0,2,1,No,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


### Handling MultipleLines variable

In [13]:
# Continue on a copy: the multi-class encoders below are applied to `df`, not `data`
df = data.copy()

In [14]:
# Encoding of Multiple Lines variable
def encode_lines(dataframe, column_name):
  """Label-encode a MultipleLines-style column in place.

  Labels are mapped as 'No' -> 0, 'Yes' -> 1, 'No phone service' -> 2.
  Values that are already one of those codes are kept as they are, so calling
  the function again on an encoded column does not wipe it out. Any other
  value becomes NaN.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing the column; it is modified in place.
  column_name : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      The same ``dataframe`` object, with the column replaced by its codes.
  """
  lines_map = {'No' : 0, 'Yes': 1, 'No phone service': 2}

  # Let existing codes map to themselves; without this, a second run would look
  # up 0/1/2 among the text labels, miss, and fill the whole column with NaN.
  lookup = dict(lines_map)
  lookup.update({code: code for code in lines_map.values()})

  #mapping the labels
  dataframe[column_name] = dataframe[column_name].map(lookup)

  return dataframe

#Function call: encode the MultipleLines column of the working copy
df= encode_lines(df,'MultipleLines')

### Handling Internet Service variable

In [15]:
# Label encoding of Internet Services

def encode_internet(dataframe,column_name):
  """Label-encode an InternetService-style column in place.

  Labels are mapped as 'No' -> 0, 'DSL' -> 1, 'Fiber optic' -> 2.
  Values that are already one of those codes are kept as they are, so calling
  the function again on an encoded column does not wipe it out. Any other
  value becomes NaN.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing the column; it is modified in place.
  column_name : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      The same ``dataframe`` object, with the column replaced by its codes.
  """
  internet_map ={'No': 0, 'DSL': 1, 'Fiber optic': 2}

  # Let existing codes map to themselves; without this, a second run would look
  # up 0/1/2 among the text labels, miss, and fill the whole column with NaN.
  lookup = dict(internet_map)
  lookup.update({code: code for code in internet_map.values()})

  #mapping labels
  dataframe[column_name] = dataframe[column_name].map(lookup)

  return dataframe

#Function call: encode the InternetService column of the working copy
df = encode_internet(df, 'InternetService')


### Handling contract variable

In [18]:
def encode_contract(dataframe, column_name):
  """Label-encode a Contract-style column in place.

  Labels are mapped as 'Month-to-month' -> 0, 'One year' -> 1, 'Two year' -> 2.
  Values that are already one of those codes are kept as they are, so calling
  the function again on an encoded column does not wipe it out. Any other
  value becomes NaN.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing the column; it is modified in place.
  column_name : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      The same ``dataframe`` object, with the column replaced by its codes.
  """
  contract_map = {'Month-to-month': 0,
                  'One year': 1,
                  'Two year': 2}

  # Let existing codes map to themselves; without this, a second run would look
  # up 0/1/2 among the text labels, miss, and fill the whole column with NaN.
  lookup = dict(contract_map)
  lookup.update({code: code for code in contract_map.values()})

  #mapping labels
  dataframe[column_name] = dataframe[column_name].map(lookup)

  return dataframe

# Function call: encode the Contract column of the working copy
df = encode_contract(df, 'Contract')

### Handling Payment Method variable

In [20]:
def encode_payment(dataframe, column_name):
  """Label-encode a PaymentMethod-style column in place.

  Labels are mapped as 'Electronic check' -> 0, 'Mailed check' -> 1,
  'Bank transfer (automatic)' -> 2, 'Credit card (automatic)' -> 3.
  Values that are already one of those codes are kept as they are, so calling
  the function again on an encoded column does not wipe it out. Any other
  value becomes NaN.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing the column; it is modified in place.
  column_name : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      The same ``dataframe`` object, with the column replaced by its codes.
  """
  payment_map ={'Electronic check': 0,
                'Mailed check': 1,
                'Bank transfer (automatic)': 2,
                'Credit card (automatic)': 3}

  # Let existing codes map to themselves; without this, a second run would look
  # up 0-3 among the text labels, miss, and fill the whole column with NaN.
  lookup = dict(payment_map)
  lookup.update({code: code for code in payment_map.values()})

  #mapping labels
  dataframe[column_name] = dataframe[column_name].map(lookup)

  return dataframe

#Function call: encode the PaymentMethod column of the working copy
df = encode_payment(df, 'PaymentMethod')

In [24]:
# Save the preprocessed data into a CSV file, leaving out the DataFrame index
# NOTE(review): the path is relative, so the file lands in the current working
# directory rather than on the mounted Drive -- confirm that is intended.
df.to_csv('preprocessed_telco.csv', index =False)