# Feature Engineering – Telecom Customer Churn

This notebook focuses on transforming raw customer data into model-ready features.
The objectives are:
- Drop irrelevant and redundant features
- Encode categorical variables
- Scale numerical features
- Prepare data for ANN training


In [1]:
# import required libraries 

import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

In [3]:
# Read the raw customer churn CSV into a DataFrame.

DATA_PATH = "../data/raw/customer_churn.csv"

df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Display the first five rows as a quick look at the available columns.
df.head(n=5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# Drop irrelevant and redundant features.
#
# - "phone number" is a per-customer identifier.
# - The four "charge" columns look like a fixed multiple of the matching
#   "minutes" columns in the preview (e.g. 265.1 day minutes -> 45.07 charge),
#   so they appear to carry no extra information -- TODO confirm on full data.
# - "area code" and "state" are treated as irrelevant for this model.

features_to_drop = [
  "phone number",
  "total day charge",
  "total eve charge",
  "total night charge",
  "total intl charge",
  "area code",
  "state"
]

# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError on the trimmed frame.
df = df.drop(columns=features_to_drop, errors="ignore")
df.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,no,yes,25,265.1,110,197.4,99,244.7,91,10.0,3,1,False
1,107,no,yes,26,161.6,123,195.5,103,254.4,103,13.7,3,1,False
2,137,no,no,0,243.4,114,121.2,110,162.6,104,12.2,5,0,False
3,84,yes,no,0,299.4,71,61.9,88,196.9,89,6.6,7,2,False
4,75,yes,no,0,166.7,113,148.3,122,186.9,121,10.1,3,3,False


#### Encode categorical variables 

In [5]:
# Binary encoding - international plan and voice mail plan.
# Both columns hold "yes"/"no" labels in the preview; encode them as 1/0.

binary_mapping = {
  "yes" : 1,
  "no" : 0
}

binary_columns = ["international plan", "voice mail plan"]

for column in binary_columns:
    # A column that is already numeric has been encoded by an earlier run of
    # this cell; mapping it again would turn every 0/1 into NaN.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue

    encoded = df[column].map(binary_mapping)

    # Series.map yields NaN for any label missing from the mapping. Fail
    # loudly here rather than letting NaNs leak into the saved splits.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(f"Unexpected values in '{column}': {unexpected}")

    df[column] = encoded.astype("int64")

# Encoding the target variable - churn (shown as True/False in the preview)
# becomes 1/0.

df["churn"] = df["churn"].astype(int)

In [6]:
# Preview the first rows to check that the plan columns and churn are now 0/1.
df.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total eve minutes,total eve calls,total night minutes,total night calls,total intl minutes,total intl calls,customer service calls,churn
0,128,0,1,25,265.1,110,197.4,99,244.7,91,10.0,3,1,0
1,107,0,1,26,161.6,123,195.5,103,254.4,103,13.7,3,1,0
2,137,0,0,0,243.4,114,121.2,110,162.6,104,12.2,5,0,0
3,84,1,0,0,299.4,71,61.9,88,196.9,89,6.6,7,2,0
4,75,1,0,0,166.7,113,148.3,122,186.9,121,10.1,3,3,0


In [7]:
# Print column dtypes and non-null counts for the frame after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   account length          3333 non-null   int64  
 1   international plan      3333 non-null   int64  
 2   voice mail plan         3333 non-null   int64  
 3   number vmail messages   3333 non-null   int64  
 4   total day minutes       3333 non-null   float64
 5   total day calls         3333 non-null   int64  
 6   total eve minutes       3333 non-null   float64
 7   total eve calls         3333 non-null   int64  
 8   total night minutes     3333 non-null   float64
 9   total night calls       3333 non-null   int64  
 10  total intl minutes      3333 non-null   float64
 11  total intl calls        3333 non-null   int64  
 12  customer service calls  3333 non-null   int64  
 13  churn                   3333 non-null   int64  
dtypes: float64(4), int64(10)
memory usage: 3

#### Feature / target split

In [8]:
# Separate the label column ("churn") from the predictor columns.
y = df["churn"]
X = df.drop(columns=["churn"])

# Report the dimensions of both pieces.
print("Features shape: ", X.shape)
print("Target shape: ", y.shape)

Features shape:  (3333, 13)
Target shape:  (3333,)


#### Train / test split

In [9]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the churn
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the resulting split sizes.
print("Train set: ", X_train.shape)
print("Test set: ", X_test.shape)

Train set:  (2666, 13)
Test set:  (667, 13)


In [10]:
# Persist the train/test splits as CSV files (row index omitted).
# to_csv does not create missing directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

X_train.to_csv(PROCESSED_DIR / "X_train.csv", index=False)
X_test.to_csv(PROCESSED_DIR / "X_test.csv", index=False)
y_train.to_csv(PROCESSED_DIR / "y_train.csv", index=False)
y_test.to_csv(PROCESSED_DIR / "y_test.csv", index=False)