STEP 3: Create new features such as customer tenure and average daily usage

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the raw churn data directly from the project's GitHub repository.
# NOTE(review): the "-80" suffix presumably marks an 80% split of the full
# churn dataset — confirm where the remaining rows are handled.
url = "https://raw.githubusercontent.com/Reemsoliiman/customer-churn-prediction-analysis/main/data/raw/churn-bigml-80.csv"
df = pd.read_csv(url)

# Sanity check: report the dimensions of the frame that was just loaded.
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded: 2666 rows, 20 columns


1) FEATURE ENGINEERING:

A) Create New Features

In [3]:
# Add 'Customer tenure' as a copy of 'Account length'.
# This is not a rename: 'Account length' stays in the frame alongside the
# new column, so both carry the same values from here on.
df['Customer tenure'] = df['Account length']

B) Usage Patterns

In [4]:
# Total minutes used: day + evening + international + night.
# (The column is named 'Total usage per minutes', but it holds a plain sum
# of minutes, not a rate.)
df['Total usage per minutes'] = (
    df['Total day minutes']
    .add(df['Total eve minutes'])
    .add(df['Total intl minutes'])
    .add(df['Total night minutes'])
)

In [5]:
# Average usage (minutes) per unit of tenure.
# NOTE(review): "daily" assumes 'Account length' is measured in days — confirm.
#
# A tenure of 0 would make the plain division produce inf (or NaN for 0/0),
# and MinMaxScaler refuses infinite input further down. Rows with zero tenure
# are therefore set to 0, mirroring the zero-denominator guard used for
# 'Average minutes per call'. Rows with a non-zero tenure are unchanged.
df['average daily usage'] = (
    df['Total usage per minutes']
    .div(df['Customer tenure'])
    .where(df['Customer tenure'] != 0, 0)
)

In [6]:
# Total number of calls: day + evening + international + night.
# skipna=False keeps the semantics of chained `+`: a missing value in any
# of the four columns makes the row's total missing as well.
df['Total calls'] = df[
    ['Total day calls', 'Total eve calls', 'Total intl calls', 'Total night calls']
].sum(axis=1, skipna=False)

In [7]:
# Average number of calls per unit of tenure.
# NOTE(review): "per day" assumes 'Account length' is measured in days — confirm.
#
# A tenure of 0 would make the plain division produce inf (or NaN for 0/0),
# and MinMaxScaler refuses infinite input further down. Rows with zero tenure
# are therefore set to 0, mirroring the zero-denominator guard used for
# 'Average minutes per call'. Rows with a non-zero tenure are unchanged.
df['Average calls per day'] = (
    df['Total calls']
    .div(df['Customer tenure'])
    .where(df['Customer tenure'] != 0, 0)
)

In [8]:
# Mean call length in minutes: total minutes divided by total calls.
# Rows whose call count is not strictly positive get 0 instead of the
# inf/NaN that the raw division would yield.
df['Average minutes per call'] = (
    df['Total usage per minutes']
    .div(df['Total calls'])
    .where(df['Total calls'] > 0, other=0)
)

C) Frequency of Interactions

In [9]:
# Complaint indicator: 1 when the customer made more than 3 customer-service
# calls, 0 otherwise.
df['High service calls'] = df['Customer service calls'].gt(3).astype(int)

In [10]:
# Interaction frequency index: total calls per unit of tenure.
#
# NOTE(review): this is the same formula as 'Average calls per day'
# (Total calls / Customer tenure), so the two columns are identical and
# perfectly collinear. The column is kept because the scaling step lists it
# by name; consider dropping one of the two before modelling.
#
# A tenure of 0 would make the plain division produce inf (or NaN for 0/0),
# and MinMaxScaler refuses infinite input further down. Rows with zero tenure
# are therefore set to 0, mirroring the zero-denominator guard used for
# 'Average minutes per call'. Rows with a non-zero tenure are unchanged.
df['Calls per tenure'] = (
    df['Total calls']
    .div(df['Customer tenure'])
    .where(df['Customer tenure'] != 0, 0)
)

2) FEATURE TRANSFORMATION

In [11]:
# Add a log1p-transformed copy of each listed feature. The new column name is
# 'log_' + the source name lower-cased with spaces replaced by underscores
# (e.g. 'Total usage per minutes' -> 'log_total_usage_per_minutes').
# log1p(x) = log(1 + x), so zero values map to 0 rather than -inf.
# NOTE(review): the skewness of these columns is not checked in this notebook.
for col in ('Total usage per minutes', 'average daily usage', 'Average minutes per call'):
    log_name = 'log_' + col.replace(" ", "_").lower()
    df[log_name] = np.log1p(df[col])

3) FEATURE SCALING

In [12]:
# Engineered columns to rescale. Only these are touched: the raw input
# columns and the log_* columns are carried into df_scaled unscaled.
# NOTE(review): 'Average calls per day' and 'Calls per tenure' are computed
# from the same formula, so they are duplicates of each other here.
numeric_features = [
    'Customer tenure',
    'Total usage per minutes',
    'average daily usage',
    'Total calls',
    'Average calls per day',
    'Average minutes per call',
    'Calls per tenure'
]

# Learn each column's min/max, then map the columns onto MinMaxScaler's
# default range. The scaler is fitted on every row of `df`.
# NOTE(review): if a train/test split happens later, fitting on the full
# frame leaks test-set statistics — confirm against the modelling notebook.
scaler = MinMaxScaler()
scaler.fit(df[numeric_features])

# Work on a copy so the unscaled `df` stays available to later cells.
df_scaled = df.copy()
df_scaled[numeric_features] = scaler.transform(df[numeric_features])

4) ENCODING CATEGORICAL VARIABLES

In [13]:
# One-hot encode the two plan columns. With drop_first=True each plan
# collapses to a single indicator column ('International plan_Yes',
# 'Voice mail plan_Yes'); the first level is dropped as redundant.
# NOTE(review): 'State' is left as a string column — encode or drop it
# before feeding a model that needs numeric input.
plan_columns = ['International plan', 'Voice mail plan']
df_encoded = pd.get_dummies(df_scaled, columns=plan_columns, drop_first=True)

In [14]:
# Report the final frame's dimensions, then preview its first rows
# (the bare last expression is rendered as the cell's output).
print("Final feature set shape:", df_encoded.shape)
df_encoded.head()

Final feature set shape: (2666, 31)


Unnamed: 0,State,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,...,Total calls,Average calls per day,Average minutes per call,High service calls,Calls per tenure,log_total_usage_per_minutes,log_average_daily_usage,log_average_minutes_per_call,International plan_Yes,Voice mail plan_Yes
0,KS,128,415,25,265.1,110,45.07,197.4,99,16.78,...,0.497778,0.003209,0.557426,0,0.003209,6.576748,1.887543,1.214021,False,True
1,OH,107,415,26,161.6,123,27.47,195.5,103,16.62,...,0.626667,0.005146,0.385456,0,0.005146,6.43967,1.923225,1.058877,False,True
2,NJ,137,415,0,243.4,114,41.38,121.2,110,10.3,...,0.631111,0.003376,0.291872,0,0.003376,6.29231,1.596804,0.963106,False,False
3,OH,84,408,0,299.4,71,50.9,61.9,88,5.26,...,0.284444,0.004969,0.50337,0,0.004969,6.338241,2.044308,1.167797,True,False
4,OK,75,415,0,166.7,113,28.34,148.3,122,12.61,...,0.746667,0.009579,0.223051,0,0.009579,6.240276,2.057537,0.88632,True,False
