# =============================================================
# MILESTONE 2: Advanced Data Analysis and Feature Engineering
# =============================================================

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from scipy import stats
from scipy.stats import chi2_contingency


from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Set plotting style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

In [3]:
url = "https://raw.githubusercontent.com/Reemsoliiman/customer-churn-prediction-analysis/main/data/processed/cleaned_data.csv"
df = pd.read_csv(url)

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")


Dataset loaded: 2666 rows, 77 columns


# ========================
# 1. Advanced Data Analysis
# ========================

# -------------------------------------
# 1.1 statistical tests
# -------------------------------------

In [8]:
# T-test for Customer service calls
group1 = df[df['Churn'] == False]['Customer service calls']
group2 = df[df['Churn'] == True]['Customer service calls']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-test for Customer service calls: t={t_stat:.3f}, p={p_value:.3f}")

T-test for Customer service calls: t=-10.678, p=0.000


In [9]:
# T-test for Total day minutes
group1 = df[df['Churn'] == False]['Total day minutes']
group2 = df[df['Churn'] == True]['Total day minutes']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-test for Total day minutes: t={t_stat:.3f}, p={p_value:.3f}")

T-test for Total day minutes: t=-10.278, p=0.000


# -------------------------------------
# 1.2 identifying most relevant features
# -------------------------------------

# ========================
# 2. Feature Engineering
# ========================

# -------------------------------------
# 2.1 Create New Features
# -------------------------------------

In [3]:
#Rename column
df['Customer tenure'] = df['Account length']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 78 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Account length                2666 non-null   int64  
 1   Number vmail messages         2666 non-null   int64  
 2   Total day minutes             2666 non-null   float64
 3   Total day calls               2666 non-null   int64  
 4   Total day charge              2666 non-null   float64
 5   Total eve minutes             2666 non-null   float64
 6   Total eve calls               2666 non-null   int64  
 7   Total eve charge              2666 non-null   float64
 8   Total night minutes           2666 non-null   float64
 9   Total night calls             2666 non-null   int64  
 10  Total night charge            2666 non-null   float64
 11  Total intl minutes            2666 non-null   float64
 12  Total intl calls              2666 non-null   int64  
 13  Tot

# -------------------------------------
# 2.1.1 Usage Patterns
# -------------------------------------

In [5]:
#Average daily usage
df['average daily usage'] = df['Total_Minutes'] / df['Customer tenure']

In [6]:
#Average calls per day
df['Average calls per day'] = (
df['Total_Calls']/df['Customer tenure']
)

In [7]:
#Average minutes per call
df['Average minutes per call'] = np.where(
    df['Total_Calls'] > 0,
    df['Total_Minutes'] / df['Total_Calls'],
    0
)

# -------------------------------------
# 2.1.2 Frequency Of Interactions
# -------------------------------------

In [8]:
#Complaint indicator
df['High service calls'] = (df['Customer service calls'] > 3).astype(int)

In [9]:
#Interaction frequency index
df['Calls per tenure'] = df['Total_Calls'] / df['Customer tenure']

# -------------------------------------
# 2.2 FEATURE TRANSFORMATION
# -------------------------------------

In [10]:
#Log transform highly skewed features
for col in ['Total_Minutes', 'average daily usage', 'Average minutes per call']:
    df[f'log_{col.replace(" ", "_").lower()}'] = np.log1p(df[col])  # log1p handles zeros safely

# -------------------------------------
# 2.3 FEATURE SCALING
# -------------------------------------

In [11]:
scaler = MinMaxScaler()
numeric_features = [
    'Customer tenure',
    'Total_Minutes',
    'average daily usage',
    'Total_Calls',
    'Average calls per day',
    'Average minutes per call',
    'Calls per tenure'
]
df_scaled = df.copy()
df_scaled[numeric_features] = scaler.fit_transform(df[numeric_features])

# -------------------------------------
# 2.4 ENCODING CATEGORICAL VARIABLES
# -------------------------------------

In [12]:
#One-hot encode binary categorical variables(already encoded in milestone 1)
df_encoded =df_scaled

In [13]:
print(df_scaled.columns)

Index(['Account length', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn', 'Total_Charge', 'Total_Minutes', 'Avg_Minute_Cost',
       'Total_Calls', 'High_Customer_Service', 'State_AL', 'State_AR',
       'State_AZ', 'State_CA', 'State_CO', 'State_CT', 'State_DC', 'State_DE',
       'State_FL', 'State_GA', 'State_HI', 'State_IA', 'State_ID', 'State_IL',
       'State_IN', 'State_KS', 'State_KY', 'State_LA', 'State_MA', 'State_MD',
       'State_ME', 'State_MI', 'State_MN', 'State_MO', 'State_MS', 'State_MT',
       'State_NC', 'State_ND', 'State_NE', 'State_NH', 'State_NJ', 'State_NM',
       'State_NV', 'State_NY', 'State_OH', 'State_OK', 'State_OR', 'State_PA',
       'State_RI', 'State_SC', 'St

In [14]:
print("Final feature set shape:", df_encoded.shape)
df_encoded.head()

Final feature set shape: (2666, 86)


Unnamed: 0,Account length,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,...,Account_Length_Binned_Long,Customer tenure,average daily usage,Average calls per day,Average minutes per call,High service calls,Calls per tenure,log_total_minutes,log_average_daily_usage,log_average_minutes_per_call
0,128,25,265.1,110,45.07,197.4,99,16.78,244.7,91,...,True,0.613527,0.005248,0.002942,0.575932,0,0.002942,6.576748,1.887543,1.214021
1,107,26,161.6,123,27.47,195.5,103,16.62,254.4,103,...,False,0.512077,0.005586,0.004879,0.398252,0,0.004879,6.43967,1.923225,1.058877
2,137,0,243.4,114,41.38,121.2,110,10.3,162.6,104,...,True,0.657005,0.002899,0.003109,0.301561,0,0.003109,6.29231,1.596804,0.963106
3,84,0,299.4,71,50.9,61.9,88,5.26,196.9,89,...,False,0.400966,0.006828,0.004702,0.520081,0,0.004702,6.338241,2.044308,1.167797
4,75,0,166.7,113,28.34,148.3,122,12.61,186.9,121,...,False,0.357488,0.006973,0.009313,0.230456,0,0.009313,6.240276,2.057537,0.88632
