In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from IPython.display import display

In [2]:
# Read the raw churn CSV and print a quick overview of it:
# shape, first rows, per-column dtypes and summary statistics.
df = pd.read_csv('Customer Churn.csv')
print('Loaded', df.shape)
display(df.head())

print('\nColumns and dtypes:')
dtype_table = df.dtypes.to_frame('dtype')
display(dtype_table)

print('\nBasic describe:')
# Transposed so that every original column becomes one row of the summary.
summary_table = df.describe(include='all').T
display(summary_table)

Loaded (3150, 14)


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0



Columns and dtypes:


Unnamed: 0,dtype
Call Failure,int64
Complains,int64
Subscription Length,int64
Charge Amount,int64
Seconds of Use,int64
Frequency of use,int64
Frequency of SMS,int64
Distinct Called Numbers,int64
Age Group,int64
Tariff Plan,int64



Basic describe:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Call Failure,3150.0,7.627937,7.263886,0.0,1.0,6.0,12.0,36.0
Complains,3150.0,0.076508,0.265851,0.0,0.0,0.0,0.0,1.0
Subscription Length,3150.0,32.541905,8.573482,3.0,30.0,35.0,38.0,47.0
Charge Amount,3150.0,0.942857,1.521072,0.0,0.0,0.0,1.0,10.0
Seconds of Use,3150.0,4472.459683,4197.908687,0.0,1391.25,2990.0,6478.25,17090.0
Frequency of use,3150.0,69.460635,57.413308,0.0,27.0,54.0,95.0,255.0
Frequency of SMS,3150.0,73.174921,112.23756,0.0,6.0,21.0,87.0,522.0
Distinct Called Numbers,3150.0,23.509841,17.217337,0.0,10.0,21.0,34.0,97.0
Age Group,3150.0,2.826032,0.892555,1.0,2.0,3.0,3.0,5.0
Tariff Plan,3150.0,1.077778,0.267864,1.0,1.0,1.0,1.0,2.0


In [3]:
# Normalize column names: remove leading/trailing whitespace and collapse
# any internal run of whitespace into a single space.
old_cols = list(df.columns)
# str.split() with no separator already discards surrounding whitespace,
# so re-joining the pieces yields the trimmed, single-spaced name.
df.columns = [" ".join(name.split()) for name in old_cols]

print('Columns normalized (sample):')
rename_preview = pd.DataFrame({'before': old_cols, 'after': list(df.columns)})
display(rename_preview.head())

Columns normalized (sample):


Unnamed: 0,before,after
0,Call Failure,Call Failure
1,Complains,Complains
2,Subscription Length,Subscription Length
3,Charge Amount,Charge Amount
4,Seconds of Use,Seconds of Use


In [4]:
# Trim whitespace from object columns.
#
# Non-missing values are converted to str and stripped. Missing values
# (NaN / None) are deliberately left untouched: a plain astype(str) would turn
# them into the literal strings 'nan' / 'None', which hides them from every
# later isna()/fillna() based imputation step.
obj_cols = df.select_dtypes(include=[object]).columns.tolist()
# show a sample of the first object column before trimming, if present
if obj_cols:
    c0 = obj_cols[0]
    print('Sample before trimming (first 5 of', c0, '):')
    display(df[c0].head().astype(str))
for c in obj_cols:
    is_missing = df[c].isna()
    trimmed = df[c].astype(str).str.strip()
    # Keep the original entry where it was missing, use the trimmed text elsewhere.
    df[c] = df[c].where(is_missing, trimmed)
print('Trimmed object columns (count):', len(obj_cols))
print('Dtypes after trim:')
display(df[obj_cols].dtypes.to_frame('dtype'))

Trimmed object columns (count): 0
Dtypes after trim:


Unnamed: 0,dtype


In [5]:
# Convert object columns to numeric when at least half of the rows parse as
# numbers once commas are removed. Entries that fail to parse become NaN.
coerced_cols = []
majority_threshold = 0.5 * len(df)
for col_name in list(obj_cols):
    as_number = pd.to_numeric(df[col_name].str.replace(',', ''), errors='coerce')
    parsed_count = as_number.notna().sum()
    if parsed_count < majority_threshold:
        # Fewer than half of the rows are numeric: leave the column as it is.
        continue
    df[col_name] = as_number
    coerced_cols.append(col_name)
    print(f'Coerced {col_name} to numeric, non-numeric dropped: {len(df)-parsed_count}')

print('Total coerced columns:', coerced_cols)
print('Dtypes update (sample):')
if coerced_cols:
    display(df[coerced_cols].dtypes.to_frame('dtype'))
else:
    display('No coerced columns')

Total coerced columns: []
Dtypes update (sample):


'No coerced columns'

In [7]:
# Partition the columns: anything with a numeric dtype is "numeric",
# every remaining column is treated as "categorical".
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = []
for name in df.columns:
    if name not in num_cols:
        cat_cols.append(name)

print('Numeric cols (count):', len(num_cols))
print('Categorical cols (count):', len(cat_cols))

# Wrapping each list in a Series lets the two samples have different lengths;
# the shorter one is padded with NaN instead of raising.
column_preview = pd.DataFrame({
    'numeric_sample': pd.Series(num_cols[:10]),
    'categorical_sample': pd.Series(cat_cols[:10]),
})
display(column_preview)

Numeric cols (count): 14
Categorical cols (count): 0


Unnamed: 0,numeric_sample,categorical_sample
0,Call Failure,
1,Complains,
2,Subscription Length,
3,Charge Amount,
4,Seconds of Use,
5,Frequency of use,
6,Frequency of SMS,
7,Distinct Called Numbers,
8,Age Group,
9,Tariff Plan,


In [8]:
# Ensure Churn is integer.
#
# Churn is the prediction target. Rows whose label is missing or cannot be
# parsed as a number are reported and dropped rather than silently relabelled
# as 0 ("no churn"), which would inject wrong labels into the data.
if 'Churn' in df.columns:
    churn_numeric = pd.to_numeric(df['Churn'], errors='coerce')
    unlabeled = churn_numeric.isna()
    n_unlabeled = int(unlabeled.sum())
    if n_unlabeled:
        print(f'Warning: dropping {n_unlabeled} rows with missing or non-numeric Churn')
        # .copy() so later column assignments operate on an independent frame.
        df = df.loc[~unlabeled].copy()
        churn_numeric = churn_numeric.loc[~unlabeled]
    df['Churn'] = churn_numeric.astype(int)
    print('Churn converted to int')
    print('Churn distribution:')
    display(df['Churn'].value_counts(dropna=False).to_frame('count'))
else:
    print('Warning: Churn column not found')

Churn converted to int
Churn distribution:


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,2655
1,495


In [9]:
# Fill missing values in every numeric column with that column's median.
# The numeric column list is recomputed first because earlier steps may have
# changed dtypes.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split performed later in this notebook — confirm that is intended.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print('Numeric cols to impute (count):', len(num_cols))
for col_name in num_cols:
    n_missing = df[col_name].isna().sum()
    df[col_name] = df[col_name].fillna(df[col_name].median())
    if n_missing > 0:
        # Only columns that actually had gaps are reported.
        n_remaining = df[col_name].isna().sum()
        print(f'{col_name}: missing {n_missing} -> {n_remaining}')
print('Imputed numeric columns with median')

Numeric cols to impute (count): 14
Imputed numeric columns with median


In [10]:
# Impute categorical columns with 'missing' and cast to category.
#
# "Categorical" here means every column that is not numeric. The cell is safe
# to re-run: a column that is already of category dtype is converted back to
# object before filling, because fillna('missing') on a categorical column
# raises when 'missing' is not one of its existing categories.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in df.columns if c not in num_cols]
print('Categorical columns (count):', len(cat_cols))
for c in cat_cols:
    col = df[c]
    if isinstance(col.dtype, pd.CategoricalDtype):
        col = col.astype(object)
    missing_before = col.isna().sum()
    df[c] = col.fillna('missing').astype('category')
    if missing_before > 0:
        print(f'{c}: filled {missing_before} missing')
print('Imputed categorical columns and cast to category')
# show up to ten categories for the first few categorical columns
for c in cat_cols[:5]:
    print(c, '->', df[c].cat.categories[:10])

Categorical columns (count): 0
Imputed categorical columns and cast to category


In [11]:
# For each candidate column that exists and is numeric, add a companion
# "<name> (log1p)" column holding log(1 + x).
skew_candidates = ['Charge Amount', 'Customer Value', 'Seconds of Use', 'Frequency of use', 'Frequency of SMS', 'Distinct Called Numbers']
added = []
for source_col in skew_candidates:
    if source_col not in df.columns:
        continue
    if not pd.api.types.is_numeric_dtype(df[source_col]):
        continue
    log_col = f"{source_col} (log1p)"
    # Negative values are clipped to 0 before the transform is applied.
    non_negative = df[source_col].clip(lower=0)
    df[log_col] = np.log1p(non_negative)
    added.append(log_col)
    print('Added log1p for', source_col)
print('New log columns:', added)

Added log1p for Charge Amount
Added log1p for Customer Value
Added log1p for Seconds of Use
Added log1p for Frequency of use
Added log1p for Frequency of SMS
Added log1p for Distinct Called Numbers
New log columns: ['Charge Amount (log1p)', 'Customer Value (log1p)', 'Seconds of Use (log1p)', 'Frequency of use (log1p)', 'Frequency of SMS (log1p)', 'Distinct Called Numbers (log1p)']


In [12]:
# Summarize the prepared frame: shape, dtypes, target balance and first rows.
print('Final shape:', df.shape)

print('Final dtypes (sample):')
final_dtypes = df.dtypes.to_frame('dtype')
display(final_dtypes)

print('Churn distribution:')
# dropna=False so that any missing target values would appear in the counts.
churn_counts = df['Churn'].value_counts(dropna=False).to_frame('count')
display(churn_counts)

display(df.head())

Final shape: (3150, 20)
Final dtypes (sample):


Unnamed: 0,dtype
Call Failure,int64
Complains,int64
Subscription Length,int64
Charge Amount,int64
Seconds of Use,int64
Frequency of use,int64
Frequency of SMS,int64
Distinct Called Numbers,int64
Age Group,int64
Tariff Plan,int64


Churn distribution:


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,2655
1,495


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn,Charge Amount (log1p),Customer Value (log1p),Seconds of Use (log1p),Frequency of use (log1p),Frequency of SMS (log1p),Distinct Called Numbers (log1p)
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0,0.0,5.291494,8.382747,4.276666,1.791759,2.890372
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0,0.0,3.850892,5.765191,1.791759,2.079442,1.609438
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0,0.0,7.337926,7.805475,4.110874,5.886104,3.218876
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0,0.0,5.48488,8.342602,4.204693,0.693147,3.583519
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0,0.0,4.989105,7.780721,4.077537,1.098612,3.526361


In [13]:
# Split the prepared frame 80/20 into train and test sets, stratified on Churn
# with a fixed random_state, then write both parts to CSV without the index.
if 'Churn' not in df.columns:
    raise KeyError('Churn column required for stratified split')

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Churn'],
)
print('Train shape:', train_df.shape, 'Test shape:', test_df.shape)

# Show the class counts of each part to confirm the stratification.
for heading, part in (
    ('Train churn distribution:', train_df),
    ('Test churn distribution:', test_df),
):
    print(heading)
    display(part['Churn'].value_counts().to_frame('count'))

for part, out_path in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    part.to_csv(out_path, index=False)
print('Saved train.csv and test.csv')

Train shape: (2520, 20) Test shape: (630, 20)
Train churn distribution:


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,2124
1,396


Test churn distribution:


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
0,531
1,99


Saved train.csv and test.csv
