In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [12]:
n_samples = 1000

data = {
    'customer_id': range(1, n_samples + 1),
    'age': np.random.randint(18, 80, n_samples),
    'income': np.random.normal(50000, 20000, n_samples),
    'account_balance': np.random.normal(5000, 3000, n_samples),
    'tenure_months': np.random.randint(1, 120, n_samples),
    'num_products': np.random.randint(1, 5, n_samples),
    'credit_score': np.random.randint(300, 850, n_samples),
    'gender': np.random.choice(['Male', 'Female', 'Other'], n_samples),
    'location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples),
    'customer_service_calls': np.random.randint(0, 10, n_samples),
    'churned': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
}

df = pd.DataFrame(data)


In [13]:
# Introduce data quality issues
# 1. Missing values
df.loc[np.random.choice(df.index, 50), 'age'] = np.nan
df.loc[np.random.choice(df.index, 80), 'income'] = np.nan
df.loc[np.random.choice(df.index, 30), 'credit_score'] = np.nan
df.loc[np.random.choice(df.index, 20), 'gender'] = np.nan

# 2. Outliers
df.loc[np.random.choice(df.index, 10), 'income'] = np.random.uniform(200000, 500000, 10)
df.loc[np.random.choice(df.index, 5), 'account_balance'] = np.random.uniform(-10000, -5000, 5)

# 3. Inconsistent data
df.loc[np.random.choice(df.index, 10), 'gender'] = df.loc[np.random.choice(df.index, 10), 'gender'].str.lower()

# 4. Duplicates
duplicate_rows = df.sample(5)
df = pd.concat([df, duplicate_rows], ignore_index=True)

print(f"\nDataset created with {len(df)} rows and {len(df.columns)} columns")
print("\nFirst few rows:")
print(df.head())



Dataset created with 1005 rows and 11 columns

First few rows:
   customer_id   age        income  account_balance  tenure_months  \
0            1  56.0  77447.555931      4737.085227              9   
1            2  69.0  36444.278140      5458.153698             36   
2            3  46.0  73070.637440      6764.599805             83   
3            4  32.0  42499.787433      6526.004749              1   
4            5  60.0  36117.080930     -2295.140714             10   

   num_products  credit_score  gender  location  customer_service_calls  \
0             1         699.0    Male     Rural                       6   
1             2         751.0  Female  Suburban                       6   
2             4         403.0    Male  Suburban                       3   
3             1         761.0   Other     Rural                       8   
4             1         695.0  Female     Urban                       7   

   churned  
0        0  
1        0  
2        1  
3        0  

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005 entries, 0 to 1004
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             1005 non-null   int64  
 1   age                     956 non-null    float64
 2   income                  927 non-null    float64
 3   account_balance         1005 non-null   float64
 4   tenure_months           1005 non-null   int64  
 5   num_products            1005 non-null   int64  
 6   credit_score            977 non-null    float64
 7   gender                  974 non-null    object 
 8   location                1005 non-null   object 
 9   customer_service_calls  1005 non-null   int64  
 10  churned                 1005 non-null   int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 86.5+ KB


In [15]:
df.shape

(1005, 11)

In [24]:
df.columns

Index(['customer_id', 'age', 'income', 'account_balance', 'tenure_months',
       'num_products', 'credit_score', 'gender', 'location',
       'customer_service_calls', 'churned'],
      dtype='object')

In [16]:
df.describe()

Unnamed: 0,customer_id,age,income,account_balance,tenure_months,num_products,credit_score,customer_service_calls,churned
count,1005.0,956.0,927.0,1005.0,1005.0,1005.0,977.0,1005.0,1005.0
mean,500.976119,50.08682,54186.821944,5074.146555,59.973134,2.536318,578.943705,4.555224,0.266667
std,289.239395,18.124,39006.384482,3169.170188,33.712286,1.118446,161.728217,2.958239,0.442437
min,1.0,18.0,-21364.105987,-9867.765487,1.0,1.0,300.0,0.0,0.0
25%,250.0,35.0,37452.705739,3011.645596,32.0,2.0,433.0,2.0,0.0
50%,501.0,50.0,51653.119189,5178.658905,59.0,3.0,572.0,5.0,0.0
75%,751.0,66.0,65579.159507,7204.68051,88.0,4.0,727.0,7.0,1.0
max,1000.0,79.0,445063.395768,14763.941879,119.0,4.0,849.0,9.0,1.0


In [17]:
df.dtypes

customer_id                 int64
age                       float64
income                    float64
account_balance           float64
tenure_months               int64
num_products                int64
credit_score              float64
gender                     object
location                   object
customer_service_calls      int64
churned                     int64
dtype: object

In [18]:
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing,
    'Percentage': missing_percent
})
print(missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))


              Missing_Count  Percentage
income                   78    7.761194
age                      49    4.875622
gender                   31    3.084577
credit_score             28    2.786070


In [19]:
print("\n3.2 Duplicate Rows:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print("\nDuplicate rows:")
    print(df[df.duplicated(keep=False)].sort_values('customer_id'))



3.2 Duplicate Rows:
Number of duplicate rows: 5

Duplicate rows:
      customer_id   age         income  account_balance  tenure_months  \
153           154  45.0   62876.052034      7204.680510            113   
1002          154  45.0   62876.052034      7204.680510            113   
201           202  54.0            NaN      3228.565794              9   
1000          202  54.0            NaN      3228.565794              9   
732           733  67.0   62360.574220      2987.491890             75   
1001          733  67.0   62360.574220      2987.491890             75   
924           925  58.0            NaN      4457.718531             89   
1004          925  58.0            NaN      4457.718531             89   
966           967  50.0  445063.395768       959.712774             72   
1003          967  50.0  445063.395768       959.712774             72   

      num_products  credit_score gender  location  customer_service_calls  \
153              2         559.0    NaN   

In [20]:
print("\n3.3 Unique Values per Column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")


3.3 Unique Values per Column:
customer_id: 1000 unique values
age: 62 unique values
income: 924 unique values
account_balance: 1000 unique values
tenure_months: 119 unique values
num_products: 4 unique values
credit_score: 451 unique values
gender: 3 unique values
location: 3 unique values
customer_service_calls: 10 unique values
churned: 2 unique values


In [21]:
print("\n3.4 Value Counts for Categorical Variables:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())


3.4 Value Counts for Categorical Variables:

gender:
gender
Female    335
Male      326
Other     313
Name: count, dtype: int64

location:
location
Suburban    353
Rural       333
Urban       319
Name: count, dtype: int64


In [22]:
print("\n4.1 Removing duplicates...")
df_clean = df.drop_duplicates()
print(f"Rows after removing duplicates: {len(df_clean)}")


4.1 Removing duplicates...
Rows after removing duplicates: 1000


In [23]:
print("\n4.2 Standardizing categorical values...")
df_clean['gender'] = df_clean['gender'].str.title()
print("Gender values after standardization:")
print(df_clean['gender'].value_counts())


4.2 Standardizing categorical values...
Gender values after standardization:
gender
Female    335
Male      322
Other     313
Name: count, dtype: int64


In [40]:
# 4.3 Handle negative account balances (domain-specific cleaning)
print("\n4.3 Handling negative account balances...")
print(f"Rows with negative balance: {(df_clean['account_balance'] < 0).sum()}")
df_clean['account_balance'] = df_clean['account_balance'].clip(lower=0)



4.3 Handling negative account balances...
Rows with negative balance: 0


In [41]:
# ============================================
# STEP 5: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================
print("\n" + "="*60)
print("STEP 5: EXPLORATORY DATA ANALYSIS")
print("="*60)


STEP 5: EXPLORATORY DATA ANALYSIS


In [45]:
print("\n5.1 Target Variable Distribution:")
print(df_clean['churned'].value_counts())
print(f"\nChurn Rate: {df_clean['churned'].mean()*100:.2f}%")



5.1 Target Variable Distribution:
churned
0    733
1    267
Name: count, dtype: int64

Churn Rate: 26.70%


In [46]:
print("\n5.2 Correlation Analysis:")
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = df_clean[numeric_cols].corr()
print("\nCorrelation with target variable 'churned':")
print(correlation_matrix['churned'].sort_values(ascending=False))



5.2 Correlation Analysis:

Correlation with target variable 'churned':
churned                   1.000000
customer_id               0.025860
account_balance           0.004023
num_products             -0.004407
income                   -0.006685
tenure_months            -0.017403
credit_score             -0.019715
customer_service_calls   -0.024986
age                      -0.038009
Name: churned, dtype: float64


In [48]:
print("\n" + "="*60)
print("STEP 6: OUTLIER DETECTION AND HANDLING")
print("="*60)

def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Detect outliers in numerical columns
numerical_columns = ['age', 'income', 'account_balance', 'credit_score']
for col in numerical_columns:
    outliers, lower, upper = detect_outliers_iqr(df_clean, col)
    print(f"\n{col}:")
    print(f"  Outliers detected: {len(outliers)}")
    print(f"  Lower bound: {lower:.2f}, Upper bound: {upper:.2f}")



STEP 6: OUTLIER DETECTION AND HANDLING

age:
  Outliers detected: 0
  Lower bound: -11.50, Upper bound: 112.50

income:
  Outliers detected: 17
  Lower bound: -4652.42, Upper bound: 107604.89

account_balance:
  Outliers detected: 3
  Lower bound: -3246.96, Upper bound: 13477.90

credit_score:
  Outliers detected: 0
  Lower bound: -8.38, Upper bound: 1168.62


In [49]:
# Handle outliers by capping (winsorization)
print("\n6.1 Handling outliers using winsorization...")
for col in numerical_columns:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR

    df_clean[col] = df_clean[col].clip(lower= lower_bound, upper= upper_bound)



6.1 Handling outliers using winsorization...


In [50]:
print("\n" + "="*60)
print("STEP 7: MISSING VALUE IMPUTATION")
print("="*60)

df_imputed = df_clean.copy()


STEP 7: MISSING VALUE IMPUTATION


In [52]:
# 7.1 Impute numerical variables
print("\n7.1 Imputing numerical variables...")

# Mean imputation for age
mean_imputer = SimpleImputer(strategy='mean')
df_imputed['age'] = mean_imputer.fit_transform(df_imputed[['age']])

# Median imputation for income (better for skewed data)
median_imputer = SimpleImputer(strategy='median')
df_imputed['income'] = median_imputer.fit_transform(df_imputed[['income']])

# KNN imputation for credit_score
knn_imputer = KNNImputer(n_neighbors=5)
df_imputed['credit_score'] = knn_imputer.fit_transform(df_imputed[['credit_score']])

print("Missing values after numerical imputation:")
print(df_imputed[numerical_columns].isnull().sum())

# 7.2 Impute categorical variables
print("\n7.2 Imputing categorical variables...")

# Mode imputation for gender
mode_imputer = SimpleImputer(strategy='most_frequent')
df_imputed['gender'] = mode_imputer.fit_transform(df_imputed[['gender']]).ravel()

print("Missing values after categorical imputation:")
print(df_imputed['gender'].isnull().sum())



7.1 Imputing numerical variables...
Missing values after numerical imputation:
age                0
income             0
account_balance    0
credit_score       0
dtype: int64

7.2 Imputing categorical variables...
Missing values after categorical imputation:
0


In [None]:
print("\n7.3 Final missing value check:")
print (df_imputed.isnull().sum())

customer_id               0
age                       0
income                    0
account_balance           0
tenure_months             0
num_products              0
credit_score              0
gender                    0
location                  0
customer_service_calls    0
churned                   0
dtype: int64


In [54]:
 #============================================
# STEP 8: FEATURE ENGINEERING
# ============================================
print("\n" + "="*60)
print("STEP 8: FEATURE ENGINEERING")
print("="*60)



STEP 8: FEATURE ENGINEERING


In [55]:
# Create age groups
df_imputed['age_group'] = pd.cut(df_imputed['age'], 
                                  bins=[0, 30, 50, 70, 100], 
                                  labels=['Young', 'Middle', 'Senior', 'Elderly'])

df_imputed['income_bracket'] = pd.cut(df_imputed['income'], 
                                       bins=[0, 30000, 60000, 100000, np.inf], 
                                       labels=['Low', 'Medium', 'High', 'Very High'])

# Create balance to income ratio
df_imputed['balance_to_income_ratio'] = df_imputed['account_balance'] / (df_imputed['income'] + 1)


In [56]:
 #Create customer value score
df_imputed['customer_value_score'] = (
    df_imputed['tenure_months'] * 0.3 + 
    df_imputed['num_products'] * 20 + 
    df_imputed['account_balance'] * 0.01
)

print("New features created:")
print(df_imputed[['age_group', 'income_bracket', 'balance_to_income_ratio', 'customer_value_score']].head())


New features created:
  age_group income_bracket  balance_to_income_ratio  customer_value_score
0    Senior           High                 0.061164             70.070852
1    Senior         Medium                 0.149763            105.381537
2    Middle           High                 0.092575            172.545998
3    Middle         Medium                 0.153550             85.560047
4    Senior         Medium                 0.000000             23.000000


In [58]:
# ============================================
# STEP 9: ENCODING CATEGORICAL VARIABLES
# ============================================
print("\n" + "="*60)
print("STEP 9: ENCODING CATEGORICAL VARIABLES")
print("="*60)

df_encoded = df_imputed.copy()


STEP 9: ENCODING CATEGORICAL VARIABLES


In [59]:
# 9.1 Label Encoding for ordinal variables
print("\n9.1 Label Encoding for binary/ordinal variables...")
le_gender = LabelEncoder()
df_encoded['gender_encoded'] = le_gender.fit_transform(df_encoded['gender'])
print(f"Gender mapping: {dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_)))}")



9.1 Label Encoding for binary/ordinal variables...
Gender mapping: {'Female': np.int64(0), 'Male': np.int64(1), 'Other': np.int64(2)}


In [61]:
# 9.2 One-Hot Encoding for nominal variables
print("\n9.2 One-Hot Encoding for nominal variables...")
df_encoded = pd.get_dummies(df_encoded, 
                            columns=['location', 'age_group', 'income_bracket'], 
                            prefix=['location', 'age_group', 'income_bracket'],
                            drop_first=True)

print(f"Columns after encoding: {df_encoded.shape[1]}")
print("\nNew column names:")
print([col for col in df_encoded.columns if '_' in col and col not in df_clean.columns])



9.2 One-Hot Encoding for nominal variables...
Columns after encoding: 21

New column names:
['balance_to_income_ratio', 'customer_value_score', 'gender_encoded', 'location_Suburban', 'location_Urban', 'age_group_Middle', 'age_group_Senior', 'age_group_Elderly', 'income_bracket_Medium', 'income_bracket_High', 'income_bracket_Very High']


In [62]:

# ============================================
# STEP 10: FEATURE SCALING
# ============================================
print("\n" + "="*60)
print("STEP 10: FEATURE SCALING")
print("="*60)


STEP 10: FEATURE SCALING


In [63]:
# Select numerical features to scale
features_to_scale = ['age', 'income', 'account_balance', 'tenure_months', 
                     'credit_score', 'balance_to_income_ratio', 'customer_value_score']

print("\n10.1 Standardization (Z-score normalization)...")
scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_encoded[features_to_scale])

print("Statistics after scaling:")
print(df_scaled[features_to_scale].describe())



10.1 Standardization (Z-score normalization)...
Statistics after scaling:
                age        income  account_balance  tenure_months  \
count  1.000000e+03  1.000000e+03     1.000000e+03   1.000000e+03   
mean  -1.172396e-16 -1.172396e-16    -1.740830e-16   2.486900e-17   
std    1.000500e+00  1.000500e+00     1.000500e+00   1.000500e+00   
min   -1.811365e+00 -2.657681e+00    -1.778853e+00  -1.749184e+00   
25%   -7.944435e-01 -6.075887e-01    -7.406484e-01  -8.287953e-01   
50%    0.000000e+00 -2.867988e-04     3.562387e-03  -2.716632e-02   
75%    8.439292e-01  6.099393e-01     6.944437e-01   8.338426e-01   
max    1.634868e+00  2.664099e+00     2.847082e+00   1.754231e+00   

       credit_score  balance_to_income_ratio  customer_value_score  
count  1.000000e+03             1.000000e+03          1.000000e+03  
mean   4.423129e-16            -1.421085e-17          2.131628e-17  
std    1.000500e+00             1.000500e+00          1.000500e+00  
min   -1.754400e+00        

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # Create the scaler
# scaler = MinMaxScaler()

# # Features to normalize
# features_to_scale = ['age', 'income', 'account_balance', 'tenure_months', 
#                      'credit_score', 'balance_to_income_ratio', 'customer_value_score']

# # Apply normalization
# df_normalized = df_encoded.copy()
# df_normalized[features_to_scale] = scaler.fit_transform(df_encoded[features_to_scale])

# print("Statistics after normalization:")
# print(df_normalized[features_to_scale].describe())

In [64]:

# ============================================
# STEP 11: TRAIN-TEST SPLIT
# ============================================
print("\n" + "="*60)
print("STEP 11: TRAIN-TEST SPLIT")
print("="*60)



STEP 11: TRAIN-TEST SPLIT


In [65]:
# Prepare features and target
X = df_scaled.drop(['customer_id', 'gender', 'churned'], axis=1, errors='ignore')
y = df_scaled['churned']

In [66]:
print(f"\nFeature columns: {X.columns.tolist()}")
print(f"Number of features: {X.shape[1]}")



Feature columns: ['age', 'income', 'account_balance', 'tenure_months', 'num_products', 'credit_score', 'customer_service_calls', 'balance_to_income_ratio', 'customer_value_score', 'gender_encoded', 'location_Suburban', 'location_Urban', 'age_group_Middle', 'age_group_Senior', 'age_group_Elderly', 'income_bracket_Medium', 'income_bracket_High', 'income_bracket_Very High']
Number of features: 18


In [67]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print(f"\nClass distribution in test set:")
print(y_test.value_counts(normalize=True))



Training set size: 800 samples
Test set size: 200 samples

Class distribution in training set:
churned
0    0.7325
1    0.2675
Name: proportion, dtype: float64

Class distribution in test set:
churned
0    0.735
1    0.265
Name: proportion, dtype: float64


In [68]:
# ============================================
# STEP 12: MODEL TRAINING
# ============================================
print("\n" + "="*60)
print("STEP 12: MODEL TRAINING")
print("="*60)


STEP 12: MODEL TRAINING


In [69]:
print("\n12.1 Training Random Forest Classifier...")
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
model.fit(X_train, y_train)
print("Model training completed!")


12.1 Training Random Forest Classifier...
Model training completed!


In [71]:
# Make predictions
y_pred = model.predict(X_test)

print("\n13.1 Accuracy Score:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


13.1 Accuracy Score:
Accuracy: 0.7050


In [72]:
print("\n13.2 Classification Report:")
print(classification_report(y_test, y_pred))


13.2 Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.96      0.83       147
           1       0.00      0.00      0.00        53

    accuracy                           0.70       200
   macro avg       0.36      0.48      0.41       200
weighted avg       0.53      0.70      0.61       200



In [73]:
print("\n13.3 Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


13.3 Confusion Matrix:
[[141   6]
 [ 53   0]]


In [74]:
print("\n13.4 Feature Importance:")
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance.head(10))



13.4 Feature Importance:
                   feature  importance
5             credit_score    0.131513
3            tenure_months    0.128013
1                   income    0.115270
8     customer_value_score    0.115097
7  balance_to_income_ratio    0.104510
2          account_balance    0.098171
0                      age    0.091722
6   customer_service_calls    0.066190
4             num_products    0.035265
9           gender_encoded    0.025950


In [1]:
model.feature_importances_

NameError: name 'model' is not defined