In [1]:
import pandas as pd
import numpy as np

In [2]:
n_samples = 1000

data = {
    'customer_id': range(1, n_samples + 1),
    'age': np.random.randint(18, 80, n_samples),
    'income': np.random.normal(50000, 20000, n_samples),
    'account_balance': np.random.normal(5000, 3000, n_samples),
    'tenure_months': np.random.randint(1, 120, n_samples),
    'num_products': np.random.randint(1, 5, n_samples),
    'credit_score': np.random.randint(300, 850, n_samples),
    'gender': np.random.choice(['Male', 'Female', 'Other'], n_samples),
    'location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples),
    'customer_service_calls': np.random.randint(0, 10, n_samples),
    'churned': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
}

df = pd.DataFrame(data)


In [3]:
# Introduce data quality issues
# 1. Missing values
df.loc[np.random.choice(df.index, 50), 'age'] = np.nan
df.loc[np.random.choice(df.index, 80), 'income'] = np.nan
df.loc[np.random.choice(df.index, 30), 'credit_score'] = np.nan
df.loc[np.random.choice(df.index, 20), 'gender'] = np.nan

# 2. Outliers
df.loc[np.random.choice(df.index, 10), 'income'] = np.random.uniform(200000, 500000, 10)
df.loc[np.random.choice(df.index, 5), 'account_balance'] = np.random.uniform(-10000, -5000, 5)

# 3. Inconsistent data
df.loc[np.random.choice(df.index, 10), 'gender'] = df.loc[np.random.choice(df.index, 10), 'gender'].str.lower()

# 4. Duplicates
duplicate_rows = df.sample(5)
df = pd.concat([df, duplicate_rows], ignore_index=True)

print(f"\nDataset created with {len(df)} rows and {len(df.columns)} columns")
print("\nFirst few rows:")
print(df.head())



Dataset created with 1005 rows and 11 columns

First few rows:
   customer_id   age        income  account_balance  tenure_months  \
0            1  43.0  42915.609587      1922.602569              7   
1            2  26.0  93588.127608      3611.023115            108   
2            3  30.0  34703.511563      7180.680581             17   
3            4  65.0  19275.034078      2600.725553             40   
4            5  57.0  56544.310631      6658.325042             49   

   num_products  credit_score  gender  location  customer_service_calls  \
0             4         780.0  Female     Urban                       0   
1             1         776.0   Other  Suburban                       5   
2             1         428.0   Other     Rural                       7   
3             4         803.0    Male  Suburban                       0   
4             1         469.0    Male  Suburban                       7   

   churned  
0        0  
1        1  
2        1  
3        0  

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005 entries, 0 to 1004
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             1005 non-null   int64  
 1   age                     955 non-null    float64
 2   income                  927 non-null    float64
 3   account_balance         1005 non-null   float64
 4   tenure_months           1005 non-null   int64  
 5   num_products            1005 non-null   int64  
 6   credit_score            974 non-null    float64
 7   gender                  976 non-null    object 
 8   location                1005 non-null   object 
 9   customer_service_calls  1005 non-null   int64  
 10  churned                 1005 non-null   int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 86.5+ KB


In [22]:
missing_value_columns = ['age', 'income', 'credit_score', 'gender']

In [5]:
df.describe()

Unnamed: 0,customer_id,age,income,account_balance,tenure_months,num_products,credit_score,customer_service_calls,churned
count,1005.0,955.0,927.0,1005.0,1005.0,1005.0,974.0,1005.0,1005.0
mean,500.546269,48.950785,53506.739636,5152.889239,60.206965,2.435821,577.876797,4.637811,0.320398
std,288.739104,17.447636,37013.71799,3222.659785,33.568847,1.124302,156.413809,2.894934,0.466862
min,1.0,18.0,-15857.236546,-8628.295194,1.0,1.0,302.0,0.0,0.0
25%,250.0,34.0,36951.558786,3100.916274,32.0,1.0,440.25,2.0,0.0
50%,501.0,49.0,50859.62499,5148.167148,60.0,2.0,573.0,5.0,0.0
75%,750.0,63.0,65021.067748,7385.951136,88.0,3.0,711.0,7.0,1.0
max,1000.0,79.0,473498.966919,15449.43765,119.0,4.0,849.0,9.0,1.0


In [6]:
df.dtypes

customer_id                 int64
age                       float64
income                    float64
account_balance           float64
tenure_months               int64
num_products                int64
credit_score              float64
gender                     object
location                   object
customer_service_calls      int64
churned                     int64
dtype: object

In [10]:
missing_values = df.isnull().sum()
missing_percent = (df.isnull().sum() / len(df))*100

missing_percent_table = pd.DataFrame({"missing_values": missing_values, "missing_percent": missing_percent})

print (missing_percent_table[missing_percent_table["missing_values"] > 0].sort_values("missing_values", ascending=False))

              missing_values  missing_percent
income                    78         7.761194
age                       50         4.975124
credit_score              31         3.084577
gender                    29         2.885572


In [14]:
duplicates = df.duplicated().sum()
print (duplicates)

duplicate_values = df[df.duplicated(keep=False)].sort_values("customer_id")
print (duplicate_values)

5
      customer_id   age        income  account_balance  tenure_months  \
118           119  47.0  31522.079225      9139.843601             55   
1001          119  47.0  31522.079225      9139.843601             55   
243           244  62.0  78248.716428      3411.144926             58   
1000          244  62.0  78248.716428      3411.144926             58   
701           702  69.0  87646.567657      2715.995149             45   
1002          702  69.0  87646.567657      2715.995149             45   
710           711  33.0  50678.860432      4213.261721             10   
1003          711  33.0  50678.860432      4213.261721             10   
772           773  36.0  47601.408143      6242.709581             81   
1004          773  36.0  47601.408143      6242.709581             81   

      num_products  credit_score  gender  location  customer_service_calls  \
118              3         748.0   Other     Urban                       7   
1001             3         748.0   Oth

In [15]:
df['gender'] = df['gender'].str.title()
df['location'] = df['location'].str.title()

In [21]:
numerical_columns = df.select_dtypes(include=[np.number]).columns

correlation_matrix = df[numerical_columns].corr()

print (correlation_matrix['churned'].sort_values(ascending= False))

churned                   1.000000
credit_score              0.086171
num_products              0.018341
customer_service_calls    0.001934
income                   -0.024280
account_balance          -0.029895
age                      -0.040228
customer_id              -0.052341
tenure_months            -0.058193
Name: churned, dtype: float64


In [23]:
from sklearn.impute import SimpleImputer

In [24]:
missing_value_columns

['age', 'income', 'credit_score', 'gender']

In [31]:
from sklearn.impute import SimpleImputer, KNNImputer

age_imputer = SimpleImputer(strategy='mean')
df['age'] = age_imputer.fit_transform(df[['age']])

income_imputer = SimpleImputer(strategy='median')

df['income'] = income_imputer.fit_transform(df[['income']])

knn_imputer = KNNImputer(n_neighbors=5)
df['credit_score'] = knn_imputer.fit_transform(df[['credit_score']])

gender_imputer = SimpleImputer(strategy='most_frequent')
df['gender'] = gender_imputer.fit_transform(df[['gender']]).ravel()


In [32]:
df = df.drop_duplicates()

In [33]:
df.shape

(1000, 11)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             1000 non-null   int64  
 1   age                     1000 non-null   float64
 2   income                  1000 non-null   float64
 3   account_balance         1000 non-null   float64
 4   tenure_months           1000 non-null   int64  
 5   num_products            1000 non-null   int64  
 6   credit_score            1000 non-null   float64
 7   gender                  1000 non-null   object 
 8   location                1000 non-null   object 
 9   customer_service_calls  1000 non-null   int64  
 10  churned                 1000 non-null   int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 93.8+ KB


In [41]:
def detect_outliers(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    IQR = q3-q1
    lower_bound = q1-1.5*IQR
    upper_bound = q3 + 1.5*IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outliers, lower_bound, upper_bound

In [42]:
numerical_columns = ['age', 'income', 'account_balance', 'tenure_months', 'num_products', 'credit_score', 'customer_service_calls']

In [43]:
for col in numerical_columns:
    outliers, lower_bound, upper_bound = detect_outliers(df, col)
    df[col] = df[col].clip(lower = lower_bound, upper = upper_bound)

In [44]:
df.head()

Unnamed: 0,customer_id,age,income,account_balance,tenure_months,num_products,credit_score,gender,location,customer_service_calls,churned
0,1,43.0,42915.609587,1922.602569,7,4,780.0,Female,Urban,0,0
1,2,26.0,93588.127608,3611.023115,108,1,776.0,Other,Suburban,5,1
2,3,30.0,34703.511563,7180.680581,17,1,428.0,Other,Rural,7,1
3,4,65.0,19275.034078,2600.725553,40,4,803.0,Male,Suburban,0,0
4,5,57.0,56544.310631,6658.325042,49,1,469.0,Male,Suburban,7,1


In [46]:
df['age_bucket'] = pd.cut(df['age'],
                          bins = [0, 30, 50, 70, 100],
                          labels = ['Young', 'Mid', 'Senior', 'Old'])

In [47]:
df.dtypes

customer_id                  int64
age                        float64
income                     float64
account_balance            float64
tenure_months                int64
num_products                 int64
credit_score               float64
gender                      object
location                    object
customer_service_calls       int64
churned                      int64
age_bucket                category
dtype: object

In [48]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

features_to_scale = ['age', 'income', 'account_balance', 'tenure_months', 'num_products', 'credit_score', 'customer_service_calls']

df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

In [49]:
df.describe()

Unnamed: 0,customer_id,age,income,account_balance,tenure_months,num_products,credit_score,customer_service_calls,churned
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,-2.4869e-16,-1.447731e-16,2.113865e-16,-2.664535e-17,-4.9737990000000006e-17,-4.121148e-16,6.750156e-17,0.32
std,288.819436,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,1.0005,0.46671
min,1.0,-1.819186,-2.575452,-2.708514,-1.764289,-1.276489,-1.790545,-1.610008,0.0
25%,250.75,-0.8199089,-0.6388962,-0.6603668,-0.8413413,-1.276489,-0.855644,-0.9176802,0.0
50%,500.5,0.0001320264,-0.005046652,-0.007158641,-0.00771108,-0.3869496,0.0005485584,0.1208112,0.0
75%,750.25,0.8259587,0.6521408,0.7050648,0.8259192,0.5025897,0.8469793,0.8131388,1.0
max,1000.0,1.766455,2.588696,2.753212,1.748867,1.392129,1.76078,1.505466,1.0


In [50]:
df.columns

Index(['customer_id', 'age', 'income', 'account_balance', 'tenure_months',
       'num_products', 'credit_score', 'gender', 'location',
       'customer_service_calls', 'churned', 'age_bucket'],
      dtype='object')

In [52]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le_encoder = LabelEncoder()

df['age_bucket_encoder'] = le_encoder.fit_transform(df['age_bucket'])

df = pd.get_dummies(df, columns=['gender', 'location'], prefix=['gender', 'location'], drop_first=True)

In [53]:
df.columns

Index(['customer_id', 'age', 'income', 'account_balance', 'tenure_months',
       'num_products', 'credit_score', 'customer_service_calls', 'churned',
       'age_bucket', 'age_bucket_encoder', 'gender_Male', 'gender_Other',
       'location_Suburban', 'location_Urban'],
      dtype='object')

In [55]:
x_data = df.drop(['customer_id', 'churned', 'age_bucket'], axis = 1, errors='ignore')
y_data = df['churned']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.2, stratify=Y)

In [57]:
X_train.shape

(750, 12)

In [58]:
y_train.shape

(750,)

In [60]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state= 42, max_depth=10)

model.fit(X_train, y_train)

In [61]:
y_pred = model.predict(X_test)

In [63]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print (accuracy_score(y_test, y_pred))

0.712


In [65]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       181
           1       0.42      0.12      0.18        69

    accuracy                           0.71       250
   macro avg       0.58      0.53      0.50       250
weighted avg       0.65      0.71      0.65       250



In [69]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers




In [71]:
model =keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')

], name='simple_nn')

In [72]:
model.summary()

In [75]:
model.compile(
    # optimizer = keras.optimizers.Adam(learning_rate=0.001)
    optimizer = keras.optimizers.SGD(learning_rate=0.1, momentum=0.9),
    loss = 'binary_crossentropy',
    metrics = ['accuracy',
               keras.metrics.AUC(name='auc'),
               keras.metrics.Precision(name='precision'),
               keras.metrics.Recall(name='recall')]
)

In [76]:
model_fit = model.fit(
    X_train, y_train,
    validation_data = (X_test, y_test),
    epochs = 100,
    batch_size = 32
)

Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5943 - auc: 0.5422 - loss: 0.6825 - precision: 0.3785 - recall: 0.3100 - val_accuracy: 0.7240 - val_auc: 0.5097 - val_loss: 0.6027 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6647 - auc: 0.5604 - loss: 0.6302 - precision: 0.3958 - recall: 0.0365 - val_accuracy: 0.7240 - val_auc: 0.5115 - val_loss: 0.5926 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6763 - auc: 0.5184 - loss: 0.6310 - precision: 0.0600 - recall: 4.8401e-04 - val_accuracy: 0.7240 - val_auc: 0.5081 - val_loss: 0.6073 - val_precision: 0.5000 - val_recall: 0.0145
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6905 - auc: 0.5589 - loss: 0.6275 - precision: 0.5262 -