<a href="https://colab.research.google.com/github/VirunaVidaswin/Machine-Learning/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

1. Loading Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

df = pd.read_csv('/content/bank-full.csv', sep=';')
print(f"\nTotal Rows: {df.shape[0]}")
print(f"Total Cols: {df.shape[1]}")
print("\nDataset Head:")
print(df.describe())




Total Rows: 45211
Total Cols: 17

Dataset Head:
                age        balance           day      duration      campaign  \
count  45211.000000   45211.000000  45211.000000  45211.000000  45211.000000   
mean      40.936210    1362.272058     15.806419    258.163080      2.763841   
std       10.618762    3044.765829      8.322476    257.527812      3.098021   
min       18.000000   -8019.000000      1.000000      0.000000      1.000000   
25%       33.000000      72.000000      8.000000    103.000000      1.000000   
50%       39.000000     448.000000     16.000000    180.000000      2.000000   
75%       48.000000    1428.000000     21.000000    319.000000      3.000000   
max       95.000000  102127.000000     31.000000   4918.000000     63.000000   

              pdays      previous  
count  45211.000000  45211.000000  
mean      40.197828      0.580323  
std      100.128746      2.303441  
min       -1.000000      0.000000  
25%       -1.000000      0.000000  
50%       -1.0

2. Categorical column data values

In [None]:
for column in df.select_dtypes('object'):
    print(f"\nColumn: {column} , "+f" Values: {df[column].unique()}")




Column: job ,  Values: ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']

Column: marital ,  Values: ['married' 'single' 'divorced']

Column: education ,  Values: ['tertiary' 'secondary' 'unknown' 'primary']

Column: default ,  Values: ['no' 'yes']

Column: housing ,  Values: ['yes' 'no']

Column: loan ,  Values: ['no' 'yes']

Column: contact ,  Values: ['unknown' 'cellular' 'telephone']

Column: month ,  Values: ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']

Column: poutcome ,  Values: ['unknown' 'failure' 'other' 'success']

Column: y ,  Values: ['no' 'yes']


Missing values

In [None]:
missing_values = df.isnull().sum()
print(missing_values)
 for col in df.columns:
  print(col.df[])

IndentationError: unexpected indent (<ipython-input-34-780c328f1c50>, line 3)

In [None]:
# Encode categorical variables
categorical_cols = data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])

# Define features (X) and target (y)
X = data.drop(columns=['y'])  # 'y' is the target column
y = data['y']

# Balance the dataset if the target class is imbalanced
print("\nClass Distribution Before Balancing:")
print(y.value_counts())
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after balancing
print("\nClass Distribution After Balancing:")
print(pd.Series(y_resampled).value_counts())

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the processed data for model building
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

print("\nData preprocessing completed. Files saved for training and testing.")

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Load the preprocessed data
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

# Random Forest Model
print("\n--- Random Forest Classifier ---")
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Random Forest Predictions
rf_predictions = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)[:, 1]

# Random Forest Evaluation
print("Classification Report:")
print(classification_report(y_test, rf_predictions))
print("ROC-AUC Score:", roc_auc_score(y_test, rf_probabilities))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

# Neural Network Model
print("\n--- Neural Network Classifier ---")
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)

# Neural Network Predictions
nn_predictions = nn_model.predict(X_test)
nn_probabilities = nn_model.predict_proba(X_test)[:, 1]

# Neural Network Evaluation
print("Classification Report:")
print(classification_report(y_test, nn_predictions))
print("ROC-AUC Score:", roc_auc_score(y_test, nn_probabilities))
print("Confusion Matrix:")
print(confusion_matrix(y_test, nn_predictions))

# Comparison Summary
print("\n--- Model Comparison ---")
print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_probabilities))
print("Neural Network ROC-AUC:", roc_auc_score(y_test, nn_probabilities))



--- Random Forest Classifier ---
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      7985
           1       0.87      0.93      0.90      7984

    accuracy                           0.89     15969
   macro avg       0.89      0.89      0.89     15969
weighted avg       0.89      0.89      0.89     15969

ROC-AUC Score: 0.9582783287300964
Confusion Matrix:
[[6838 1147]
 [ 572 7412]]

--- Neural Network Classifier ---
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      7985
           1       0.90      0.90      0.90      7984

    accuracy                           0.90     15969
   macro avg       0.90      0.90      0.90     15969
weighted avg       0.90      0.90      0.90     15969

ROC-AUC Score: 0.963326292534976
Confusion Matrix:
[[7213  772]
 [ 828 7156]]

--- Model Comparison ---
Random Forest ROC-AUC: 0.9582783287300964
Neural