In [1]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Check if proper versions are used
print(pd.__version__)
print(np.__version__)

1.1.3
1.19.2


In [2]:
# Normalization & Scaling Functions using Numpy & Pandas

# Outlier Scaling using .quantile() Pandas methods
def scale_outlier(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    min_bound = Q1 - 1.5*IQR
    max_bound = Q3 + 1.5*IQR
    df[column] = np.where(df[column] > max_bound, max_bound, df[column])
    df[column] = np.where(df[column] < min_bound, min_bound, df[column])

# Min-Max Scaling using .min() and .max() Pandas methods
def min_max_scaling(df):    
    df_norm = df.copy()
    for column in df_norm.columns:
        df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())        
    return df_norm

In [3]:
# Reading the dataset using Pandas
df = pd.read_csv("LBW_Dataset.csv")

In [4]:
# Data Preprocessing

# Drop the columns Delivery Phase(1: 90, 2: 2, NaN: 4) and Education(5: 93, NaN: 3)
df = df.drop(["Delivery phase", "Education", "Community"], axis = 1)

# Not sure if this is Proper, what if testing set has Community = 2?
# Replacing Community = 2(count = 1) with Community = 1
# df["Community"] = np.where(df["Community"] == 2, 1, df["Community"])ss

# Replacing Nan of Weights with the Mean of its respective Result category
mean_0 = (df.loc[df['Result'] == 0])['Weight'].mean()
mean_1 = (df.loc[df['Result'] == 1])['Weight'].mean()

df["Weight"] = np.where((df["Result"] == 0) & (df["Weight"].isna()), mean_0, df["Weight"])
df["Weight"] = np.where((df["Result"] == 1) & (df["Weight"].isna()), mean_1, df["Weight"])

# For now, Filling Numeric Columned NaN Values with Mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["HB"] = df["HB"].fillna(df["HB"].mean())
df["BP"] = df["BP"].fillna(df["BP"].mean())
# df["Weight"] = df["Weight"].fillna(df["Weight"].mean()) -> cleaned using the above method

# Very Basic Method of taking care of Outliers(Replace with IQR, Min-Max) for Age & BP columns
scale_outlier(df, "Age")
scale_outlier(df, "BP")

# Labelling Residence = 2 as Residence = 0 to get Binary Labelled Column (Before: Residence(1,2), After: Residence(1,0))
df["Residence"] = np.where(df["Residence"] == 2, 0, df["Residence"])
# Filling NaN with Mode = 1
df["Residence"] = df["Residence"].fillna(1)

# One-Hot-Encode Community(1,3,4) to Community_1(1,0), Community_3(1,0), Community_4(1,0)
# df = pd.get_dummies(df, columns=["Community"], dtype = float)

# Converting IFA(int) to IFA(float)
df["IFA"] = df["IFA"].astype(float)

# Moving converted Float Result, to get it as the last Column
res = df["Result"].astype(float)
df = df.drop(["Result"], axis = 1)
df["Result"] = res

In [5]:
# Performing Normalization of the dataset (into ranges from 0 to 1) using Pandas
df = min_max_scaling(df)

In [6]:
df.describe()

Unnamed: 0,Age,Weight,HB,IFA,BP,Residence,Result
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,0.452382,0.402381,0.622867,0.6875,0.358478,0.864583,0.75
std,0.185433,0.235995,0.138207,0.465946,0.199033,0.343964,0.435286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.273504,0.257143,0.607843,0.0,0.17103,1.0,0.75
50%,0.478632,0.342857,0.622867,1.0,0.363002,1.0,1.0
75%,0.564103,0.55,0.647059,1.0,0.502618,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [127]:
# Creating Train-Test Splits of the dataset using .train_test_split() in Sklearn
X = df.iloc[:,:-1].values
y = df.iloc[:,-1:].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [128]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(20,10), activation='tanh', solver='adam', learning_rate = 'constant'\
                    , alpha = 0.00001, max_iter = 20000, random_state = 0)
mlp.out_activation_ = 'tanh'
mlp.fit(X_train,y_train.ravel())

predict_train = mlp.predict(X_train)
print(predict_train)
predict_test = mlp.predict(X_test)
print(predict_test)

[1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1.
 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.]
[1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.]


In [129]:
print(mlp.n_outputs_, mlp.n_layers_, mlp.n_iter_)
print(len(predict_train))

1 4 441
72


In [130]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))

[[17  5]
 [ 3 47]]
              precision    recall  f1-score   support

         0.0       0.85      0.77      0.81        22
         1.0       0.90      0.94      0.92        50

    accuracy                           0.89        72
   macro avg       0.88      0.86      0.87        72
weighted avg       0.89      0.89      0.89        72



In [131]:
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[ 2  0]
 [ 2 20]]
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67         2
         1.0       1.00      0.91      0.95        22

    accuracy                           0.92        24
   macro avg       0.75      0.95      0.81        24
weighted avg       0.96      0.92      0.93        24



In [132]:
predict_total = mlp.predict(X)
print(confusion_matrix(y,predict_total))
print(classification_report(y,predict_total))

[[19  5]
 [ 5 67]]
              precision    recall  f1-score   support

         0.0       0.79      0.79      0.79        24
         1.0       0.93      0.93      0.93        72

    accuracy                           0.90        96
   macro avg       0.86      0.86      0.86        96
weighted avg       0.90      0.90      0.90        96

