In [1]:
# Import the libraries
import numpy as np
import pandas as pd

# Print the installed pandas and NumPy versions (pandas first, then NumPy)
# so the environment used for this run is recorded alongside the results.
print(pd.__version__)
print(np.__version__)

0.24.2
1.16.2


In [2]:
# Normalization & Scaling Functions

# Outlier capping based on the interquartile range (uses the pandas .quantile() method)
def scale_outlier(df, column):
    """Cap the outliers of ``df[column]`` at the 1.5 * IQR fences, in place.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and values
    below ``Q1 - 1.5 * IQR`` are replaced by that lower fence; every other
    entry is kept as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Label of the column to cap.

    Returns
    -------
    None
        The capped values are written back into ``df[column]``.
    """
    series = df[column]
    lower_quartile, upper_quartile = (series.quantile(q) for q in (0.25, 0.75))
    whisker = 1.5 * (upper_quartile - lower_quartile)
    min_bound = lower_quartile - whisker
    max_bound = upper_quartile + whisker

    too_high = series > max_bound
    too_low = series < min_bound
    # The upper fence can never be below the lower fence (IQR >= 0), so the two
    # masks are mutually exclusive and a single nested selection is enough.
    df[column] = np.where(too_low, min_bound, np.where(too_high, max_bound, series))

# Min-Max Scaling using .min() and .max() Pandas methods
def min_max_scaling(df):
    """Return a copy of ``df`` with every column rescaled to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. A constant
    column (``max == min``) has no spread to rescale, so it is mapped to 0.0
    instead of becoming all-NaN through a 0 / 0 division; entries that were
    missing in such a column stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support subtraction and division.

    Returns
    -------
    pandas.DataFrame
        A new, rescaled frame; the input frame is not modified.
    """
    df_norm = df.copy()
    for column in df_norm.columns:
        col_min = df_norm[column].min()
        col_range = df_norm[column].max() - col_min
        if col_range == 0:
            # Constant column: avoid 0 / 0. Subtracting the minimum yields 0
            # for every present value and keeps NaN where data was missing.
            df_norm[column] = (df_norm[column] - col_min).astype(float)
        else:
            df_norm[column] = (df_norm[column] - col_min) / col_range
    return df_norm

In [3]:
# Read the dataset into `df`; the relative path "LBW_Dataset.csv" means the
# file is looked up in the current working directory.
df = pd.read_csv("LBW_Dataset.csv")

In [4]:
# Data Preprocessing

# "Delivery phase" (1: 90, 2: 2, NaN: 4) and "Education" (5: 93, NaN: 3) are
# dropped from the frame.
df = df.drop(columns=["Delivery phase", "Education"])

# Community = 2 occurs only once (count = 1), so it is merged into Community = 1.
# Open question: is this proper if a testing set contains Community = 2?
df["Community"] = df["Community"].replace(2, 1)

# For now, missing values in the numeric columns are filled with the column mean.
for numeric_col in ("Age", "Weight", "HB", "BP"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

# Very basic outlier handling: cap Age and BP at their IQR-based bounds.
for capped_col in ("Age", "BP"):
    scale_outlier(df, capped_col)

# Residence: relabel 2 as 0 to get a binary column (before: 1/2, after: 1/0),
# then fill the missing entries with the mode, 1.
df["Residence"] = df["Residence"].replace(2, 0).fillna(1)

# One-hot encode Community (1, 3, 4) into Community_1, Community_3 and
# Community_4, each holding 1.0 / 0.0.
df = pd.get_dummies(df, columns=["Community"], dtype=float)

# IFA is cast to float.
df = df.astype({"IFA": float})

# Take Result out, convert it to float and append it again so that it ends up
# as the last column.
res = df.pop("Result").astype(float)
df["Result"] = res

In [5]:
# Normalization of the dataset (into ranges from 0 to 1).
# min_max_scaling rescales every column of the frame it receives, so the
# one-hot columns and the Result column pass through it as well.
df = min_max_scaling(df)

In [6]:
# Summary statistics of the normalized frame; after min-max scaling each
# column is expected to show min = 0 and max = 1.
df.describe()

Unnamed: 0,Age,Weight,HB,IFA,BP,Residence,Community_1,Community_3,Community_4,Result
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,0.452382,0.432269,0.622867,0.6875,0.358478,0.864583,0.5,0.333333,0.166667,0.75
std,0.185433,0.220722,0.138207,0.465946,0.199033,0.343964,0.502625,0.473879,0.374634,0.435286
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.273504,0.285714,0.607843,0.0,0.17103,1.0,0.0,0.0,0.0,0.75
50%,0.478632,0.428571,0.622867,1.0,0.363002,1.0,0.5,0.0,0.0,1.0
75%,0.564103,0.55,0.647059,1.0,0.502618,1.0,1.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Show the dtype of every column after preprocessing and normalization.
df.dtypes

Age            float64
Weight         float64
HB             float64
IFA            float64
BP             float64
Residence      float64
Community_1    float64
Community_3    float64
Community_4    float64
Result         float64
dtype: object

In [8]:
# Display the preprocessed, normalized frame.
# NOTE(review): this renders the whole frame; a preview such as df.head()
# may be preferable — confirm what later cells or readers rely on.
df

Unnamed: 0,Age,Weight,HB,IFA,BP,Residence,Community_1,Community_3,Community_4,Result
0,0.273504,0.342857,0.647059,1.0,0.171030,1.0,1.0,0.0,0.0,0.0
1,0.273504,0.432269,0.568627,1.0,0.293194,1.0,1.0,0.0,0.0,0.0
2,0.273504,0.432269,0.647059,1.0,0.904014,1.0,1.0,0.0,0.0,0.0
3,0.273504,0.432269,0.411765,1.0,0.171030,1.0,1.0,0.0,0.0,0.0
4,0.478632,0.085714,0.666667,1.0,0.362583,1.0,1.0,0.0,0.0,0.0
5,0.615385,0.142857,0.647059,1.0,0.363002,1.0,1.0,0.0,0.0,0.0
6,0.615385,0.028571,0.622867,1.0,0.415358,1.0,0.0,0.0,1.0,0.0
7,0.341880,0.000000,0.622867,1.0,0.513274,1.0,1.0,0.0,0.0,0.0
8,0.000000,0.000000,0.607843,0.0,0.171030,1.0,0.0,0.0,1.0,0.0
9,1.000000,0.432269,0.784314,1.0,0.363002,1.0,0.0,1.0,0.0,0.0
