<a href="https://colab.research.google.com/github/Priyanka24322/Data-Visualization-Analysis-Projects/blob/main/ML_Natailty_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the natality CSV straight from the project's GitHub repository.
DATASET_URL = 'https://raw.githubusercontent.com/Priyanka24322/Data-Visualization-Analysis-Projects/refs/heads/main/Datasets/natality_dataset_0903_02.csv'
df = pd.read_csv(DATASET_URL)
# Show the column names so later cells can be checked against them.
df.columns

Index(['source_year', 'year', 'month', 'wday', 'is_male', 'weight_pounds',
       'plurality', 'apgar_5min', 'mother_race', 'mother_age',
       'gestation_weeks', 'lmp', 'mother_married', 'cigarette_use',
       'cigarettes_per_day', 'alcohol_use', 'drinks_per_week',
       'weight_gain_pounds', 'born_alive_alive', 'born_alive_dead',
       'born_dead', 'ever_born', 'father_race', 'father_age', 'record_weight'],
      dtype='object')

In [None]:
def add_week_day(row):
    """Translate the numeric ``wday`` code of a row into a weekday name.

    Codes run from 1 ('Sunday') through 7 ('Saturday'). Any value that
    matches none of them (including a missing value) yields ``None``.
    """
    day_names = ('Sunday', 'Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday')
    # Position in the tuple (counted from 1) is the weekday code.
    for code, day_name in enumerate(day_names, start=1):
        if row['wday'] == code:
            return day_name
    return None

# Attach a readable weekday label to every record (row-wise apply).
df['Week_day'] = df.apply(add_week_day, axis='columns')

In [None]:
# Re-list the columns to confirm that 'Week_day' was appended.
df.columns

Index(['source_year', 'year', 'month', 'wday', 'is_male', 'weight_pounds',
       'plurality', 'apgar_5min', 'mother_race', 'mother_age',
       'gestation_weeks', 'lmp', 'mother_married', 'cigarette_use',
       'cigarettes_per_day', 'alcohol_use', 'drinks_per_week',
       'weight_gain_pounds', 'born_alive_alive', 'born_alive_dead',
       'born_dead', 'ever_born', 'father_race', 'father_age', 'record_weight',
       'Week_day'],
      dtype='object')

In [None]:
def infant_health(row):
  """Classify a birth record from its weight and ``apgar_5min`` value.

  Returns:
    'normal'              weight above 6.0 lb and Apgar between 7 and 10
    'moderately abnormal' weight above 6.0 lb and Apgar between 4 and 6
    'abnormal'            every other combination of known values
    None                  when either input is missing

  Missing inputs used to fall through to 'abnormal', which silently turned
  unknown records into a real class and stopped a later ``dropna()`` from
  removing them.
  """
  weight = row['weight_pounds']
  apgar = row['apgar_5min']

  # Unknown measurements must stay unknown rather than become a label.
  if pd.isna(weight) or pd.isna(apgar):
    return None

  # NOTE(review): some public natality extracts encode an unknown Apgar as 99;
  # such a value would still be labelled 'abnormal' here. Confirm for this file.
  if weight > 6.0 and 7 <= apgar <= 10:
    return 'normal'
  if weight > 6.0 and 4 <= apgar <= 6:
    return 'moderately abnormal'
  return 'abnormal'

# Label every record with its infant-health category (row-wise apply).
df['infant_health'] = df.apply(infant_health, axis='columns')

In [None]:
def birth_status(row):
  """Bucket a record's ``gestation_weeks`` into a term category.

  Returns:
    'Full Term'          37 weeks or more
    'Moderate Preterm'   32 up to (but not including) 37 weeks
    'Very Preterm'       28 up to (but not including) 32 weeks
    'Extremely Preterm'  fewer than 28 weeks
    None                 when the gestation length is missing

  The thresholds are contiguous, so fractional values such as 36.5 get a
  category instead of falling between two inclusive integer ranges.
  """
  weeks = row['gestation_weeks']

  # A missing value compares False against every threshold; guard it
  # explicitly so it cannot reach the final catch-all return.
  if pd.isna(weeks):
    return None

  # NOTE(review): some public natality extracts encode an unknown gestation
  # as 99, which would be reported as 'Full Term'. Confirm for this file.
  if weeks >= 37:
    return 'Full Term'
  if weeks >= 32:
    return 'Moderate Preterm'
  if weeks >= 28:
    return 'Very Preterm'
  return 'Extremely Preterm'

# Attach the gestational-term category to every record (row-wise apply).
df['birth_status'] = df.apply(birth_status, axis='columns')

In [None]:
def gender(row):
  """Turn the boolean ``is_male`` flag of a row into 'BOY' or 'GIRL'.

  Returns None when the flag is missing. Previously a missing value was
  not equal to False and was therefore reported as 'BOY'.
  """
  is_male = row['is_male']

  # Keep unknown sex unknown instead of defaulting it to one class.
  if pd.isna(is_male):
    return None

  return 'BOY' if is_male else 'GIRL'

# Attach the 'BOY' / 'GIRL' label to every record (row-wise apply).
df['gender'] = df.apply(gender, axis='columns')

In [None]:
# Re-list the columns to confirm the derived label columns were appended.
df.columns

Index(['source_year', 'year', 'month', 'wday', 'is_male', 'weight_pounds',
       'plurality', 'apgar_5min', 'mother_race', 'mother_age',
       'gestation_weeks', 'lmp', 'mother_married', 'cigarette_use',
       'cigarettes_per_day', 'alcohol_use', 'drinks_per_week',
       'weight_gain_pounds', 'born_alive_alive', 'born_alive_dead',
       'born_dead', 'ever_born', 'father_race', 'father_age', 'record_weight',
       'Week_day', 'infant_health', 'birth_status', 'gender'],
      dtype='object')

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# numeric_only=True is required because the frame also holds string columns
# (for example the derived label columns); without it pandas >= 2.0 raises
# instead of silently dropping them.
correlation_matrix = df.corr(numeric_only=True)

# Keep the two columns of interest as their own Series for inspection.
correlation_with_weight_pounds = correlation_matrix['weight_pounds']
correlation_with_apgar_5min = correlation_matrix['apgar_5min']

  correlation_matrix = df.corr()


In [None]:
# Display how each numeric column correlates with 'apgar_5min'.
correlation_with_apgar_5min

source_year          -0.024641
year                 -0.024641
month                -0.001454
wday                 -0.009386
is_male              -0.025592
weight_pounds         0.213861
plurality            -0.065473
apgar_5min            1.000000
mother_race          -0.019186
mother_age            0.010709
gestation_weeks       0.247116
mother_married        0.023993
cigarette_use         0.015075
cigarettes_per_day    0.017369
alcohol_use           0.015075
drinks_per_week      -0.019952
weight_gain_pounds    0.004998
born_alive_alive      0.035634
born_alive_dead      -0.004707
born_dead             0.003290
ever_born             0.036354
father_race          -0.015427
father_age            0.013519
record_weight              NaN
Name: apgar_5min, dtype: float64

In [None]:
# Predictor columns plus the target ('infant_health', kept last).
columns_to_be_selected = ['is_male', 'plurality', 'mother_age', 'gestation_weeks', 'cigarette_use', 'alcohol_use', 'weight_gain_pounds', 'father_age', 'cigarettes_per_day', 'ever_born','mother_married', 'infant_health']

# Keep only complete rows. The earlier un-dropped assignment was a dead store
# and is removed; .copy() makes the result an independent frame so the
# column assignments in the encoding cell never write into a slice of `df`.
df_selected = df[columns_to_be_selected].dropna().copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the flag columns and the target in place. One encoder is
# refitted per column, so after the loop it holds the classes of the last
# column, 'infant_health'.
label_encoder = LabelEncoder()
columns_to_encode = ('is_male', 'cigarette_use', 'alcohol_use', 'mother_married', 'infant_health')
for column_name in columns_to_encode:
    df_selected[column_name] = label_encoder.fit_transform(df_selected[column_name])

In [None]:
# Separate the target vector from the feature matrix.
target_column = 'infant_health'
y = df_selected[target_column]
X = df_selected.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
# Base learners combined by the voting ensemble.
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=500, random_state=42)),
    ('svc', SVC(random_state=42)),
]
voting_clf = VotingClassifier(estimators=base_estimators)
voting_clf.fit(X_train, y_train)

# Report the test accuracy of each fitted base learner on its own.
for name, clf in voting_clf.named_estimators_.items():
    print(name, '=', clf.score(X_test, y_test))

rf = 0.8910946196660482
svc = 0.8818181818181818


In [None]:
# Test-set score of the combined voting ensemble.
voting_clf.score(X_test, y_test)

0.8910946196660482

### Random Forest Classifier

In [None]:
# Baseline forest: 100 trees, otherwise default settings, fixed seed.
rf_classifier_1 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_classifier_1.fit(X_train, y_train)

In [None]:
# Score the baseline forest on the held-out rows.
y_pred = rf_classifier_1.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the accuracy computed in the previous cell.
accuracy

0.8912801484230055

In [None]:
# Larger forest with at most 16 leaves per tree, trained on all CPU cores.
rf_classifier_2 = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rf_classifier_2.fit(X_train, y_train)

In [None]:
# Score the leaf-limited forest on the held-out rows.
y_pred_1 = rf_classifier_2.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy_1 = accuracy_score(y_test, y_pred_1)
accuracy_1

0.8972170686456401

In [None]:
# Feature importance
# Pair each importance with the columns the forest was fitted on. Using
# df_selected.columns only lined up because the target happened to be the
# last column and zip() silently dropped it.
for score, name in zip(rf_classifier_2.feature_importances_, X_train.columns):
  print(round(score, 5), name)

0.00284 is_male
0.27495 plurality
0.00999 mother_age
0.65545 gestation_weeks
0.00119 cigarette_use
0.00122 alcohol_use
0.02999 weight_gain_pounds
0.00635 father_age
0.0044 cigarettes_per_day
0.00941 ever_born
0.00422 mother_married


In [None]:
# Third forest variant: more trees, capped at 20 leaves and depth 12 per tree.
rf_classifier_3 = RandomForestClassifier(
    n_estimators=850,
    max_leaf_nodes=20,
    max_depth=12,
    random_state=42,
)
rf_classifier_3.fit(X_train, y_train)

In [None]:
# Score the third forest on the held-out rows.
y_pred = rf_classifier_3.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8985157699443413

### AdaBoost Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Boost 30 depth-1 decision trees with a learning rate of 0.5.
weak_learner = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(
    weak_learner,
    n_estimators=30,
    learning_rate=0.5,
    random_state=42,
)
ada_clf.fit(X_train, y_train)

In [None]:
# Score the AdaBoost model on the held-out rows.
y_pred = ada_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8942486085343229

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees. random_state is fixed like every other model in
# this notebook so the reported score can be reproduced on a re-run.
gbrt_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbrt_classifier.fit(X_train, y_train)

In [None]:
# Test-set score of the gradient-boosting model, then display it.
accuracy = gbrt_classifier.score(X_test, y_test)
accuracy

0.9027829313543599

### SVM Classifier

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted on unscaled features.
svm_clf = SVC(
    random_state=42,
)
svm_clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the SVC (overwrites the shared y_pred name).
y_pred = svm_clf.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8820037105751392

### One-vs-Rest Classifier

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Wrap an SVC in a one-vs-rest scheme: one binary classifier per class.
ovr_base = SVC(random_state=42)
ovr_clf = OneVsRestClassifier(ovr_base)
ovr_clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the one-vs-rest model (overwrites y_pred).
y_pred = ovr_clf.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8820037105751392

### Scaling the features and then implementing SVC

In [None]:
# NOTE(review): this standalone scaler is not referenced anywhere else in the
# visible notebook; the pipeline below creates its own StandardScaler.
scaler = StandardScaler()

# Chain feature standardisation and an SVC so the scaling is learned from the
# training rows only and re-applied automatically at predict time.
pipe1 = make_pipeline(StandardScaler(), SVC())
pipe1.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the scaled-SVC pipeline (overwrites y_pred).
y_pred = pipe1.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8998144712430427

### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library's default settings.
knn_clf = KNeighborsClassifier(
)
knn_clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the k-NN model (overwrites y_pred).
y_pred = knn_clf.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8888682745825603

### Stacking Classifier

In [None]:
from sklearn.ensemble import StackingClassifier

# Level-0 learners whose cross-validated predictions feed the meta-model.
stacking_base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=800, random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
]
# Level-1 learner trained on the base learners' outputs.
stacking_meta_model = RandomForestClassifier(random_state=42)

stacking_clf = StackingClassifier(
    estimators=stacking_base_estimators,
    final_estimator=stacking_meta_model,
    cv=5,
)
stacking_clf.fit(X_train, y_train)

In [None]:
# Score the stacking ensemble on the held-out rows.
y_pred = stacking_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were reversed.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8972170686456401