## Data preprocessing

### Drop unnecessary columns

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.init as init

import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plts
import seaborn as sns
%matplotlib inline

# Load the raw survey export and discard every row that has a missing value.
data = pd.read_csv('/Users/ponynie/Developer/Python_Code/IntroDMLabChula/Final_Project/children anemia.csv').dropna()

# Remove the columns that are not used as features or as the label.
data = data.drop(
    columns=[
        'Anemia level.1',
        'Hemoglobin level adjusted for altitude and smoking (g/dl - 1 decimal)',
        'When child put to breast',
        'Current marital status',
    ]
)

# Keep only rows whose answer to these two questions is exactly "Yes" or "No".
yes_no_answers = ['Yes', 'No']
for answer_column in ('Had fever in last two weeks', 'Taking iron pills, sprinkles or syrup'):
    data = data[data[answer_column].isin(yes_no_answers)]

data.head(10)


### Map categorical columns to category codes

In [None]:
# Column groups used throughout the notebook: categorical features,
# numerical features, and the classification label.
categorical_columns = [
    'Age in 5-year groups',
    'Type of place of residence',
    'Highest educational level',
    'Wealth index combined',
    'Have mosquito bed net for sleeping (from household questionnaire)',
    'Smokes cigarettes',
    'Currently residing with husband/partner',
    'Had fever in last two weeks',
    'Taking iron pills, sprinkles or syrup',
]
numerical_columns = [
    'Births in last five years',
    'Age of respondent at 1st birth',
    'Hemoglobin level adjusted for altitude (g/dl - 1 decimal)',
]
label_columns = ['Anemia level']

# Convert every categorical feature, then the label, to pandas' category dtype
# so that integer codes are available through the `.cat` accessor.
for column_name in (*categorical_columns, *label_columns):
    data[column_name] = data[column_name].astype('category')

# Show the category values behind each code for the features and the label.
for column_name in categorical_columns:
    print(data[column_name].cat.categories, column_name)
print(data[label_columns[0]].cat.categories, "Label")

### Normalize numerical columns

In [None]:

# Min-max scale each numerical feature into the [0, 1] range.
for column_name in numerical_columns:
    series = data[column_name]
    lowest, highest = series.min(), series.max()
    data[column_name] = (series - lowest) / (highest - lowest)

data[numerical_columns].head(10)

In [None]:
# Replace each categorical feature by its integer category code, min-max
# scaled into the [0, 1] range.
for column_name in categorical_columns:
    codes = data[column_name].cat.codes.values
    lowest_code, highest_code = codes.min(), codes.max()
    data[column_name] = (codes - lowest_code) / (highest_code - lowest_code)

data[categorical_columns].head(10)

### Create a NumPy array from the categorical columns

In [None]:
# Collect the scaled categorical columns and stack them side by side so that
# each row of the resulting 2-D array holds one record's categorical features.
categorical_np = [data[column_name] for column_name in categorical_columns]
categorical_data = np.stack(categorical_np, axis=1)
categorical_data[:10]

### Convert to Categorical Tensor

In [None]:
# Convert the categorical matrix to a tensor. float32 is used (instead of
# float64) so the dtype matches the numerical feature tensor and the default
# parameter dtype of nn.Linear; this avoids a mixed-dtype torch.cat later on.
categorical_data = torch.tensor(categorical_data, dtype=torch.float)
categorical_data[:10]

### Create nparray of numerical matrix and convert to Tensor

In [None]:
# Stack the scaled numerical columns into a 2-D array (one row per record),
# then turn that array into a float32 tensor.
numerical_matrix = np.stack(
    [data[column_name].values for column_name in numerical_columns],
    axis=1,
)
numerical_data = torch.tensor(numerical_matrix, dtype=torch.float32)
numerical_data[:10]

### Create label's Tensor

In [None]:
# Integer category codes of the label column, as a 1-D tensor of class ids.
label_codes = data['Anemia level'].cat.codes.values
outputs = torch.tensor(label_codes).flatten()
outputs[200:250]

### Check correctness of dimension

In [None]:
# All three tensors must agree on their first dimension (number of records).
tuple(tensor.shape for tensor in (categorical_data, numerical_data, outputs))

## Create Model

### Split the data to train and test set

In [None]:
# Hold out the last 20% of the records for testing; the first 80% are used
# for training. The split is positional (no shuffling).
total_records = len(data)
test_records = int(total_records * .2)
train_records = total_records - test_records

train_rows = slice(None, train_records)
test_rows = slice(train_records, None)

categorical_train_data, categorical_test_data = categorical_data[train_rows], categorical_data[test_rows]
numerical_train_data, numerical_test_data = numerical_data[train_rows], numerical_data[test_rows]
train_outputs, test_outputs = outputs[train_rows], outputs[test_rows]

print(categorical_train_data.shape, categorical_test_data.shape)
print(numerical_train_data.shape, numerical_test_data.shape)
print(train_outputs.shape, test_outputs.shape)



In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier with two hidden layers.

    Each hidden layer is Linear -> BatchNorm1d -> Sigmoid; the final Linear
    layer emits one raw score (logit) per class.

    Args:
        numerical_features_size: Number of numerical input features.
        categorical_features_size: Number of categorical input features.
        hidden_size: Width of both hidden layers.
        output_size: Number of output logits.
    """

    def __init__(self, numerical_features_size, categorical_features_size, hidden_size, output_size):
        super().__init__()
        input_width = categorical_features_size + numerical_features_size

        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(input_width, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.Sigmoid(),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.Sigmoid(),
            nn.Linear(hidden_size, output_size),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

        # Re-initialise the weight matrix of every Linear layer with Xavier
        # (Glorot) uniform values; biases keep their default initialisation.
        dense_layers = (module for module in self.linear_relu_stack if isinstance(module, nn.Linear))
        for dense in dense_layers:
            init.xavier_uniform_(dense.weight)

    def forward(self, x):
        """Return the logits for a batch `x` after flattening it per sample."""
        return self.linear_relu_stack(self.flatten(x))

In [None]:
# Join categorical and numerical features column-wise into one feature matrix
# per split. The inputs are data, not learnable parameters, so gradient
# tracking is deliberately NOT enabled on them: the optimizer only updates the
# model's parameters, and tracking input gradients would make every backward
# pass compute and accumulate gradients that are never used.
train_data = torch.cat((categorical_train_data, numerical_train_data), dim=1)
test_data = torch.cat((categorical_test_data, numerical_test_data), dim=1)
print(train_data)


In [None]:
# Features become float32 (the model's parameter dtype) and labels become
# int64, which is the class-index dtype expected by nn.CrossEntropyLoss.
train_data, test_data = (features.float() for features in (train_data, test_data))
train_outputs, test_outputs = (targets.long() for targets in (train_outputs, test_outputs))
print(train_outputs)

In [None]:
# Input widths are taken from the feature tensors' second dimension.
numerical_features_size = numerical_data.shape[1]
categorical_features_size = categorical_data.shape[1]

# Training hyperparameters.
hidden_size = 6
num_epochs = 20
learning_rate = 0.1
batch_size = 100

# One logit per label category. Derived from the data instead of being
# hard-coded so the model stays valid if the set of label values changes.
output_size = len(data['Anemia level'].cat.categories)

model = NeuralNetwork(numerical_features_size, categorical_features_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Pick the best available accelerator. The MPS (Apple Silicon) backend must be
# probed with its own availability check; testing CUDA availability to select
# "mps" would choose a device that does not exist on CUDA machines and would
# never use MPS on a Mac.
mps_backend = getattr(torch.backends, "mps", None)  # absent in old torch builds
if mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model and every tensor used for training/evaluation to that device.
model.to(device)
train_data = train_data.to(device)
test_data = test_data.to(device)
train_outputs = train_outputs.to(device)
test_outputs = test_outputs.to(device)

In [None]:
# Pair each feature row with its label and serve them in shuffled mini-batches.
train_dataset = TensorDataset(train_data, train_outputs)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Training loop: one optimizer step per mini-batch, for `num_epochs` passes
# over the training set.
log_every = 10  # number of batches averaged into each progress line

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0  # loss summed over the current logging window
    epoch_loss = 0.0    # loss summed over the whole epoch
    num_batches = 0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        # Named `logits` so the global label tensor `outputs` is not overwritten.
        logits = model(inputs)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1
        if (batch_idx + 1) % log_every == 0:
            # Divide by the real window length so the value is a mean batch loss.
            print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {running_loss / log_every:.4f}")
            running_loss = 0.0

    # Report the mean loss of the epoch rather than only the last batch's loss;
    # the guard keeps an empty loader from raising.
    mean_epoch_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {mean_epoch_loss}")
print("Training complete.")


In [None]:
# Evaluate on the held-out split: evaluation mode makes BatchNorm use its
# running statistics, and no_grad disables gradient bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(test_data)
    # Index of the largest logit in each row is the predicted class.
    predicted = torch.max(outputs.data, 1)[1]
    print(predicted)
    print(test_outputs)
    total = test_outputs.size(0)
    correct = (predicted == test_outputs).sum().item()
    print(correct)
    print(total)

print(f"Accuracy of the model on the test data: {(100 * correct / total)}%")