In [None]:
import torch 
import torch.nn as nn 
from torch.autograd import Variable

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px

from sklearn.model_selection import train_test_split

In [None]:
# Load the diabetes dataset; the path is relative, so it is resolved against
# the notebook's current working directory.
data = pd.read_csv('Data/Healthcare-Diabetes.csv')

Columns:

- Id: Unique identifier for each data entry.
- Pregnancies: Number of times pregnant.
- Glucose: Plasma glucose concentration over 2 hours in an oral glucose tolerance test.
- BloodPressure: Diastolic blood pressure (mm Hg).
- SkinThickness: Triceps skinfold thickness (mm).
- Insulin: 2-Hour serum insulin (mu U/ml).
- BMI: Body mass index (weight in kg / (height in m)^2).
- DiabetesPedigreeFunction: Diabetes pedigree function, a score estimating the genetic likelihood of diabetes based on family history.
- Age: Age in years.
- Outcome: Binary classification indicating the presence (1) or absence (0) of diabetes.

In [None]:
# Preview the first rows (pandas shows 5 by default) to sanity-check the load.
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

In [None]:
# Pairwise correlation between all columns (pandas' default method is Pearson).
# Note: 'Id' is still part of the frame here -- it is only dropped in a later
# cell -- so the matrix also contains a row/column for the identifier.
correlation = data.corr()
correlation

In [None]:
# Render the correlation matrix as an annotated heatmap (two decimals per cell),
# using an explicit figure/axes pair instead of the implicit pyplot state.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Histogram (with KDE overlay) for six selected columns, laid out on a 2x3 grid.
columns = ['Age', 'Insulin', 'Glucose', 'Pregnancies', 'BMI', 'Outcome']
colors = ['blue', 'green', 'red', 'purple', 'pink', 'brown']
titles = ['Distribution of ' + name for name in columns]

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))
axs = axs.ravel()  # 1-D view of the grid so panels can be addressed by position

for panel_idx, name in enumerate(columns):
    panel = axs[panel_idx]
    sns.histplot(data[name], kde=True, color=colors[panel_idx], bins=10, ax=panel)
    panel.set(title=titles[panel_idx], xlabel=name, ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Number of rows for every (Age, Outcome) combination, as a tidy frame with a
# 'Count' column that plotly can consume directly.
data_grouped = (
    data
    .groupby(['Age', 'Outcome'])
    .size()
    .reset_index(name='Count')
)

# Legend placement, kept separate so the layout call below stays readable.
legend_layout = {
    'orientation': 'v',
    'yanchor': "top",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}

# One smoothed (spline) line per Outcome value.
fig = px.line(data_grouped, x='Age', y='Count', color='Outcome', line_shape='spline')
fig.update_layout(
    title='Distribution of Diabetes Among Age Groups',
    xaxis_title='Age',
    yaxis_title='Number of Individuals',
    legend_title='Outcome',
    legend=legend_layout,
)

fig.show()

## Data Cleaning & Processing 

In [None]:
# Per-column count of missing entries.
missing_per_column = data.isna().sum()
print("Missing values for each column:")
print(missing_per_column)

# Column dtypes and non-null counts; info() prints its own report.
print("\nDataFrame Summary:")
data.info()

In [None]:
# Cast every column to float32. The frame is later handed to
# torch.from_numpy(), so this presumably keeps the resulting tensors in
# PyTorch's default float dtype -- confirm if other dtypes are needed.
data = data.astype('float32')

In [None]:
# Drop the 'Id' column: it is only a row identifier and carries no predictive
# signal. errors='ignore' makes the cell idempotent -- re-running it after
# 'Id' has already been removed no longer raises a KeyError.
data = data.drop(columns='Id', errors='ignore')

In [None]:
# Data normalization function
def Norm(data):
    """Mean-center ``data`` and scale it by its range (max - min).

    For a DataFrame the statistics are computed per column; for a Series
    (or any object whose ``mean``/``max``/``min`` return scalars) they are
    computed over all values.

    A feature with zero range (every value identical) would cause a division
    by zero and fill the result with NaN/inf. Such a range is replaced by 1,
    so a constant feature is mapped to all zeros instead.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values to normalize. The input is not modified.

    Returns
    -------
    Same type as ``data``
        ``(data - mean) / (max - min)``.
    """
    span = data.max() - data.min()

    # Guard against constant features (zero range).
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    normalized = (data - data.mean()) / span
    return normalized

In [None]:
# Convert the DataFrame to float32.
# NOTE(review): this repeats the cast performed in an earlier cell; when the
# notebook is run top to bottom the frame is already float32 at this point,
# so this line changes nothing and could be removed.
data = data.astype('float32')

# Linear Regression

### 1. Data pre-processing 

In [None]:
class linearRegression(nn.Module):
    """A single fully connected layer: ``y = x @ W.T + b``.

    Parameters
    ----------
    inputSize : int
        Number of input features per sample.
    outputSize : int
        Number of values predicted per sample.
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        # The only learnable parameters are this layer's weight and bias.
        self.linear = nn.Linear(inputSize, outputSize)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
# Hyperparameters for the linear model.
inputDim = 8        # number of feature columns fed to the model (every listed column except 'Id' and 'Outcome')
outputDim = 1       # one predicted value per sample (the 'Outcome' target)
learningRate = 0.001
epochs = 1000

# Seed PyTorch before building the model: the layer's weights are initialised
# randomly, so without a fixed seed every "Restart & Run All" would start from
# different weights and report different losses.
torch.manual_seed(42)

model = linearRegression(inputDim, outputDim)

In [None]:
# Separate the feature columns from the target column.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Splitting the dataset into 80% for training and 20% for testing
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the targets to column vectors of shape (n_samples, 1) so they line
# up with the model's single output per sample.
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Normalization statistics must come from the training set only. Capture them
# before x_train is overwritten, then apply the same shift/scale to the test
# set. Normalizing x_test with its own mean/range would put the two sets on
# different scales and leak test-set information into preprocessing.
train_mean = x_train.mean()
train_range = x_train.max() - x_train.min()
train_range = train_range.where(train_range != 0, 1)  # avoid dividing by zero for constant columns

x_test = (x_test - train_mean) / train_range
x_train = Norm(x_train)

In [None]:
# Train on the GPU when one is available. The model has to live on the same
# device as its inputs; moving only the tensors to CUDA (and leaving the model
# on the CPU) makes the forward pass fail with a device-mismatch error.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

# Full-batch training: the inputs and labels are the same in every epoch, so
# build the tensors once instead of re-creating them inside the loop.
inputs = torch.from_numpy(x_train.values).float().to(device)
labels = torch.from_numpy(y_train).float().to(device)

# training
losses = []       # training loss recorded after every epoch
log_every = 100   # print progress every `log_every` epochs to keep the output short

for epoch in range(epochs):
    # Clear gradient buffers so gradients from the previous epoch do not accumulate
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)

    # get gradients w.r.t. the parameters
    loss.backward()

    # update parameters
    optimizer.step()

    losses.append(loss.item())
    if epoch % log_every == 0 or epoch == epochs - 1:
        print('epoch {}, loss {}'.format(epoch, loss.item()))

In [None]:
# Run inference on whichever device the model's parameters currently live on,
# so inputs and model always agree (CPU or GPU).
device = next(model.parameters()).device

# Predictions on the *training* set: this is a fit check, not a held-out
# evaluation. Gradients are not needed for inference.
with torch.no_grad():
    train_inputs = torch.from_numpy(x_train.values).float().to(device)
    predicted = model(train_inputs).cpu().numpy()

# Show only the first few predictions instead of dumping the whole array.
print(predicted[:5])

# One panel per feature. Plotting the full feature matrix on a single axes
# overlays differently scaled features and repeats every legend entry once per
# feature, which makes the figure unreadable.
feature_names = list(x_train.columns)
n_cols = 4
n_rows = -(-len(feature_names) // n_cols)  # ceiling division

pred_fig, pred_axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows), sharey=True, squeeze=False
)
pred_axes = pred_axes.flatten()

for ax, name in zip(pred_axes, feature_names):
    ax.plot(x_train[name], y_train.ravel(), 'go', label='True data', alpha=0.5)
    # Markers rather than a dashed line: the samples are not sorted by feature
    # value, so connecting them would only draw a zigzag.
    ax.plot(x_train[name], predicted.ravel(), 'b.', label='Predictions', alpha=0.5)
    ax.set_xlabel(name)

# Hide any unused panels in the grid.
for ax in pred_axes[len(feature_names):]:
    ax.set_visible(False)

pred_axes[0].set_ylabel('Outcome')
pred_axes[0].legend(loc='best')
pred_fig.suptitle('True vs. predicted Outcome on the training set')
plt.tight_layout()
plt.show()