In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load myopia.csv; the file uses ';' as its field separator.
df = pd.read_csv('myopia.csv', sep=';')

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Small fully connected network: one Linear + ReLU pair per hidden width, then Softmax.

    Note that there is no separate output layer: the softmax is applied directly
    to the activations of the last hidden layer.

    Args:
        x: Number of input features fed to the first linear layer.
        layer_config: Iterable of hidden-layer widths (one Linear + ReLU pair each).
    """

    def __init__(self, x, layer_config=(10, 10)):
        super().__init__()
        layers = []
        input_size = x
        for hidden_neurons in layer_config:
            # Add a linear layer followed by its ReLU activation
            layers.append(nn.Linear(input_size, hidden_neurons))
            layers.append(nn.ReLU())
            input_size = hidden_neurons
        # Normalise over the last (feature) axis explicitly. Leaving `dim` unset
        # makes PyTorch guess the axis and emit a deprecation warning on every
        # forward pass; dim=-1 matches the guessed axis for 1-D and 2-D inputs.
        layers.append(nn.Softmax(dim=-1))

        # Combine all layers into a sequential module
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the softmax output."""
        return self.model(x)

# Example usage: build a 10-input network with two 20-unit hidden layers
# and print its layer structure.
model0 = Model(10, [20, 20])
print(model0)

Model(
  (model): Sequential(
    (0): Linear(in_features=10, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=20, bias=True)
    (3): ReLU()
    (4): Softmax(dim=None)
  )
)


In [5]:
from ctgan import CTGAN
import pandas as pd

# `df` is the DataFrame loaded from myopia.csv; CTGAN is fitted on the whole frame.
# Columns that CTGAN should treat as discrete rather than continuous.
discrete_columns = ['AGE', 'GENDER', 'SPORTHR', 'READHR', 'COMPHR', 'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY']

# Number of synthetic rows to draw; named so the comment and the call cannot drift apart.
n_synthetic_samples = 300

ctgan = CTGAN(epochs=10)  # Increase epochs for better quality
ctgan.fit(df, discrete_columns=discrete_columns)

# Generate the new synthetic samples
synthetic_data = ctgan.sample(n_synthetic_samples)

# Append the synthetic rows to the original dataset for training.
# NOTE: this rebinds `df`, so re-running this cell without reloading the CSV
# fits CTGAN on already-augmented data and appends another batch of rows.
df = pd.concat([df, synthetic_data], ignore_index=True)


In [6]:

from sklearn.model_selection import train_test_split
import pandas as pd

# Stratified 75% / 15% / 10% split of `df` on the MYOPIC label.
# NOTE(review): `df` already includes the CTGAN-generated rows at this point, so
# the validation and test splits contain synthetic samples too - confirm this is intended.
feature_columns = ['AGE', 'GENDER', 'SPORTHR', 'READHR', 'COMPHR', 'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY']
target_column = 'MYOPIC'

# Step 1: training (75%) vs. held-out remainder (25%)
datatrain, temp_data = train_test_split(
    df,
    test_size=0.25,
    stratify=df[target_column],
    random_state=2,
)

# Step 2: remainder -> validation (60% of it) and test (40% of it)
dataval, datatest = train_test_split(
    temp_data,
    test_size=0.4,
    stratify=temp_data[target_column],
    random_state=21,
)

# Feature matrices and label vectors for each split
x_train, y_train = datatrain[feature_columns], datatrain[target_column]
x_val, y_val = dataval[feature_columns], dataval[target_column]
x_test, y_test = datatest[feature_columns], datatest[target_column]

# Confirm the sizes
print(f"Train Data Size: {len(datatrain)} ({len(datatrain)/len(df)*100:.2f}%)")
print(f"Validation Data Size: {len(dataval)} ({len(dataval)/len(df)*100:.2f}%)")
print(f"Test Data Size: {len(datatest)} ({len(datatest)/len(df)*100:.2f}%)")


Train Data Size: 688 (74.95%)
Validation Data Size: 138 (15.03%)
Test Data Size: 92 (10.02%)


In [7]:
# 4. Hyperparameters
lr = 0.1                       # optimizer learning rate
epochs = 50                    # number of full passes over the training set
batch_size = 64                # samples per mini-batch
layers = [500, 500, 500, 500]  # hidden-layer widths
dropout_factor = 0             # dropout probability after each hidden layer (0 disables it)
# No optimizer is created here: at this point `model0` is still the small demo
# network, and the training cell builds its own optimizer right after it
# constructs the real model.

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import torch.nn.init as init
from tqdm import tqdm

# Prefer the GPU when PyTorch reports CUDA as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# 1. Custom Dataset definition
#    Labels are float in [0,1].
class DataTest(Dataset):
    """In-memory dataset that pairs each feature row with its label.

    Both inputs must expose ``.values`` (as pandas DataFrame / Series do); each
    is converted once, up front, into a float32 tensor.
    """

    def __init__(self, x_data, y_data):
        features, targets = (
            torch.tensor(source.values, dtype=torch.float32)
            for source in (x_data, y_data)
        )
        self.x = features
        self.y = targets
        # Sample count is taken from the feature container.
        self.n_samples = len(x_data)

    def __len__(self):
        """Number of samples in the dataset."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target

# 2. Model definition with random weight initialization
class Model(nn.Module):
    """Fully connected classifier that returns raw logits.

    Each hidden width in ``layer_config`` contributes a Linear -> ReLU -> Dropout
    group; a final Linear layer maps to ``num_classes`` outputs. No softmax is
    applied inside the model.

    Args:
        input_neurons: Number of input features.
        layer_config: Iterable of hidden-layer widths. ``None`` (default) falls
            back to the notebook-level ``layers`` setting, looked up when the
            model is constructed rather than when the class is defined.
        num_classes: Width of the output (logit) layer.
        dropout: Dropout probability used after every hidden ReLU. ``None``
            (default) falls back to the notebook-level ``dropout_factor`` setting.
    """

    def __init__(self, input_neurons, layer_config=None, num_classes=2, dropout=None):
        super().__init__()
        # Resolve notebook-level defaults lazily so the class can be defined (and
        # reused) without those globals existing, as long as values are passed in.
        if layer_config is None:
            layer_config = layers
        if dropout is None:
            dropout = dropout_factor

        # Named `modules` so the notebook-level `layers` list is not shadowed.
        modules = []
        input_size = input_neurons
        for hidden_neurons in layer_config:
            modules.append(nn.Linear(input_size, hidden_neurons))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(dropout))  # Dropout regularization
            input_size = hidden_neurons
        modules.append(nn.Linear(input_size, num_classes))
        self.model = nn.Sequential(*modules)

        # Initialize weights
        self._initialize_weights()

    def forward(self, x):
        """Return the logits produced by the stacked layers for ``x``."""
        return self.model(x)

    def _initialize_weights(self):
        """Re-initialise every Linear layer: Kaiming-normal weights, zero biases."""
        for layer in self.model:
            if isinstance(layer, nn.Linear):
                # Apply He (Kaiming) initialization - Normal distribution
                init.kaiming_normal_(layer.weight, nonlinearity='relu')

                # Initialize bias to zeros (optional)
                if layer.bias is not None:
                    init.constant_(layer.bias, 0)

# 3. Numerically stable soft-label cross-entropy loss using log_softmax
def soft_label_crossentropy(logits, label):
    """
    Cross-entropy between two-class logits and a soft positive-class label.

    logits: shape (batch_size, 2)
    label: shape (batch_size,) with float values in [0,1]

    Returns the loss averaged over the batch as a scalar tensor.
    """
    # Per-sample target distribution is [1 - y, y] -> shape [batch_size, 2]
    negative_weight = 1 - label
    target_dist = torch.stack((negative_weight, label), dim=1)
    # log_softmax gives log-probabilities without an explicit exp/log round trip
    log_prob = F.log_softmax(logits, dim=1)
    # -sum_k target_k * log p_k for each sample, then the batch mean
    weighted = target_dist * log_prob
    loss_per_sample = -weighted.sum(dim=1)
    return loss_per_sample.mean()

# Wrap each (features, labels) pair of the three splits in a DataTest dataset.
try:
    train_dataset, val_dataset, test_dataset = (
        DataTest(features, targets)
        for features, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
    )
except NameError:
    raise NameError("Please define x_train, y_train, x_val, y_val, and x_test, y_test before running the script.")

# 6. Create data loaders
# Only the training loader reshuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 7. Initialize model and optimizer
# The input width is the number of feature columns in the training frame.
input_features = x_train.shape[1]
model0 = Model(
    input_neurons=input_features,
    layer_config=layers,
    num_classes=2,
).to(device)
optimizer = torch.optim.Adam(model0.parameters(), lr=lr)

# 8. Training and validation loop
def _run_epoch(model, loader, optimizer=None):
    """Run one full pass over ``loader`` and return summary metrics.

    With an ``optimizer`` the model is switched to training mode and updated
    after every batch. Without one the model is switched to eval mode and the
    pass runs with gradient tracking disabled. Uses the notebook-level
    ``device`` and ``soft_label_crossentropy``.

    Returns:
        dict with
            loss      - mean of the per-batch losses
            accuracy  - percentage of samples whose thresholded (>= 0.5)
                        class-1 probability equals the label
            mean_prob - mean predicted class-1 probability over all samples
            mean_diff - mean absolute difference between that probability and the label
            probs     - list of every predicted class-1 probability, in loader order
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    diff_sum = 0.0  # Sum of absolute differences
    probs = []

    with torch.set_grad_enabled(training):
        for data, label in loader:
            data = data.to(device)
            label = label.to(device)

            # Forward pass: logits have shape (batch_size, 2)
            logits = model(data)
            loss = soft_label_crossentropy(logits, label)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()

            # Metrics use the logits computed before the parameter update;
            # detach so no autograd graph is carried along.
            p_pos = F.softmax(logits.detach(), dim=1)[:, 1]
            preds = (p_pos >= 0.5).float()    # threshold at 0.5 for binary decision
            n_correct += (preds == label).sum().item()
            n_seen += label.size(0)
            diff_sum += torch.abs(p_pos - label).sum().item()
            probs.extend(p_pos.cpu().tolist())

    return {
        "loss": loss_sum / len(loader),
        "accuracy": (n_correct / n_seen) * 100,
        "mean_prob": np.mean(probs),
        "mean_diff": diff_sum / n_seen,
        "probs": probs,
    }


_prev_line_len = 0
for epoch in range(epochs):
    train_stats = _run_epoch(model0, train_loader, optimizer)
    val_stats = _run_epoch(model0, val_loader)

    progress = (
        f"Epoch {epoch+1}/{epochs} | "
        f"Train Loss: {train_stats['loss']:.5f}, Train Acc: {train_stats['accuracy']:.2f}%, "
        f"Val Loss: {val_stats['loss']:.5f}, Val Acc: {val_stats['accuracy']:.2f}%, "
        f"Avg Softmax (Train): {train_stats['mean_prob']:.4f}, Avg Diff (Train): {train_stats['mean_diff']:.4f}, "
        f"Avg Softmax (Val): {val_stats['mean_prob']:.4f}, Avg Diff (Val): {val_stats['mean_diff']:.4f}"
    )
    # The line is redrawn in place with "\r"; pad it to the previous width so a
    # shorter line does not leave trailing characters from the longer one.
    print(progress.ljust(_prev_line_len), end="\r")
    _prev_line_len = len(progress)

# ---- Testing Phase ----
test_stats = _run_epoch(model0, test_loader)
avg_test_loss = test_stats["loss"]
test_accuracy = test_stats["accuracy"]
avg_test_softmax = test_stats["mean_prob"]
avg_test_diff = test_stats["mean_diff"]
test_softmax_outputs = test_stats["probs"]

print(f"\nTest Loss: {avg_test_loss:.5f}, Test Acc: {test_accuracy:.2f}%, "
      f"Avg Softmax (Test): {avg_test_softmax:.4f}, Avg Diff (Test): {avg_test_diff:.4f}")


Using device: cpu
Epoch 50/50 | Train Loss: 0.49270, Train Acc: 80.67%, Val Loss: 0.49414, Val Acc: 81.16%, Avg Softmax (Train): 0.1862, Avg Diff (Train): 0.3077, Avg Softmax (Val): 0.2458, Avg Diff (Val): 0.33968.1884
Test Loss: 0.44292, Test Acc: 80.43%, Avg Softmax (Test): 0.2513, Avg Diff (Test): 0.3486


In [9]:
# Re-evaluate the trained model on the test split, this time with argmax
# predictions and the full two-class softmax output.
# Assumes x_test and y_test are a pandas DataFrame and Series, respectively.

# 1. Create a test dataset and data loader
test_dataset = DataTest(x_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# 2. Evaluation on the test set
model0.eval()  # Set the model to evaluation mode
total_test_correct = 0
total_test_samples = 0
test_softmax_outputs = []  # To collect softmax outputs for the test set

with torch.no_grad():  # No gradient computation for evaluation
    for data, label in test_loader:
        # Move each batch to the model's device as well as casting it; feeding a
        # CPU batch to a CUDA-resident model raises a device-mismatch error.
        data = data.to(device=device, dtype=torch.float32)
        label = label.to(device=device, dtype=torch.long)

        outputs = model0(data)

        # Apply softmax to get probabilities and collect
        softmax_outputs = F.softmax(outputs, dim=1).tolist()
        test_softmax_outputs.extend(softmax_outputs)

        # Predictions and accuracy
        preds = torch.argmax(outputs, dim=1)
        total_test_correct += (preds == label).sum().item()
        total_test_samples += label.size(0)

# Calculate test accuracy
test_accuracy = (total_test_correct / total_test_samples) * 100
# Column-wise mean -> average probability assigned to each of the two classes
avg_test_softmax = torch.tensor(test_softmax_outputs).mean(dim=0)

# Print test set results
print(f"Test Accuracy: {test_accuracy:.2f}%")
print(f"Avg Softmax (Test): {avg_test_softmax.tolist()}")


Test Accuracy: 80.43%
Avg Softmax (Test): [0.7487205862998962, 0.2512793242931366]


In [10]:
import torch

# Persist only the model's state dictionary (the recommended form) to disk.
checkpoint_path = 'Model0.pth'
torch.save(model0.state_dict(), checkpoint_path)


In [17]:
# Display the current `df` (rebound earlier to the original rows plus the
# CTGAN-generated rows); pandas abbreviates long frames with an ellipsis row.
df

Unnamed: 0,ID,STUDYYEAR,MYOPIC,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1,1992,1,6,1,-0.052000,21.890000,3.690000,3.498000,14.700000,45,8,0,0,10,34,1,1
1,2,1995,0,6,1,0.608000,22.380000,3.702000,3.392000,15.290000,4,0,1,1,7,12,1,1
2,3,1991,0,6,1,1.179000,22.490000,3.462000,3.514000,15.520000,14,0,2,0,10,14,0,0
3,4,1990,1,6,1,0.525000,22.200000,3.862000,3.612000,14.730000,18,11,0,0,4,37,0,1
4,5,1995,0,5,0,0.697000,23.290000,3.676000,3.454000,16.160000,14,0,0,0,4,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,617,1990,0,5,1,0.915113,22.068130,3.135561,3.568084,14.190430,14,12,10,4,9,15,0,0
914,337,1993,0,5,1,0.909335,23.192723,2.682925,3.408517,15.074215,29,8,2,12,23,2,0,0
915,242,1990,0,7,0,0.677650,22.328070,3.375869,3.503506,17.208199,7,7,11,4,22,9,0,1
916,459,1994,0,8,0,0.893230,23.379400,3.526105,3.927830,15.267251,8,15,12,12,25,11,0,0


In [11]:
import requests

# Backend API URL: the /survey route of the backend running on localhost
url = "http://127.0.0.1:5000/survey"

try:
    # Make a GET request. The timeout bounds the wait so a stopped or hung
    # backend cannot block the notebook indefinitely.
    response = requests.get(url, timeout=10)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        datai = response.json()
        print("Data fetched successfully:", datai)
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        print("Response:", response.text)
except (requests.RequestException, ValueError) as e:
    # RequestException covers connection errors and timeouts; ValueError covers
    # a 200 response whose body is not valid JSON. Anything else is a real bug
    # and is allowed to surface.
    print("Error occurred:", e)


Data fetched successfully: {'survey': [{'age': 19, 'gender': 'Male', 'id': 1, 'name': 'Y Niteesh Reddy', 'time_reading': 44, 'time_sports': 23, 'time_studying': 86, 'time_tv': 5123456789, 'time_video_games': 323}]}


In [16]:
# Show the parsed JSON payload that the survey request cell stored in `datai`.
datai

{'survey': [{'age': 19,
   'gender': 'Male',
   'id': 1,
   'name': 'Y Niteesh Reddy',
   'time_reading': 44,
   'time_sports': 23,
   'time_studying': 86,
   'time_tv': 5123456789,
   'time_video_games': 323}]}