In [None]:
# Task2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

# Step 1: Data Preprocessing
def load_and_preprocess_dataset(dataset_path, train_size, val_size, test_size, batch_size):
    """Load a passenger CSV and build train/validation/test DataLoaders.

    The CSV must contain the columns referenced below ('PassengerId', 'Name',
    'Ticket', 'Cabin', 'Embarked', 'Survived', 'Sex'). Every column that is
    not dropped is treated as a numerical feature, 'Sex' is one-hot encoded
    (first category dropped) and 'Survived' is the target.

    Args:
        dataset_path: Path of the CSV file to read.
        train_size: Fraction of rows intended for training. Only used to
            derive the validation share of the non-test rows.
        val_size: Fraction of rows intended for validation.
        test_size: Fraction of rows held out for testing.
        batch_size: Batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``. Each batch is a
        pair of a float feature tensor and a long label tensor. Only the
        training loader shuffles.
    """
    df = pd.read_csv(dataset_path)

    # Forward fill missing values; the backward fill covers NaNs in the very
    # first rows, which a forward fill alone cannot reach.
    # (`fillna(method=...)` is deprecated in recent pandas releases.)
    df = df.ffill().bfill()

    # One-hot encode 'Sex', dropping the first category. Done with pandas so
    # the code does not depend on the scikit-learn version (the encoder's
    # `sparse` keyword was renamed to `sparse_output`).
    sex_encoded = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=float)

    numerical_features = df.drop(
        columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Survived', 'Sex']
    )
    numeric_cols = list(numerical_features.columns)
    encoded_cols = list(sex_encoded.columns)

    # Numerical columns first, encoded columns last.
    X = pd.concat([numerical_features, sex_encoded], axis=1)
    y = df['Survived']

    # Train-validation-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size / (train_size + val_size), random_state=42
    )

    # Fit the scaler on the training rows only, so that validation and test
    # statistics do not leak into the features the model is trained on.
    scaler = StandardScaler().fit(X_train[numeric_cols])

    def _to_dataset(features, targets):
        # Scale the numerical columns and pass the encoded columns through.
        scaled = scaler.transform(features[numeric_cols])
        passthrough = features[encoded_cols].to_numpy(dtype=float)
        matrix = np.hstack([scaled, passthrough])
        return TensorDataset(
            torch.tensor(matrix).float(),
            torch.tensor(targets.values).long(),
        )

    train_dataset = _to_dataset(X_train, y_train)
    val_dataset = _to_dataset(X_val, y_val)
    test_dataset = _to_dataset(X_test, y_test)

    # Create DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

# Step 2: Model Architecture
class LogisticRegression(nn.Module):
    """Binary classifier: one linear layer followed by a sigmoid.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Single output unit; the sigmoid maps it to the (0, 1) range.
        self.linear = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        return self.sigmoid(self.linear(x))

# Step 3: Training Process
def train(model, train_loader, val_loader, lr, n_epochs):
    """Fit ``model`` with SGD on binary cross-entropy and track the losses.

    Args:
        model: Module whose output is a probability per sample (it is fed
            straight into ``nn.BCELoss``).
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        lr: Learning rate for the SGD optimizer.
        n_epochs: Number of passes over the training loader.

    Returns:
        Tuple ``(model, train_losses, val_losses)`` where each list holds one
        sample-weighted mean loss per epoch.
    """
    bce = nn.BCELoss()
    sgd = optim.SGD(model.parameters(), lr=lr)
    train_losses, val_losses = [], []

    def _batch_loss(batch_inputs, batch_labels):
        # Labels are reshaped to a column so they line up with the model output.
        return bce(model(batch_inputs), batch_labels.float().view(-1, 1))

    for epoch_no in range(1, n_epochs + 1):
        # Optimisation pass over the training data.
        model.train()
        running_train = 0.0
        for batch_inputs, batch_labels in train_loader:
            sgd.zero_grad()
            batch_loss = _batch_loss(batch_inputs, batch_labels)
            batch_loss.backward()
            sgd.step()
            # Weight by batch size so a smaller final batch is not over-counted.
            running_train += batch_loss.item() * batch_inputs.size(0)
        epoch_train_loss = running_train / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)

        # Evaluation pass, without gradient tracking.
        model.eval()
        running_val = 0.0
        with torch.no_grad():
            for batch_inputs, batch_labels in val_loader:
                batch_loss = _batch_loss(batch_inputs, batch_labels)
                running_val += batch_loss.item() * batch_inputs.size(0)
        epoch_val_loss = running_val / len(val_loader.dataset)
        val_losses.append(epoch_val_loss)

        print(f"Epoch {epoch_no}/{n_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

    return model, train_losses, val_losses

# Step 4: Testing
def test(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print classification metrics.

    Model outputs are rounded to the nearest integer to obtain hard labels.

    Args:
        model: Module producing one probability per sample.
        test_loader: Loader yielding ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(accuracy, f1, cm)``: accuracy, F1 score and confusion matrix.
    """
    model.eval()
    y_true, y_pred = [], []

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            hard_labels = torch.round(model(batch_inputs))
            y_true.extend(batch_labels.numpy())
            y_pred.extend(hard_labels.numpy().flatten().astype(int))

    accuracy, f1, cm = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, f1_score, confusion_matrix)
    )

    print(f"Accuracy: {accuracy}, F1 Score: {f1}")
    print("Confusion Matrix:")
    print(cm)

    return accuracy, f1, cm

# Main Function
def main():
    """Run the Task 2 pipeline: load data, build the model, train, evaluate."""
    # Step 1: 60/20/20 split of the CSV, batches of 32.
    loaders = load_and_preprocess_dataset(
        '/content/train.csv',
        train_size=0.6,
        val_size=0.2,
        test_size=0.2,
        batch_size=32,
    )
    train_loader, val_loader, test_loader = loaders

    # Step 2: the input width is read off the training feature tensor.
    n_features = train_loader.dataset.tensors[0].shape[1]
    model = LogisticRegression(input_size=n_features)

    # Step 3: training
    model, train_losses, val_losses = train(
        model, train_loader, val_loader, lr=0.01, n_epochs=100
    )

    # Step 4: testing (the metrics are printed by `test` itself)
    accuracy, f1, cm = test(model, test_loader)


if __name__ == "__main__":
    main()




Epoch 1/100, Train Loss: 0.6865, Val Loss: 0.6589
Epoch 2/100, Train Loss: 0.6698, Val Loss: 0.6451
Epoch 3/100, Train Loss: 0.6545, Val Loss: 0.6328
Epoch 4/100, Train Loss: 0.6411, Val Loss: 0.6219
Epoch 5/100, Train Loss: 0.6290, Val Loss: 0.6120
Epoch 6/100, Train Loss: 0.6181, Val Loss: 0.6032
Epoch 7/100, Train Loss: 0.6083, Val Loss: 0.5954
Epoch 8/100, Train Loss: 0.5997, Val Loss: 0.5884
Epoch 9/100, Train Loss: 0.5919, Val Loss: 0.5821
Epoch 10/100, Train Loss: 0.5848, Val Loss: 0.5764
Epoch 11/100, Train Loss: 0.5785, Val Loss: 0.5712
Epoch 12/100, Train Loss: 0.5728, Val Loss: 0.5666
Epoch 13/100, Train Loss: 0.5675, Val Loss: 0.5624
Epoch 14/100, Train Loss: 0.5629, Val Loss: 0.5586
Epoch 15/100, Train Loss: 0.5587, Val Loss: 0.5552
Epoch 16/100, Train Loss: 0.5548, Val Loss: 0.5520
Epoch 17/100, Train Loss: 0.5512, Val Loss: 0.5491
Epoch 18/100, Train Loss: 0.5481, Val Loss: 0.5465
Epoch 19/100, Train Loss: 0.5451, Val Loss: 0.5440
Epoch 20/100, Train Loss: 0.5423, Val Lo

In [None]:
# task1
import numpy as np
from sklearn.datasets import fetch_california_housing
import pickle

# Function to split the data into train, validation, and test sets
def data_split(X, Y, train_frac=0.7, val_frac=0.2, shuffle=False, seed=None):
    """Split features and targets into train, validation and test parts.

    Rows are taken in order: the first ``int(train_frac * n)`` rows form the
    training set, the next ``int(val_frac * n)`` rows the validation set and
    all remaining rows the test set.

    Args:
        X: Feature array with one sample per row.
        Y: Target array with the same number of rows as ``X``.
        train_frac: Fraction of rows used for training (default 0.7).
        val_frac: Fraction of rows used for validation (default 0.2).
        shuffle: If True, apply the same random permutation to ``X`` and
            ``Y`` before splitting. Defaults to False (keep original order).
        seed: Seed for the permutation; only used when ``shuffle`` is True.

    Returns:
        Tuple ``(train_X, train_Y, val_X, val_Y, test_X, test_Y)``.

    Raises:
        ValueError: If ``X`` and ``Y`` differ in length, or the fractions are
            negative or sum to more than 1.
    """
    n = len(X)
    if len(Y) != n:
        raise ValueError(f"X and Y must have the same number of rows, got {n} and {len(Y)}")
    # Small tolerance so that fractions such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-12:
        raise ValueError("train_frac and val_frac must be non-negative and sum to at most 1")

    if shuffle:
        # One shared permutation keeps every sample paired with its target.
        order = np.random.default_rng(seed).permutation(n)
        X, Y = X[order], Y[order]

    train_end = int(train_frac * n)
    val_end = train_end + int(val_frac * n)

    train_X, train_Y = X[:train_end], Y[:train_end]
    val_X, val_Y = X[train_end:val_end], Y[train_end:val_end]
    test_X, test_Y = X[val_end:], Y[val_end:]
    return train_X, train_Y, val_X, val_Y, test_X, test_Y

# Function to normalize the data
def normalize(X):
    """Standardize each column of ``X`` to zero mean and unit variance.

    Columns whose standard deviation is zero (constant features) are only
    centered: their divisor is set to 1 so the result contains zeros rather
    than NaN or inf.

    Args:
        X: Array with one sample per row.

    Returns:
        Tuple ``(normalized_X, mean, std)``. ``mean`` and ``std`` are the
        per-column statistics actually applied, so other data can be
        transformed with ``(data - mean) / std``.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Guard against division by zero for constant columns.
    std = np.where(std == 0, 1.0, std)
    normalized_X = (X - mean) / std
    return normalized_X, mean, std

# Function to initialize network
def linear_regression_network(input_size):
    """Return an ``(input_size, 1)`` weight column drawn from a standard normal.

    The draw uses NumPy's global random state, so seed it with
    ``np.random.seed`` for reproducible initial weights.
    """
    return np.random.standard_normal((input_size, 1))

# Function for feed forward
def feed_forward(X, theta):
    """Return the predictions ``X . theta`` (dot product of inputs and weights)."""
    return np.dot(X, theta)

# Function to compute loss
def l2_loss(y_true, y_pred):
    """Return the mean of the squared differences between ``y_pred`` and ``y_true``."""
    residual = y_pred - y_true
    return np.mean(np.square(residual))

# Function to compute gradient
def compute_gradient(X, y_true, y_pred):
    """Return ``X^T (y_pred - y_true)`` divided by the number of rows of ``X``."""
    residual = y_pred - y_true
    n_rows = len(X)
    return X.T.dot(residual) / n_rows

# Function for optimization (SGD)
def optimization(lr, gradient, theta):
    """Apply one gradient-descent step to ``theta`` and return it.

    The update uses ``-=``, so an array passed as ``theta`` is modified in
    place and the returned object is that same array.
    """
    step = lr * gradient
    theta -= step
    return theta

# Function to train the model
def train(net, train_X, train_Y, val_X, val_Y, batch_size, n_epochs, lr):
    """Train the linear model with mini-batch gradient descent.

    Batches are consecutive slices of the training data, visited in the same
    order every epoch.

    Args:
        net: Weight array, updated by ``optimization`` on every batch.
        train_X: Training features, one sample per row.
        train_Y: Training targets aligned with ``train_X``.
        val_X: Validation features.
        val_Y: Validation targets.
        batch_size: Number of rows per mini-batch.
        n_epochs: Number of passes over the training data.
        lr: Learning rate.

    Returns:
        Tuple ``(net, loss_epoch_tr, loss_epoch_val)``. ``loss_epoch_tr``
        holds the sample-weighted mean training loss of each epoch and
        ``loss_epoch_val`` the validation loss measured after each epoch.

    Raises:
        ValueError: If ``train_X`` is empty.
    """
    n_samples = len(train_X)
    if n_samples == 0:
        raise ValueError("train_X is empty; nothing to train on")

    loss_epoch_tr = []
    loss_epoch_val = []
    for epoch in range(n_epochs):
        running_loss = 0.0
        for start in range(0, n_samples, batch_size):
            X_batch = train_X[start:start + batch_size]
            y_batch = train_Y[start:start + batch_size]
            y_hat = feed_forward(X_batch, net)
            # Weight each batch loss by its size so the epoch figure is the
            # mean over all samples, not just the loss of the last batch.
            running_loss += l2_loss(y_batch, y_hat) * len(X_batch)
            gradient = compute_gradient(X_batch, y_batch, y_hat)
            net = optimization(lr, gradient, net)
        loss = running_loss / n_samples
        loss_epoch_tr.append(loss)

        val_y_hat = feed_forward(val_X, net)
        val_loss = l2_loss(val_Y, val_y_hat)
        loss_epoch_val.append(val_loss)
        print(f'Epoch {epoch+1}/{n_epochs}, Training Loss: {loss}, Validation Loss: {val_loss}')
    return net, loss_epoch_tr, loss_epoch_val

# Function to test the model
def test_function(model, test_X, test_Y):
    """Load pickled weights from the path ``model`` and report the test loss.

    Args:
        model: Path of the pickle file holding the trained weights.
        test_X: Test features.
        test_Y: Test targets.

    Returns:
        The L2 loss of the loaded model on the test data (also printed).
    """
    # NOTE(review): pickle.load can execute arbitrary code; only pass files
    # that this program wrote itself or that are otherwise trusted.
    with open(model, 'rb') as model_file:
        weights = pickle.load(model_file)

    test_loss = l2_loss(test_Y, feed_forward(test_X, weights))
    print(f'Test Loss: {test_loss}')
    return test_loss

# Main function
def main():
    """Run the Task 1 pipeline: load, split, normalize, train, save, test."""
    # Load the dataset; the targets are reshaped into a single column.
    housing = fetch_california_housing()
    features = housing.data
    targets = housing.target.reshape(-1, 1)

    # Split data
    train_X, train_Y, val_X, val_Y, test_X, test_Y = data_split(features, targets)

    # Normalize with the training statistics and reuse them for the other
    # two splits.
    train_X, mean, std = normalize(train_X)
    val_X, test_X = ((split - mean) / std for split in (val_X, test_X))

    # One weight per input feature.
    net = linear_regression_network(features.shape[1])

    # Train the model
    batch_size, n_epochs, lr = 32, 100, 0.01
    trained_model, loss_epoch_tr, loss_epoch_val = train(
        net, train_X, train_Y, val_X, val_Y, batch_size, n_epochs, lr
    )

    # Persist the trained weights; test_function reloads them from this file.
    model_path = 'model.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(trained_model, model_file)

    # Test the model
    test_function(model_path, test_X, test_Y)


if __name__ == "__main__":
    main()


Epoch 1/100, Training Loss: 3.8035481100895496, Validation Loss: 9.561462200661463
Epoch 2/100, Training Loss: 3.502288133637066, Validation Loss: 9.254675189607683
Epoch 3/100, Training Loss: 3.820291178034654, Validation Loss: 9.230700057299183
Epoch 4/100, Training Loss: 3.455070581197364, Validation Loss: 9.078773220867811
Epoch 5/100, Training Loss: 3.7709056634994442, Validation Loss: 9.101452901952376
Epoch 6/100, Training Loss: 3.42849988836921, Validation Loss: 8.980421945428384
Epoch 7/100, Training Loss: 3.7420963365307944, Validation Loss: 9.024218500266347
Epoch 8/100, Training Loss: 3.416010235766616, Validation Loss: 8.920644923161374
Epoch 9/100, Training Loss: 3.724402618729343, Validation Loss: 8.975316720778393
Epoch 10/100, Training Loss: 3.410959481752047, Validation Loss: 8.882368018084586
Epoch 11/100, Training Loss: 3.712701931369354, Validation Loss: 8.9424311315647
Epoch 12/100, Training Loss: 3.4098680095217806, Validation Loss: 8.856432387894772
Epoch 13/100