In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [42]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'training_data.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Encode the target variable as binary
data['increase_stock_binary'] = data['increase_stock'].apply(lambda x: 1 if x == 'high_bike_demand' else 0)

# Drop the original target variable and separate features and the new target variable
X = data.drop(columns=['increase_stock', 'increase_stock_binary'])
y = data['increase_stock_binary']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=0)

# Convert the scaled training and test data to PyTorch tensors
X_train_tensor = torch.Tensor(X_train)
X_test_tensor = torch.Tensor(X_test)

# Create DataLoader for both training and test set
batch_size = 128
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)  # Autoencoder outputs are the same as inputs
test_dataset = TensorDataset(X_test_tensor, X_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the Autoencoder architecture
class Autoencoder(nn.Module):
    def __init__(self, n_features):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(n_features, 256),
            nn.ReLU(True),
            nn.Linear(256, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 12)  # compress to 3 features which is the size of our encoded representations
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, n_features),
            nn.Sigmoid()  # since the input is scaled between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Initialize the network
n_features = X_train_tensor.shape[1]
autoencoder = Autoencoder(n_features)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Training loop
n_epochs = 100

for epoch in range(n_epochs):
    autoencoder.train()
    train_loss = 0.0
    for data in train_loader:
        inputs, _ = data  # No target variable needed for autoencoder
        optimizer.zero_grad()
        outputs = autoencoder(inputs)
        loss = criterion(outputs, inputs)  # Compare outputs with inputs
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)
    train_loss = train_loss / len(train_loader.dataset)
    
    # Print out the loss periodically
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{n_epochs} \t Training Loss: {train_loss:.4f}')

# Encoding the training set to get the compressed representation
autoencoder.eval()  # Evaluation mode
X_train_encoded = autoencoder.encoder(X_train_tensor).detach().numpy()
X_test_encoded = autoencoder.encoder(X_test_tensor).detach().numpy()

print(X_train_encoded.shape, X_test_encoded.shape)

Epoch 10/100 	 Training Loss: 0.9090
Epoch 20/100 	 Training Loss: 0.8169
Epoch 30/100 	 Training Loss: 0.7556
Epoch 40/100 	 Training Loss: 0.7348
Epoch 50/100 	 Training Loss: 0.7163
Epoch 60/100 	 Training Loss: 0.7038
Epoch 70/100 	 Training Loss: 0.6982
Epoch 80/100 	 Training Loss: 0.6941
Epoch 90/100 	 Training Loss: 0.6921
Epoch 100/100 	 Training Loss: 0.6832
(1200, 12) (400, 12)


In [43]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter grid for KNN
param_grid = {
    'n_neighbors': range(1,31),  # Different values for n_neighbors
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize KNN classifier
knn = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy')

# Fit the grid search to the encoded data
grid_search.fit(X_train_encoded, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print('Best Parameters:', best_params)
print('Best Score:', best_score)


Best Parameters: {'metric': 'euclidean', 'n_neighbors': 14, 'weights': 'distance'}
Best Score: 0.8641666666666665
