In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read the training and test splits from their CSV files in the working directory
train_data = pd.read_csv("fashion_train.csv")
test_data = pd.read_csv("fashion_test.csv")

# Preview the first rows of each frame (training first, then test)
print(train_data.head(), test_data.head(), sep="\n")

   label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0      0       0       0       0       1       2       0       0       0   
1      1       0       0       0       0       0       0       0       0   
2      0       0       0       0       0       1       0       0       0   
3      1       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0       0  ...         3         0         0         0         0         1   
1       0  ...       203       214       166         0         0         0   
2       0  ...       164       177       163         0         0         1   
3       0  ...         9        10         9         9         8         1   
4       0  ...         0         0         0         0         0         0   

   pixel781  pixel782  pixel783  pixel784  
0         0         0         

In [2]:
# Targets come from the "label" column; every remaining column is a feature.
# Dividing by 255.0 rescales the features (presumably 8-bit pixel intensities
# in 0-255 -- confirm against the dataset) into the 0-1 range.
y = train_data["label"].values
X = train_data.drop(columns=["label"]).values / 255.0

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Apply the same 1/255 scaling to the test frame (all of its columns are used)
X_test = test_data.values / 255.0

# Report the shape of each split
print(
    f"Training set shape: {X_train.shape}",
    f"Validation set shape: {X_val.shape}",
    f"Test set shape: {X_test.shape}",
    sep="\n",
)


Training set shape: (9600, 784)
Validation set shape: (2400, 784)
Test set shape: (2000, 784)


In [4]:
class LogisticRegression:
    """Binary logistic-regression classifier trained with full-batch gradient descent.

    Attributes:
        W: 1-D weight vector of length ``input_dim``.
        b: scalar bias term.
        learning_rate: step size applied to every gradient update.
    """

    def __init__(self, input_dim, learning_rate=0.01, random_state=None):
        """Create a model with small random weights and a zero bias.

        Args:
            input_dim: number of features per sample.
            learning_rate: gradient-descent step size.
            random_state: optional seed for the weight initialisation. When
                ``None`` (the default) the global NumPy random state is used,
                so existing callers behave exactly as before.
        """
        if random_state is None:
            noise = np.random.randn(input_dim)
        else:
            noise = np.random.default_rng(random_state).standard_normal(input_dim)
        self.W = noise * 0.01  # Small random weights
        self.b = 0.0  # Bias initialized to 0
        self.learning_rate = learning_rate

    def sigmoid(self, z):
        """Numerically stable sigmoid activation, 1 / (1 + exp(-z)).

        The naive form overflows in ``exp(-z)`` for large negative ``z``.
        Here the exponent is always non-positive, so it can only underflow
        towards 0, and the algebraically equivalent branch is picked per element.
        """
        decay = np.exp(-np.abs(z))  # in (0, 1]; never overflows
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        z = np.dot(X, self.W) + self.b
        return self.sigmoid(z)

    def predict(self, X):
        """Classify into 0 or 1 based on probability threshold 0.5."""
        return (self.predict_proba(X) >= 0.5).astype(int)

    def compute_loss(self, y_true, y_pred, eps=1e-15):
        """Mean binary cross-entropy between labels and predicted probabilities.

        Args:
            y_true: array of 0/1 labels.
            y_pred: array of predicted probabilities, same shape as ``y_true``.
            eps: predictions are clipped to ``[eps, 1 - eps]`` so that a
                saturated prediction (exactly 0 or 1) yields a large finite
                loss instead of ``log(0)`` producing inf/NaN.

        Returns:
            The scalar mean loss.
        """
        # Cast to float64 so that 1 - eps is representable and the clip is effective.
        y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def train(self, X_train, y_train, X_val, y_val, epochs=1000):
        """Train the model with full-batch gradient descent.

        Every epoch uses the whole training set for a single gradient step.
        Training and validation losses are printed every 100 epochs; they are
        evaluated with the parameters as they were before that epoch's update.

        Args:
            X_train: training feature matrix, one row per sample.
            y_train: training labels (0/1), one per row of ``X_train``.
            X_val: validation feature matrix (used for reporting only).
            y_val: validation labels (used for reporting only).
            epochs: number of gradient steps to take.
        """
        m = X_train.shape[0]
        for epoch in range(epochs):
            y_pred_train = self.predict_proba(X_train)
            error = y_pred_train - y_train

            # Losses are only needed for the periodic report, so compute them
            # (and the validation forward pass) only on reporting epochs.
            if epoch % 100 == 0:
                train_loss = self.compute_loss(y_train, y_pred_train)
                val_loss = self.compute_loss(y_val, self.predict_proba(X_val))
                print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Validation Loss = {val_loss:.4f}")

            # Gradients of the mean cross-entropy w.r.t. weights and bias
            dW = np.dot(X_train.T, error) / m
            db = np.mean(error)

            # Update weights and bias
            self.W -= self.learning_rate * dW
            self.b -= self.learning_rate * db


In [5]:
# Seed NumPy's global generator so the model's random weight initialisation,
# and therefore the training log below, is reproducible on a fresh kernel.
np.random.seed(42)

# Initialize and train model
model = LogisticRegression(input_dim=X_train.shape[1], learning_rate=0.01)
model.train(X_train, y_train, X_val, y_val, epochs=1000)


Epoch 0: Train Loss = 0.7030, Validation Loss = 0.7029
Epoch 100: Train Loss = 0.1648, Validation Loss = 0.1629
Epoch 200: Train Loss = 0.1283, Validation Loss = 0.1258
Epoch 300: Train Loss = 0.1142, Validation Loss = 0.1113
Epoch 400: Train Loss = 0.1065, Validation Loss = 0.1034
Epoch 500: Train Loss = 0.1016, Validation Loss = 0.0984
Epoch 600: Train Loss = 0.0982, Validation Loss = 0.0948
Epoch 700: Train Loss = 0.0956, Validation Loss = 0.0922
Epoch 800: Train Loss = 0.0936, Validation Loss = 0.0901
Epoch 900: Train Loss = 0.0920, Validation Loss = 0.0885


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hard 0/1 predictions for the held-out validation rows
y_pred_val = model.predict(X_val)

# Score the predictions against the validation labels with each metric in turn
accuracy, precision, recall, f1 = (
    metric(y_val, y_pred_val)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report all four scores to four decimal places
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.9742
Precision: 0.9719
Recall: 0.9768
F1-score: 0.9743


In [7]:
# Probability of class 1 for every test row (no thresholding applied)
y_test_pred_proba = model.predict_proba(X_test)

# Assemble the submission table: a 0-based row id next to the raw probability
submission_df = pd.DataFrame(
    {
        "id": np.arange(len(y_test_pred_proba)),
        "prediction": y_test_pred_proba,
    }
)

# Write the table to disk without the DataFrame index column
submission_df.to_csv("submission_baseline.csv", index=False)
print("Kaggle submission file saved as 'submission_baseline.csv'.")


Kaggle submission file saved as 'submission_baseline.csv'.
