In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


# Load data
train_data = pd.read_csv('../data/Train.csv')
val_data = pd.read_csv('../data/Validate.csv')
test_data = pd.read_csv('../data/Test.csv')

# ID,Married,Gender,Age,Graduate,Profession,Years_of_Working ,Spending_Score,Family_Members,Category,Class(Target)
# ID : int
# Married : 0/1 NAN -> -1
# Gender : 0/1 NAN -> -1
# Age : int NAN -> -1
# Graduate : 0/1 NAN -> -1
# Profession : string -> LabelEncoder NAN -> "Unknown"
# Years_of_Working : int NAN -> -1
# Spending_Score : string -> LabelEncoder NAN -> "Unknown"
# Family_Members : int NAN -> -1
# Category : string -> LabelEncoder NAN -> "Unknown"
# Class(Target): string -> LabelEncoder NAN -> "Unknown"

# Preprocess data
def preprocess_data(data):
    # Fill missing values for string columns with the mode (most frequent value)
    data['Married'] = data['Married'].fillna(data['Married'].mode()[0])  # Most frequent value
    data['Gender'] = data['Gender'].fillna(data['Gender'].mode()[0])  # Most frequent value
    data['Profession'] = data['Profession'].fillna(data['Profession'].mode()[0])  # Most frequent value
    data['Spending_Score'] = data['Spending_Score'].fillna(data['Spending_Score'].mode()[0])  # Most frequent value
    data['Category'] = data['Category'].fillna(data['Category'].mode()[0])  # Most frequent value

    # Fill missing values for numerical columns with the mean and round to nearest integer
    data['Age'] = data['Age'].fillna(data['Age'].mean()).round().astype(int)  # Mean and rounding
    data['Years_of_Working '] = data['Years_of_Working '].fillna(data['Years_of_Working '].mean()).round().astype(int)  # Mean and rounding
    data['Family_Members'] = data['Family_Members'].fillna(data['Family_Members'].mean()).round().astype(int)  # Mean and rounding

    # Fill missing values for binary columns (0/1) with the most frequent value
    data['Graduate'] = data['Graduate'].fillna(data['Graduate'].mode()[0])  # Most frequent value
    data['Married'] = data['Married'].fillna(data['Married'].mode()[0])  # Most frequent value

    # Label encoding for string columns
    le = LabelEncoder()
    data['Married'] = le.fit_transform(data['Married'])
    data['Gender'] = le.fit_transform(data['Gender'])
    data['Graduate'] = le.fit_transform(data['Graduate'])
    data['Profession'] = le.fit_transform(data['Profession'])
    data['Spending_Score'] = le.fit_transform(data['Spending_Score'])
    data['Category'] = le.fit_transform(data['Category'])
    data['Class(Target)'] = le.fit_transform(data['Class(Target)'])
    return data

train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)
test_data = preprocess_data(test_data)

# Split data
X_train = train_data.drop(['ID','Class(Target)'],axis=1)
y_train = train_data['Class(Target)']
X_val = val_data.drop(['ID','Class(Target)'],axis=1)
y_val = val_data['Class(Target)']
X_test = test_data.drop(['ID','Class(Target)'],axis=1)
y_test = test_data['Class(Target)']

# Standardize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print(X_train,X_train.shape)
print(y_train,y_train.shape)
print(X_val,X_val.shape)
print(y_val,y_val.shape)
print(X_test,X_test.shape)
print(y_test,y_test.shape)

[[ 0.90791946  1.19816059 -0.4430708  ...  0.74762549 -0.56061626
   0.60143067]
 [-1.10141929  1.19816059  0.33542047 ...  0.74762549 -1.22802506
   0.60143067]
 [-1.10141929 -0.83461266 -1.10179418 ...  0.74762549 -1.22802506
  -1.5265993 ]
 ...
 [-1.10141929 -0.83461266  1.35344752 ... -1.6101976  -0.56061626
   0.60143067]
 [ 0.90791946 -0.83461266  2.19182273 ...  0.74762549 -0.56061626
   0.60143067]
 [-1.10141929  1.19816059 -1.28144601 ...  0.74762549  0.77420134
   0.60143067]] (7487, 9)
0       2
1       0
2       1
3       0
4       0
       ..
7482    1
7483    3
7484    3
7485    3
7486    1
Name: Class(Target), Length: 7487, dtype: int64 (7487,)
[[-1.10141929 -0.83461266 -0.4430708  ...  0.74762549 -0.56061626
   0.60143067]
 [ 0.90791946 -0.83461266 -0.02388319 ... -0.43128606  0.77420134
   0.60143067]
 [-1.10141929 -0.83461266  0.57495625 ... -1.6101976   0.77420134
   0.60143067]
 ...
 [-1.10141929  1.19816059 -1.28144601 ...  0.74762549  1.44161014
   0.60143067]
 [ 

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(f"{model_name} Accuracy: {accuracy_score(y_val, y_pred)}")


Logistic Regression Accuracy: 0.4462114125350795
Support Vector Classifier Accuracy: 0.47988774555659497
Random Forest Accuracy: 0.41253507951356405
Gradient Boosting Accuracy: 0.4761459307764266
