In [None]:
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells (e.g. pandas chained-assignment warnings) will not be shown - confirm
# that is intended rather than filtering a specific warning category.
warnings.filterwarnings('ignore')

In [None]:
# Path of the training CSV; a relative path is resolved against the current
# working directory.
train_data_path = 'train.csv'
# Read the whole training table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=train_data_path)

In [None]:
# Missing-value audit: report the NaN share of every column, then drop the
# columns that are too sparse to be useful.

# Columns whose NaN share (in percent) is strictly above this value are dropped.
# The printed messages are derived from the same constant so that the report
# always matches the filter that is actually applied.
NAN_DROP_THRESHOLD_PCT = 50

# Compute the null mask once and reuse it for both summaries below.
null_mask = df.isnull()

# Check for columns that are entirely NaN
all_nan_columns = df.columns[null_mask.all()].tolist()

if all_nan_columns:
    print("Columns that are entirely NaN:")
    for col in all_nan_columns:
        print(f"- {col}")
else:
    print("No columns are entirely NaN.")

# Mean of a boolean mask is the fraction of True values; scale to percent.
nan_percentage = null_mask.mean() * 100

print("\nPercentage of NaN values in each column:")
print(nan_percentage)

# Sparse columns, worst first.
high_nan_columns = nan_percentage[nan_percentage > NAN_DROP_THRESHOLD_PCT].sort_values(ascending=False)

if not high_nan_columns.empty:
    print(f"\nColumns with more than {NAN_DROP_THRESHOLD_PCT}% NaN values:")
    print(high_nan_columns)
else:
    print(f"\nNo columns have more than {NAN_DROP_THRESHOLD_PCT}% NaN values.")

# Drop the sparse columns; `df` is rebound to the reduced frame.
drop_cols = high_nan_columns.index.tolist()
df = df.drop(columns=drop_cols)

In [None]:
# Name of the raw target column in the loaded table.
LABEL = 'class'
# Integer code assigned to each raw class value.
label_2_class = {'e': 1, 'p': 0}

# Translate each raw class value through the mapping; a value that is not a
# key of `label_2_class` raises KeyError.
df['label'] = df[LABEL].apply(lambda raw_class: label_2_class[raw_class])
# Replace every remaining NaN, in any column, with the sentinel string 'Missing'.
df_filled = df.fillna(value='Missing')

In [None]:
# Encode the categorical columns as integer codes, then standardise them.

# Columns that are kept as-is (not label-encoded).
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']
# Columns that are neither features to encode nor numeric inputs.
non_feature_cols = ['label', 'id', 'class']

features_encoding = [
    col for col in df_filled
    if col not in non_feature_cols and col not in numeric_cols
]

# Keep one fitted encoder per column so each column's code -> category mapping
# stays available (e.g. to encode unseen data consistently). `label_encoder`
# still ends up bound to the encoder of the last column, as before.
label_encoders = {}
label_encoder = LabelEncoder()
for f in features_encoding:
    label_encoder = LabelEncoder()
    df_filled[f] = label_encoder.fit_transform(df_filled[f])
    label_encoders[f] = label_encoder

# Explicit copy: the scaled values assigned below must land in an independent
# frame, not in a selection derived from `df_filled`.
df_feats = df_filled[features_encoding + numeric_cols + ['label']].copy()

# NOTE(review): only the label-encoded columns are standardised; the numeric
# columns are passed through unscaled - confirm this is intended.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so held-out rows influence the scaling statistics.
standard_scaler = StandardScaler()
df_feats[features_encoding] = standard_scaler.fit_transform(df_feats[features_encoding])

In [None]:
# Target column name, and every other column of `df_feats` as a model input.
label = 'label'
features = [column for column in df_feats.columns if column != label]
print(features,label)

In [None]:
# Drop rows whose numeric inputs still hold the 'Missing' sentinel written by
# the earlier fillna step. All three numeric columns are checked (not only
# 'cap-diameter'), because a leftover 'Missing' string in any of them cannot be
# converted to float when the tensors are built.
numeric_input_cols = ['cap-diameter', 'stem-height', 'stem-width']
has_all_numeric_inputs = (df_feats[numeric_input_cols] != 'Missing').all(axis=1)
df_feats = df_feats[has_all_numeric_inputs]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_feats[features],
    df_feats[label],
    test_size=0.2,
    random_state=42,
)

In [None]:
class MyDataset(Dataset):
    """In-memory dataset pairing a float32 feature tensor with int64 labels.

    Args:
        X: feature array; it must provide ``astype`` (it is cast to float
           before being turned into a tensor).
        y: label array, converted to a ``torch.long`` tensor.
    """

    def __init__(self,X,y):
        super().__init__()
        # Cast first so the values are plain floats before the tensor is built.
        features_as_float = X.astype(float)
        self.X = torch.tensor(features_as_float, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self,idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# Convert the pandas splits to NumPy arrays, which is what MyDataset consumes.
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_test_np, y_test_np = X_test.to_numpy(), y_test.to_numpy()

# Wrap each split in a dataset.
train_data = MyDataset(X_train_np, y_train_np)
test_data = MyDataset(X_test_np, y_test_np)

# Mini-batches of 32; only the training loader shuffles.
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=32, shuffle=False)

# Pull one training batch and display its shape as the cell output.
data_iter = iter(train_loader)
example_data, example_label = next(data_iter)
example_data.shape

In [None]:
class MyNet(nn.Module):
    """Three-layer fully connected classifier (in_features -> 512 -> 64 -> num_classes).

    Args:
        in_features: width of each input row. Defaults to 20, the previously
            hard-coded value; pass the real number of feature columns when it
            differs, otherwise the first layer rejects the input.
        num_classes: number of output logits. Defaults to 2.

    The forward pass returns raw logits (no softmax), which is the form
    ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self, in_features=20, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 512)
        self.fc2 = nn.Linear(512, 64)
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self,x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, num_classes)`` logits."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Train the classifier and report loss/accuracy on both splits after every epoch.

# Seed torch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# NOTE(review): MyNet's first layer is built for 20 input features by default;
# confirm that matches the number of feature columns served by train_loader.
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()
epochs = 30
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    num_data = 0
    total_correct = 0
    # The batch label is named `target` so it does not overwrite the
    # module-level `label` column name used by earlier cells.
    for data, target in train_loader:
        output = model(data)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_size = target.shape[0]
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a true per-sample average even if the last batch is short.
        # `.item()` extracts the Python float (tensors have no `.items()`).
        total_loss += loss.item() * batch_size
        num_data += batch_size
        # Count correct predictions as a plain int, not a tensor.
        total_correct += (output.argmax(dim=1) == target).sum().item()
    print(f'Epoch:{epoch+1},Loss:{total_loss/num_data},Acc:{total_correct/num_data}')
    print('\n--------------- validation ---------------\n')
    # ---- validation pass: inference mode, no gradient tracking ----
    model.eval()
    val_loss = 0.0
    num_val = 0
    val_correct = 0
    with torch.no_grad():
        for val_data, val_label in test_loader:
            val_output = model(val_data)
            loss = criterion(val_output, val_label)
            val_batch_size = val_label.shape[0]
            val_loss += loss.item() * val_batch_size
            num_val += val_batch_size
            val_correct += (val_output.argmax(dim=1) == val_label).sum().item()
    print(f'Epoch:{epoch+1},Loss:{val_loss/num_val},Acc:{val_correct/num_val}')