In [1]:
import numpy as np

In [2]:
train_df = np.load('../data/preprocessed/train_data.npz')
test_df = np.load('../data/preprocessed/test_data.npz')

In [3]:
train_df['x_train'].shape
# test_df['x_test'].shape
# test_df['y_test'].shape

(89996, 50)

In [24]:
X_train = train_df['x_train']
y_train = train_df['y_train']
X_test = test_df['x_test']
y_test = test_df['y_test']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_tr_np = X_train
y_tr_np = y_train
X_te_np = X_test
y_te_np = y_test

clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr_np, y_tr_np)

y_pred = clf.predict(X_te_np)
print(classification_report(y_te_np, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.99      0.87      1338
           1       0.84      1.00      0.91       847
           2       0.12      0.03      0.04       339
           3       0.02      0.01      0.01       634
           4       0.56      0.21      0.30      1035
           5       0.34      0.70      0.45       592
           6       0.00      0.00      0.00       741
           7       0.00      0.00      0.00       421
           8       0.46      0.87      0.60      1233

    accuracy                           0.54      7180
   macro avg       0.35      0.42      0.36      7180
weighted avg       0.44      0.54      0.46      7180



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)
    
    def forward(self, x):
        return self.linear(x)
    
input_dim = X_train_tensor.shape[1] 
num_classes = len(torch.unique(y_train_tensor))
model = LogisticRegressionModel(input_dim, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

epochs = 500
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    # print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    preds = outputs.argmax(dim=1)

y_true = y_test_tensor.cpu().numpy()
y_pred = preds.cpu().numpy()
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.99      0.87      1338
           1       0.67      1.00      0.80       847
           2       0.04      0.02      0.03       339
           3       0.14      0.07      0.10       634
           4       0.76      0.35      0.48      1035
           5       0.22      0.39      0.28       592
           6       0.47      0.01      0.02       741
           7       0.00      0.00      0.00       421
           8       0.45      0.77      0.57      1233

    accuracy                           0.53      7180
   macro avg       0.39      0.40      0.35      7180
weighted avg       0.49      0.53      0.46      7180



#### Because this is the classification task, CRF does not show much effects because in general, the features are likely to be independent from each other, which is not efficient for CRF.

In [32]:
import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

SEQ_LEN = 20

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',                
    max_iterations=500,              
    all_possible_transitions=True   
)

def extract_features(vec):
    """Convert feature vector to dict for CRF."""
    return {f'f{i}': vec[i] for i in range(len(vec))}

def create_sequences(X, y, seq_len=SEQ_LEN):
    X_seq = []
    y_seq = []
    for i in range(len(X)):
        features = {f'f{j}': str(X[i][j]) for j in range(X.shape[1])}
        X_seq.append([features]) 
        y_seq.append([str(y[i])]) 
    return X_seq, y_seq

X_train = np.array(X_train.tolist())
y_train = np.array(y_train.tolist())
X_test = np.array(X_test.tolist())
y_test = np.array(y_test.tolist())

X_train_seq, y_train_seq = create_sequences(X_train, y_train)
X_test_seq, y_test_seq = create_sequences(X_test, y_test)

crf.fit(X_train_seq, y_train_seq)
y_pred_seq = crf.predict(X_test_seq)

y_true_flat = [label for seq in y_test_seq for label in seq]
y_pred_flat = [label for seq in y_pred_seq for label in seq]

print(metrics.flat_classification_report(y_true_flat, y_pred_flat, digits=4))


              precision    recall  f1-score   support

           0     0.1849    0.9522    0.3097      1338
           1     0.2105    0.0047    0.0092       847
           2     0.0500    0.0059    0.0106       339
           3     0.0612    0.0047    0.0088       634
           4     0.3103    0.0087    0.0169      1035
           5     0.1154    0.0101    0.0186       592
           6     0.0357    0.0013    0.0026       741
           7     0.1379    0.0095    0.0178       421
           8     0.1364    0.0049    0.0094      1233

    accuracy                         0.1823      7180
   macro avg     0.1380    0.1113    0.0448      7180
weighted avg     0.1565    0.1823    0.0670      7180

