# Task 1: Baseline Model

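In this notebook we develop and evaluate a baseline model for classifying sentences of the PubMed RCT dataset. First, we test different models and parameters of the tf-idf embedding. Then, for our best-performing model, we test whether balancing the classes, using different preprocessing options (lemmatization, stemming, or none), and adding the relative position of a sentence in its abstract improve the results.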

In [None]:
import pandas as pd
import numpy as np
import project2Lib 
from pathlib import Path

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

#for model-building
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras import activations
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf

from scipy.sparse import hstack, csr_matrix

In [None]:
def print_nicely(n, df):
    """Print the leading rows of ``df``, one list of cell values per row.

    Every row is preceded by a dashed separator line. Rows at positions
    ``0`` through ``n`` (inclusive) are printed, i.e. at most ``n + 1`` rows.

    Args:
        n: Zero-based position of the last row to print.
        df: DataFrame whose rows are printed.
    """
    # Count rows by position instead of comparing the index label with n, so
    # the cut-off also works for frames whose index is not 0..len-1 (string
    # labels, an index read from a CSV column, a filtered frame, ...).
    for position, (_, row) in enumerate(df.iterrows()):
        print("----------------------")
        # .iloc is explicit positional access; row[i] would be a label lookup.
        print([row.iloc[i] for i in range(len(df.columns))])

        if position >= n:
            break
def read_preprocessed_data(option):
    """Load the train, dev and test CSV files of one preprocessing variant.

    For each split the file ``./PreprocessedData/{split}_{option}.csv`` is
    read with its first column as the index, and missing values are replaced
    by the empty string.

    Args:
        option: Suffix identifying the preprocessing variant in the file name.

    Returns:
        Tuple ``(train_data, dev_data, test_data)`` of DataFrames.
    """
    frames = [
        pd.read_csv(f"./PreprocessedData/{split}_{option}.csv", index_col=0).fillna('')
        for split in ("train", "dev", "test")
    ]
    train_data, dev_data, test_data = frames
    return train_data, dev_data, test_data

## Evaluate different models

First, we test different (simple) models directly on the tf-idf features without any further processing of the vectors. We optimize some parameters regarding preprocessing. More specifically, we vary the tf-idf options (maximal number of features and ngram_range), balancing the classes vs. not balancing them, and using the relative line number of a sentence or not.

In [None]:
# Search space of the grid search: preprocessing variant (file-name suffix),
# tf-idf vocabulary size, n-gram range and classifier.
parameters = {
    "preprocessed_options": ["lemmatization_noph.csv", "lemmatization.csv"],
    "max_features": [5000, 15000, 30000, 50000, None],
    "ngram_range": [(1, 1), (1, 2)],
    "classifier": [
        MultinomialNB(),
        LogisticRegression(random_state=321, max_iter=500),
        RandomForestClassifier(max_depth=40, random_state=123),
    ],
}

# Weighted dev-set F1 of every configuration, keyed by a descriptive string.
results = {}

for preprocessed_option in parameters["preprocessed_options"]:
    # Load the three splits of this variant; missing values become ''.
    datasets = ["train", "dev", "test"]
    filepaths = [f"./PreprocessedData/{split}_{preprocessed_option}" for split in datasets]
    train_data, dev_data, test_data = [
        pd.read_csv(path, index_col=0).fillna('') for path in filepaths
    ]

    # NOTE(review): the text is taken from the "sentence" column here, while
    # the later evaluation cells read "preprocess" — confirm which column this
    # comparison of preprocessing variants is meant to use.
    X_train, Y_train = train_data["sentence"].values, train_data["label"].to_numpy()
    X_dev, Y_dev = dev_data["sentence"].values, dev_data["label"].to_numpy()
    X_test, Y_test = test_data["sentence"].values, test_data["label"].to_numpy()

    for max_feature in parameters["max_features"]:
        for ngram in parameters["ngram_range"]:
            # The vectorizer depends only on (max_feature, ngram), so it is
            # fitted once and shared by all classifiers of this setting.
            tfidf_vectorizer = TfidfVectorizer(
                use_idf=True, max_features=max_feature, ngram_range=ngram
            )
            X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
            X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

            for classifier in parameters["classifier"]:
                # fit() retrains the shared estimator instance from scratch
                clf = classifier
                y_hat_dev = clf.fit(X_train_vectors_tfidf, Y_train).predict(X_dev_vectors_tfidf)
                f1 = f1_score(Y_dev, y_hat_dev, average="weighted")

                key = f"{preprocessed_option}_max_feature{max_feature}_ngram{ngram}_{classifier}"
                results[key] = f1
                print(f"{key}____{f1}")

In [None]:
# Evaluate logistic regression on the lemmatized data without the number
# placeholder. The splits come from the shared read_preprocessed_data helper
# instead of a repeated copy of its CSV-reading code.
train_data, dev_data, test_data = read_preprocessed_data("lemmatization_noph")

# labels and preprocessed sentence text of every split
Y_train = train_data["label"].to_numpy()
Y_dev = dev_data["label"].to_numpy()
Y_test = test_data["label"].to_numpy()
X_train = train_data["preprocess"].values
X_dev = dev_data["preprocess"].values
X_test = test_data["preprocess"].values


# train tfidf vectorizer on the training split only, then reuse it for dev
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=50000, ngram_range=(1,2))
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

# fit the classifier and report the weighted F1 on the dev split
clf = LogisticRegression(random_state=321, max_iter = 500)
clf.fit(X_train_vectors_tfidf, Y_train)
y_hat_dev = clf.predict(X_dev_vectors_tfidf)
f1 = f1_score(Y_dev, y_hat_dev,average="weighted")
print(f"F1 score: {f1}")

# confusion matrix of the dev predictions (raw counts)
cm = confusion_matrix(Y_dev, y_hat_dev)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

Based on this grid search we conclude that **logistic regression** is the best-performing model among the tested models.

The difference between removing the placeholder @ for numbers and keeping it is very small, with the preprocessed data without the placeholder performing slightly better. Using a tf-idf with all features and 1-grams and 2-grams performs best and has a (weighted) **f1-score = 0.80288** when evaluating the trained model on the dev dataset. In the corresponding tf-idf embedding, there are 5289618 features. It is worth mentioning that using logistic regression and 5000 features obtains an **f1-score = 0.79512**. Because of the small performance gap, but much larger complexity gap in the data, we continue working with the max_features=5000 option.


-> F1 score keep ph: 0.7958474671704888

### Complex Model: XGBoost
Additionally, we test XGBoost on a smaller subset of tf-idf features with max_features = 300 (due to computational limits). However, it performs worse than the simple models (f1 = 0.6471) and model training takes significantly more time. Therefore we focus on logistic regression for the baseline model. The code for reproducing the results is in the following cell.

In [None]:
# XGBClassifier is already imported in the import cell at the top of the
# notebook, so it is not imported again here.
# X_train / X_dev / X_test and the labels are the ones left in scope by the
# preceding evaluation cell.

# small vocabulary (300 features) to keep XGBoost training tractable
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=300, ngram_range=(1,2))
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

model = XGBClassifier(max_depth=10,random_state=42, n_estimators = 100)
model.fit(X_train_vectors_tfidf, Y_train)

# weighted F1 on the dev split
y_hat_dev = model.predict(X_dev_vectors_tfidf)
f1 = f1_score(Y_dev, y_hat_dev,average="weighted")
print(f"F1 score: {f1}")

### Solving class imbalance

The training data is imbalanced. The largest class "RESULTS" has 786,527 observations while the smallest class "OBJECTIVE" has only 191,408 observations. Therefore we try over- and undersampling to balance the data and compare the performance with that on the original data.

In [None]:
# Load the lemmatized splits (number placeholder removed).
train_data, dev_data, test_data = read_preprocessed_data("lemmatization_noph")

# Resampling specification per label, handed to project2Lib.balance_data.
# NOTE(review): each value looks like (target row count, oversample flag) —
# confirm against project2Lib.balance_data.
class_mapping = dict([
    ("0", (350000, True)),
    ("1", (350000, True)),
    ("2", (550000, False)),
    ("3", (550000, False)),
    ("4", (339714, False)),
])
train_data_balanced = project2Lib.balance_data(train_data, class_mapping)

# Only the training split is resampled; dev and test are used as loaded.
X_train, Y_train = train_data_balanced["preprocess"].values, train_data_balanced["label"].to_numpy()
X_dev, Y_dev = dev_data["preprocess"].values, dev_data["label"].to_numpy()
X_test, Y_test = test_data["preprocess"].values, test_data["label"].to_numpy()

In [None]:
# Learn the tf-idf vocabulary (uni- and bigrams, 50000 most frequent terms)
# on the training text and apply the same mapping to the dev text.
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

In [None]:
# Fit logistic regression on the tf-idf features and predict the dev split.
clf = LogisticRegression(random_state=321, max_iter=500).fit(X_train_vectors_tfidf, Y_train)
y_hat_dev = clf.predict(X_dev_vectors_tfidf)

# weighted F1 on the dev split
f1 = f1_score(Y_dev, y_hat_dev, average="weighted")
print(f"F1 score: {f1}")

# confusion matrix of the dev predictions (raw counts)
cm = confusion_matrix(Y_dev, y_hat_dev)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

The f1 score = 0.7935124908036201 is slightly worse than without balancing the data. While balancing gives the minority classes more true positives (from 1400 to 1564 / from 1413 to 1559), the model is slightly worse for the majority classes, since they have fewer true positives (from 8573 to 8440 / from 8567 to 8291 / from 3135 to 3067). We conclude that whether balancing the data is useful depends on the use case and on the cost of misclassification for each class. Since the performance metric for this project is the f1 score, we will proceed without balancing the data because this leads to slightly better results.

### Different preprocessing options
We further compare the performance of our model (logistic regression) on the dataset preprocessed with lemmatization against the dataset preprocessed with stemming and the dataset with neither lemmatization nor stemming.

In [None]:
# Load the stemmed splits (number placeholder removed).
train_data, dev_data, test_data = read_preprocessed_data("stemming_noph")

# preprocessed sentence text and labels of every split
X_train, Y_train = train_data["preprocess"].values, train_data["label"].to_numpy()
X_dev, Y_dev = dev_data["preprocess"].values, dev_data["label"].to_numpy()
X_test, Y_test = test_data["preprocess"].values, test_data["label"].to_numpy()

# tf-idf vocabulary is learned on the training text only
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

# fit logistic regression and report the weighted F1 on the dev split
clf = LogisticRegression(random_state=321, max_iter=500).fit(X_train_vectors_tfidf, Y_train)
y_hat_dev = clf.predict(X_dev_vectors_tfidf)
f1 = f1_score(Y_dev, y_hat_dev, average="weighted")
print(f"F1 score: {f1}")

# confusion matrix of the dev predictions (raw counts)
cm = confusion_matrix(Y_dev, y_hat_dev)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

In [None]:
# Load the splits preprocessed with neither stemming nor lemmatization.
# NOTE(review): the option "_noph" makes the loader read
# "./PreprocessedData/{split}__noph.csv" (double underscore) — confirm that
# this matches the file names on disk.
train_data, dev_data, test_data = read_preprocessed_data("_noph")

# preprocessed sentence text and labels of every split
X_train, Y_train = train_data["preprocess"].values, train_data["label"].to_numpy()
X_dev, Y_dev = dev_data["preprocess"].values, dev_data["label"].to_numpy()
X_test, Y_test = test_data["preprocess"].values, test_data["label"].to_numpy()

# tf-idf vocabulary is learned on the training text only
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

# fit logistic regression and report the weighted F1 on the dev split
clf = LogisticRegression(random_state=321, max_iter=500).fit(X_train_vectors_tfidf, Y_train)
y_hat_dev = clf.predict(X_dev_vectors_tfidf)
f1 = f1_score(Y_dev, y_hat_dev, average="weighted")
print(f"F1 score: {f1}")

# confusion matrix of the dev predictions (raw counts)
cm = confusion_matrix(Y_dev, y_hat_dev)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

In [None]:
# Baseline without any preprocessing: same pipeline, but on the "sentence"
# column instead of "preprocess".
# This cell reuses train_data / dev_data / test_data as left in scope by the
# previously executed loading cell.
X_train, Y_train = train_data["sentence"].values, train_data["label"].to_numpy()
X_dev, Y_dev = dev_data["sentence"].values, dev_data["label"].to_numpy()
X_test, Y_test = test_data["sentence"].values, test_data["label"].to_numpy()

# tf-idf vocabulary is learned on the training text only
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf = tfidf_vectorizer.transform(X_dev)

# fit logistic regression and report the weighted F1 on the dev split
clf = LogisticRegression(random_state=321, max_iter=500).fit(X_train_vectors_tfidf, Y_train)
y_hat_dev = clf.predict(X_dev_vectors_tfidf)
f1 = f1_score(Y_dev, y_hat_dev, average="weighted")
print(f"F1 score: {f1}")

# confusion matrix of the dev predictions (raw counts)
cm = confusion_matrix(Y_dev, y_hat_dev)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

Stemming: 0.791065835354926

No stemming / no lemmatization: 0.8087533127068136

No preprocessing at all: F1 score: 0.8452712703418799


-> Surprising, but because of this we train the final models on data with no preprocessing.

## Train the final model

Note: Since we want to use the baseline model in knowledge distillation, we use Keras from now on, because its API is more suitable for that.

In [None]:
# Load the data for the final model through the shared loader instead of a
# repeated copy of its CSV-reading code.
train_data, dev_data, test_data = read_preprocessed_data("lemmatization_noph")

# The final model is trained on the "sentence" column, i.e. the text without
# preprocessing; the "preprocess" column of these files is not used here.
Y_train = train_data["label"].to_numpy()
Y_dev = dev_data["label"].to_numpy()
Y_test = test_data["label"].to_numpy()
X_train = train_data["sentence"].values
X_dev = dev_data["sentence"].values
X_test = test_data["sentence"].values

In [None]:
# Show the training frame as the cell output to inspect its columns.
train_data

In [None]:
# Learn the tf-idf vocabulary on the training text only, then map the dev and
# test text with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    max_features=50000,
    ngram_range=(1, 2),
)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_vectors_tfidf, X_test_vectors_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_dev, X_test)
)

In [None]:
# Keras raises an error for sparse input with unsorted indices, so the
# matrices are sorted in place before training.
for sparse_features in (X_train_vectors_tfidf, X_dev_vectors_tfidf, X_test_vectors_tfidf):
    sparse_features.sort_indices()

In [None]:
def get_logistic_regression(numb_classes=5, numb_features=50000):
    """Build and compile a logistic regression as a one-layer Keras model.

    The model is a single Dense layer with softmax activation, compiled with
    the Adam optimizer, sparse categorical cross-entropy loss and the 'acc'
    metric.

    Args:
        numb_classes: Number of output units (one per class).
        numb_features: Dimension of the input feature vector.

    Returns:
        The compiled Sequential model.
    """
    softmax_layer = Dense(
        numb_classes,
        activation=activations.softmax,
        input_dim=numb_features,
    )
    lr = Sequential()
    lr.add(softmax_layer)
    lr.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['acc'],
    )
    return lr

In [None]:
# Fit the Keras logistic regression. Only the weights of the epoch with the
# best dev accuracy are saved, and training stops once dev accuracy has not
# improved for 3 epochs.
lr = get_logistic_regression()
file_path = "./TrainedModels/tfidf_lr.h5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
early = EarlyStopping(monitor='val_acc', patience=3)
callbacks_list = [checkpoint, early]
lr.fit(
    X_train_vectors_tfidf,
    Y_train,
    epochs=40,
    validation_data=(X_dev_vectors_tfidf, Y_dev),
    batch_size=1024,
    verbose=2,
    callbacks=callbacks_list,
)

In [None]:
# Rebuild the model, restore the best checkpoint and score it on the test set.
lr2 = get_logistic_regression()
lr2.load_weights("./TrainedModels/tfidf_lr.h5")

# predict() yields one probability per class; the arg-max is the label
y_hat_test = np.argmax(lr2.predict(X_test_vectors_tfidf), axis=-1)

f1 = f1_score(Y_test, y_hat_test, average="weighted")
acc = accuracy_score(Y_test, y_hat_test)
print(f"F1 score: {f1}")
print(f"Acc score: {acc}")

# confusion matrix normalised over the true labels (each row sums to 1)
cm = confusion_matrix(Y_test, y_hat_test, normalize="true")
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

Log Regression: 

F1 score: 0.8462603682499152

Acc score: 0.8482012680975146

In [None]:
# analyze the weights

def print_min_and_max(weights, mapping, num_features):
    """Print and return the feature names with the largest and smallest weights.

    The ``num_features`` largest and the ``num_features`` smallest entries of
    ``weights`` are looked up in ``mapping``. Indices that have no name in
    ``mapping`` (for example an extra feature column appended after the
    tf-idf features) are skipped, so fewer than ``num_features`` names may be
    returned.

    Args:
        weights: 1-D numpy array holding one weight per feature.
        mapping: Sequence translating a feature index into its name.
        num_features: How many of the top and bottom weights to inspect.

    Returns:
        Tuple ``(max_words, min_words)`` of name lists, ordered from the most
        extreme weight inwards.
    """
    # Bound the lookup by the real vocabulary size instead of a fixed 50000,
    # so the function also works for vocabularies of any other size.
    vocab_size = len(mapping)
    max_vals = (-weights).argsort()[:num_features]
    min_vals = weights.argsort()[:num_features]

    print("-----------Max-Values------------")
    max_words = [mapping[i] for i in max_vals if i < vocab_size]
    print(max_words)

    print("-----------Min-Values------------")
    min_words = [mapping[i] for i in min_vals if i < vocab_size]
    print(min_words)
    return max_words, min_words
    
# First entry of get_weights() is taken as the weight matrix whose columns
# are inspected per class in the following cells.
weights = lr2.get_weights()[0]
# feature index -> n-gram lookup
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn in
# favour of get_feature_names_out() — confirm against the installed version.
mapping = tfidf_vectorizer.get_feature_names()

In [None]:
# Class 0: names of the n features with the largest and the smallest weights
# in column 0 of the weight matrix.
n = 50
max_0, min_0 = print_min_and_max(weights[:,0], mapping, n)

In [None]:
# Class 1: names of the features with the largest / smallest weights (column 1).
max_1, min_1 = print_min_and_max(weights[:,1], mapping, n)

In [None]:
# Class 2: names of the features with the largest / smallest weights (column 2).
max_2, min_2 = print_min_and_max(weights[:,2], mapping, n)

In [None]:
# Class 3: names of the features with the largest / smallest weights (column 3).
max_3, min_3 = print_min_and_max(weights[:,3], mapping, n)

In [None]:
# Class 4: names of the features with the largest / smallest weights (column 4).
max_4, min_4 = print_min_and_max(weights[:,4], mapping, n)

In [None]:
# Fraction of class 0's top-weighted (and bottom-weighted) features that also
# appear in the corresponding list of class 1.
overlap_max_01 = len(set(max_0).intersection(max_1))
overlap_min_01 = len(set(min_0).intersection(min_1))
print(f"Overlap between max weight features {overlap_max_01/ len(max_0)}")
print(f"Overlap between min weight features {overlap_min_01/ len(min_0)}")

In [None]:
# Fraction of class 0's top-weighted (and bottom-weighted) features that also
# appear in the corresponding list of class 3.
overlap_max_03 = len(set(max_0).intersection(max_3))
overlap_min_03 = len(set(min_0).intersection(min_3))
print(f"Overlap between max weight features {overlap_max_03/ len(max_0)}")
print(f"Overlap between min weight features {overlap_min_03/ len(min_0)}")

### Train on small dataset

In [None]:
# Load the three splits of the small dataset; missing values become ''.
datasets = ["train", "dev", "test"]
filepaths = [f"./PreprocessedData/{split}_lemmatization_small.csv" for split in datasets]
train_data_small, dev_data_small, test_data_small = [
    pd.read_csv(path, index_col=0).fillna('') for path in filepaths
]

# sentence text (the "sentence" column) and labels of every split
X_train_small, Y_train_small = train_data_small["sentence"].values, train_data_small["label"].to_numpy()
X_dev_small, Y_dev_small = dev_data_small["sentence"].values, dev_data_small["label"].to_numpy()
X_test_small, Y_test_small = test_data_small["sentence"].values, test_data_small["label"].to_numpy()

In [None]:
# tf-idf on the small dataset; no max_features cap, so the vocabulary size is
# whatever the training text yields.
tfidf_vectorizer_small = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
X_train_vectors_tfidf_small = tfidf_vectorizer_small.fit_transform(X_train_small)
X_dev_vectors_tfidf_small = tfidf_vectorizer_small.transform(X_dev_small)
X_test_vectors_tfidf_small =  tfidf_vectorizer_small.transform(X_test_small)

# sort indices of sparse matrix otherwise keras gives an error
X_train_vectors_tfidf_small.sort_indices()
X_dev_vectors_tfidf_small.sort_indices()
X_test_vectors_tfidf_small.sort_indices()

# Train logistic regression model.
# The evaluation cell loads this checkpoint, so the notebook must be able to
# create it on a fresh run. Training is skipped only when the checkpoint
# already exists, which keeps re-runs cheap.
file_path = "./TrainedModels/tfidf_small_lr.h5"
if not Path(file_path).exists():
    lr = get_logistic_regression(5, X_train_vectors_tfidf_small.shape[1])
    checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    early = EarlyStopping(monitor='val_acc', patience=3)
    callbacks_list = [checkpoint, early]
    lr.fit(X_train_vectors_tfidf_small, Y_train_small, epochs=40, validation_data=(X_dev_vectors_tfidf_small, Y_dev_small), batch_size=1024, verbose=2, callbacks=callbacks_list)

In [None]:
# Rebuild the model for the small vocabulary, restore the saved checkpoint and
# score it on the small test set.
lr2 = get_logistic_regression(5, X_train_vectors_tfidf_small.shape[1])
lr2.load_weights("./TrainedModels/tfidf_small_lr.h5")

# predict() yields one probability per class; the arg-max is the label
y_hat_test_small = np.argmax(lr2.predict(X_test_vectors_tfidf_small), axis=-1)

f1 = f1_score(Y_test_small, y_hat_test_small, average="weighted")
acc = accuracy_score(Y_test_small, y_hat_test_small)
print(f"F1 score: {f1}")
print(f"Acc score: {acc}")

# confusion matrix normalised over the true labels (each row sums to 1)
cm = confusion_matrix(Y_test_small, y_hat_test_small, normalize="true")
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

## Further adjustments

So far we only used the individual sentences for prediction. Since multiple sentences belong to one abstract, we additionally use the relative position of a sentence in the abstract.

In [None]:
# Show the training frame as the cell output (the "line_relative" column is used below).
train_data

In [None]:
# Append "line_relative" as one extra sparse column to the right of the
# tf-idf features of every split.
# NOTE: the matrices are overwritten with their widened versions, so running
# this cell a second time appends another column.
position_train = csr_matrix(train_data["line_relative"].values).reshape((len(train_data), 1))
X_train_vectors_tfidf = csr_matrix(hstack([X_train_vectors_tfidf, position_train]))

position_dev = csr_matrix(dev_data["line_relative"].values).reshape((len(dev_data), 1))
X_dev_vectors_tfidf = csr_matrix(hstack([X_dev_vectors_tfidf, position_dev]))

position_test = csr_matrix(test_data["line_relative"].values).reshape((len(test_data), 1))
X_test_vectors_tfidf = csr_matrix(hstack([X_test_vectors_tfidf, position_test]))

In [None]:
# Sort the indices of the widened sparse matrices in place before they are
# passed to Keras.
for sparse_features in (X_train_vectors_tfidf, X_dev_vectors_tfidf, X_test_vectors_tfidf):
    sparse_features.sort_indices()

In [None]:
# Train the logistic regression on tf-idf features plus the line-position
# column. The input width is read from the feature matrix rather than
# hard-coded, so the model always matches the data it is fitted on.
lr = get_logistic_regression(5, X_train_vectors_tfidf.shape[1])
file_path = "./TrainedModels/tfidf_lr_linenumber.h5"
# keep only the weights of the epoch with the best dev accuracy
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# stop once dev accuracy has not improved for 3 epochs
early = EarlyStopping(monitor='val_acc', patience=3)
callbacks_list = [checkpoint, early]
lr.fit(X_train_vectors_tfidf, Y_train, epochs=40, validation_data=(X_dev_vectors_tfidf, Y_dev), batch_size=1024, verbose=2, callbacks=callbacks_list)

In [None]:
# Load weights and make final prediction on test set.
# The input width is read from the feature matrix rather than hard-coded, so
# the rebuilt model matches the data it is evaluated on.
lr2 = get_logistic_regression(5, X_test_vectors_tfidf.shape[1])
lr2.load_weights("./TrainedModels/tfidf_lr_linenumber.h5")

# predict() yields one probability per class; the arg-max is the label
y_hat_test = lr2.predict(X_test_vectors_tfidf)
y_hat_test = np.argmax(y_hat_test, axis=-1)
f1 = f1_score(Y_test, y_hat_test,average="weighted")
acc = accuracy_score(Y_test, y_hat_test)
print(f"F1 score: {f1}")
print(f"Accuracy score: {acc}")


# confusion matrix normalised over the true labels (each row sums to 1)
cm = confusion_matrix(Y_test, y_hat_test, normalize="true")
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

F1 score: 0.8930709607539945

Accuracy score: 0.8937374970331943

In [None]:
# First entry of get_weights() is taken as the weight matrix of the
# line-number model. It has one more row than `mapping` has names, because of
# the appended line-position column.
weights = lr2.get_weights()[0]
# feature index -> n-gram lookup
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn in
# favour of get_feature_names_out() — confirm against the installed version.
mapping = tfidf_vectorizer.get_feature_names()

In [None]:
# Class 0: names of the n features with the largest and the smallest weights
# in column 0 of the weight matrix.
n = 15
max_0, min_0 = print_min_and_max(weights[:,0], mapping, n)

In [None]:
# Class 1: names of the features with the largest / smallest weights (column 1).
max_1, min_1 = print_min_and_max(weights[:,1], mapping, n)

In [None]:
# Class 2: names of the features with the largest / smallest weights (column 2).
max_2, min_2 = print_min_and_max(weights[:,2], mapping, n)

In [None]:
# Class 3: names of the features with the largest / smallest weights (column 3).
max_3, min_3 = print_min_and_max(weights[:,3], mapping, n)

In [None]:
# Class 4: names of the features with the largest / smallest weights (column 4).
max_4, min_4 = print_min_and_max(weights[:,4], mapping, n)

### Train on small dataset

In [None]:
# Show the small training frame as the cell output (the "line_relative" column is used below).
train_data_small

In [None]:
# Append "line_relative" as one extra sparse column to the right of the
# tf-idf features of every split of the small dataset.
# NOTE: the matrices are overwritten with their widened versions, so running
# this cell a second time appends another column.
l = train_data_small.iloc[:,2].values.shape[0]
helper = csr_matrix(train_data_small["line_relative"].values).reshape((l,1))
X_train_vectors_tfidf_small = csr_matrix(hstack([X_train_vectors_tfidf_small, helper]))
l = dev_data_small.iloc[:,2].values.shape[0]
helper = csr_matrix(dev_data_small["line_relative"].values).reshape((l,1))
X_dev_vectors_tfidf_small = csr_matrix(hstack([X_dev_vectors_tfidf_small, helper]))
l = test_data_small.iloc[:,2].values.shape[0]
# The test split uses the same "line_relative" column as train and dev; the
# previous name "line_relativeline" was a typo.
helper = csr_matrix(test_data_small["line_relative"].values).reshape((l,1))
X_test_vectors_tfidf_small = csr_matrix(hstack([X_test_vectors_tfidf_small, helper]))

# sort the indices in place before the matrices are passed to Keras
X_train_vectors_tfidf_small.sort_indices()
X_dev_vectors_tfidf_small.sort_indices()
X_test_vectors_tfidf_small.sort_indices()

In [None]:
# Train the logistic regression on the small dataset with the line-position
# column. Only the weights of the epoch with the best dev accuracy are saved,
# and training stops once dev accuracy has not improved for 3 epochs.
lr = get_logistic_regression(5, X_train_vectors_tfidf_small.shape[1])
file_path = "./TrainedModels/tfidf_small_lr_linenumber.h5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
early = EarlyStopping(monitor='val_acc', patience=3)
callbacks_list = [checkpoint, early]
lr.fit(
    X_train_vectors_tfidf_small,
    Y_train_small,
    epochs=50,
    validation_data=(X_dev_vectors_tfidf_small, Y_dev_small),
    batch_size=1024,
    verbose=2,
    callbacks=callbacks_list,
)

In [None]:
# Rebuild the model for the widened small feature matrix, restore the best
# checkpoint and score it on the small test set.
lr2 = get_logistic_regression(5, X_train_vectors_tfidf_small.shape[1])
lr2.load_weights("./TrainedModels/tfidf_small_lr_linenumber.h5")

# predict() yields one probability per class; the arg-max is the label
y_hat_test_small = np.argmax(lr2.predict(X_test_vectors_tfidf_small), axis=-1)

f1 = f1_score(Y_test_small, y_hat_test_small, average="weighted")
acc = accuracy_score(Y_test_small, y_hat_test_small)
print(f"F1 score: {f1}")
print(f"Accuracy score: {acc}")

# confusion matrix normalised over the true labels (each row sums to 1)
cm = confusion_matrix(Y_test_small, y_hat_test_small, normalize="true")
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()

F1 score: 0.8591555015009821

Accuracy score: 0.8613572258171561