In [1]:
from pathlib import Path

import joblib
import nltk
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# NLTK resources: fetch the 'punkt' package first, then register an extra
# lookup directory (statement order intentionally kept as-is).
nltk.download('punkt')
# Adjust this relative path to wherever your nltk_data directory lives.
nltk_data_dir = r"../nltk_data"
nltk.data.path.append(nltk_data_dir)

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the precomputed feature matrices (SciPy sparse .npz files named tfidf_*)
# and the matching label CSVs.
# change the paths to your datasets as needed
train_features = sparse.load_npz(r"../datasets/tfidf_train_features.npz")
test_features = sparse.load_npz(r"../datasets/tfidf_test_features.npz")
train_labels = pd.read_csv(r"../datasets/tfidf_train_labels.csv")
test_df = pd.read_csv(r"../datasets/tfidf_test_labels.csv")

# Fail fast with a readable message if the artifacts on disk are out of sync;
# otherwise the mismatch only surfaces later inside fit()/predict().
if train_features.shape[0] != len(train_labels):
    raise ValueError(
        f"Train features have {train_features.shape[0]} rows but the label "
        f"file has {len(train_labels)} rows; regenerate the datasets."
    )
if train_features.shape[1] != test_features.shape[1]:
    raise ValueError(
        f"Train features have {train_features.shape[1]} columns but test "
        f"features have {test_features.shape[1]}; they must share one vocabulary."
    )
print("Loaded train/test features and labels.")


Loaded train/test features and labels.


In [3]:
# Full training matrix and its target vector
X = train_features
y = train_labels['target'].values

# Hold out a stratified 20% of the rows for validation; the remaining 80%
# is used for fitting. The seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the logistic-regression classifier on the training portion
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)

In [4]:
# Score the held-out validation split
y_pred = lr.predict(X_val)

# Compute every metric up front, then report them together
val_accuracy = accuracy_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("F1 Score:", val_f1)
print("\nClassification Report:\n", val_report)
print("\nConfusion Matrix:\n", val_confusion)

Accuracy: 0.8135259356533159
F1 Score: 0.760539629005059

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       869
           1       0.85      0.69      0.76       654

    accuracy                           0.81      1523
   macro avg       0.82      0.80      0.80      1523
weighted avg       0.82      0.81      0.81      1523


Confusion Matrix:
 [[788  81]
 [203 451]]


In [5]:
# Save evaluations for comparison.
# scores.csv holds one row per model; re-running this cell overwrites this
# model's row instead of appending a duplicate.
model_name = "Logistic Regression"
scores_path = Path("../evaluation/scores.csv")
metric_columns = ["accuracy", "precision", "recall", "f1score"]

metrics = {
    "model": model_name,
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred, average="binary"),
    "recall": recall_score(y_val, y_pred, average="binary"),
    "f1score": f1_score(y_val, y_pred, average="binary")
}

if scores_path.exists():
    scores = pd.read_csv(scores_path)
    match = scores["model"] == metrics["model"]
    if match.any():
        # Update existing row
        scores.loc[match, metric_columns] = tuple(metrics[col] for col in metric_columns)
    else:
        # Insert new row
        scores = pd.concat([scores, pd.DataFrame([metrics])], ignore_index=True)
else:
    # First run: there is no scores file yet (and possibly no evaluation
    # directory), so start the table with this model's row.
    scores_path.parent.mkdir(parents=True, exist_ok=True)
    scores = pd.DataFrame([metrics])

scores.to_csv(scores_path, index=False)

print("Model scores saved to evaluation/scores.csv")

Model scores saved to evaluation/scores.csv


In [6]:
# Persist the fitted classifier with joblib so it can be reloaded elsewhere
model_path = "logistic_regression_model.pkl"
joblib.dump(lr, model_path)
print("Model saved as logistic_regression_model.pkl")

Model saved as logistic_regression_model.pkl


In [7]:
# Run the fitted model over the test features (for submission or inspection)
test_preds = lr.predict(test_features)

# Use the 'id' column from the test label file when it exists; otherwise
# fall back to positional ids 0..n-1.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = range(len(test_preds))

# Write the id/target pairs out as a CSV
submission = pd.DataFrame({'id': submission_ids, 'target': test_preds})
submission.to_csv("logistic_regression_submission.csv", index=False)
print("Test predictions saved to logistic_regression_submission.csv")

Test predictions saved to logistic_regression_submission.csv
