# ElasticNet

## Imports / Settings

In [1]:
%pip install dotenv

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments
import os
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

2025-05-18 19:34:47.292611: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-18 19:34:47.292704: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-18 19:34:47.294335: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-18 19:34:47.306628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# A working directory under /notebooks is treated as the Paperspace environment.
# There the .env file is read from the working directory itself; otherwise it is
# read from the parent directory.
cwd = os.getcwd()
IS_PAPERSPACE = cwd.startswith('/notebooks')
if IS_PAPERSPACE:
    dir_env = os.path.join(cwd, '.env')
else:
    dir_env = os.path.join(cwd, '..', '.env')

# Assign the return value so the cell does not echo it.
_ = load_dotenv(dotenv_path=dir_env)

In [4]:
# The API key comes from the environment (loaded from .env); os.getenv yields
# None when WANDB_KEY is not set.
wandb_api_key = os.getenv('WANDB_KEY')
wandb.login(key=wandb_api_key)

# NOTE(review): private wandb attribute — presumably turns off wandb's rich
# notebook rendering; confirm it is still honoured by the installed version.
wandb._disable_jupyter = True

[34m[1mwandb[0m: Currently logged in as: [33mdario-wigger[0m ([33mnlp-lantsch-schmassmann-wigger[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Data

In [5]:
# The CSV files are addressed relative to the working directory, which differs
# between Paperspace and the other environment.
if IS_PAPERSPACE:
    train_csv, test_csv = "data/train.csv", "data/test.csv"
else:
    train_csv, test_csv = "../data/train.csv", "../data/test.csv"

df_traindata = pd.read_csv(train_csv)
df_testdata = pd.read_csv(test_csv)

# Keep the held-out test set as plain Python lists (raw text and labels).
test_texts = df_testdata['text'].tolist()
test_labels = df_testdata['sentiment'].tolist()

## Vectorization

In [6]:
# TF-IDF settings: cap the vocabulary at 10,000 terms and drop English stop words.
tfidf_settings = {'max_features': 10000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary/IDF weights on the training texts and vectorize them in one pass.
X = vectorizer.fit_transform(df_traindata['text'])

In [7]:
# Hold out 20% of the training data for validation with a fixed seed.
# Despite their names, train_texts / validation_texts hold rows of the
# TF-IDF matrix X, not raw strings.
train_texts, validation_texts, train_labels, validation_labels = train_test_split(
    X,
    df_traindata['sentiment'],
    test_size=0.2,
    random_state=42,
)

## Training

In [14]:
# Baseline elastic-net logistic regression with an even L1/L2 mix.
# The 'saga' solver shuffles the data while optimising, so the seed is pinned
# to make the fitted model (and the metrics reported below) reproducible.
model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(train_texts, train_labels)

In [8]:
def evaluate_model(model, X, y):
    """Score a fitted classifier on ``(X, y)`` and log the metrics to Weights & Biases.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Features in the same representation the model was trained on.
        y: True labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(precision, recall, f1)``, each a support-weighted average
        over the classes.
    """
    y_pred = model.predict(X)
    precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average='weighted')
    report = classification_report(y, y_pred, output_dict=True)

    # Use the run as a context manager so it is always closed, even when
    # logging raises; a bare init()/finish() pair would leave the run open.
    with wandb.init(
        project="nlp-lantsch-schmassmann-wigger",
        entity="nlp-lantsch-schmassmann-wigger",
    ) as run:
        run.log({
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "classification_report": report,
        })

    return precision, recall, f1

In [None]:
# The model was fit on TF-IDF features, so the raw test texts must pass through
# the same fitted vectorizer before prediction.
evaluate_model(model, vectorizer.transform(test_texts), test_labels)

0,1
f1,▁
precision,▁
recall,▁

0,1
classification_report,precis...
f1,0.76878
precision,0.7705
recall,0.77144


## Hyperparameter Tuning

In [11]:
# Candidate regularisation strengths and L1/L2 mixes to search over.
param_grid = {
    'C': [0.5, 1.0],
    'l1_ratio': [0.4, 0.5, 0.6],
}

# 3-fold cross-validated search on the training split, using all CPU cores.
# The seed is pinned because 'saga' shuffles the data; without it the selected
# parameters can vary from run to run.
# NOTE: candidates are ranked by macro-averaged precision, whereas
# evaluate_model reports support-weighted averages.
grid = GridSearchCV(
    estimator=LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=1000,
        random_state=42,
    ),
    param_grid=param_grid,
    cv=3,
    scoring='precision_macro',
    n_jobs=-1,
)
grid.fit(train_texts, train_labels)

In [12]:
# Report the parameter combination that scored best in the grid search.
best_params = grid.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 1.0, 'l1_ratio': 0.5}


In [18]:
# Take the estimator chosen by the grid search and score it on the held-out
# test set, vectorized with the already-fitted TF-IDF vectorizer.
best_model = grid.best_estimator_
test_features = vectorizer.transform(test_texts)
evaluate_model(best_model, test_features, test_labels)

0,1
f1,▁
precision,▁
recall,▁

0,1
f1,0.76585
precision,0.76785
recall,0.76866


(0.7678524739128291, 0.7686615258116847, 0.7658472997574297)