In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data.dataloader import DataLoader
import tiktoken

from src.data_loader import load_data
from src.utils import load_config
from src import wikipedia_article_dataset
from src.models import CNN, CNNModel, MultilabelCNNModel
from src.evaluation import evaluate_model

## Binary Classification

In [None]:
# Hyperparameters shared by the cells below.
MAX_LENGTH = 400  # last positional argument of MultilabelCNNModel below; presumably a maximum sequence length — TODO confirm
BATCH_SIZE = 16  # batch size handed to model.fit in the binary section

# Forwarded positionally to MultilabelCNNModel in the multilabel section.
EMBEDDING_DIM = 128
NUM_FILTERS = 10
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.5  # NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed

In [1]:
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
from src.utils import load_config
from main import run_preprocessing_pipeline
from sklearn.model_selection import train_test_split

# Resolves to configs/just-load.yaml (see the log output of this cell).
config = load_config("just-load")

# Presumably X holds the per-article features and y a table of targets — TODO confirm against main.py.
X, y = run_preprocessing_pipeline(config)

# Keep only the "label" column, as a plain array of targets.
y = y["label"].values

# Hold out 20% of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
2025-03-06 21:32:09,486 - src.utils - INFO - Loading config from configs/just-load.yaml.
2025-03-06 21:32:09,500 - src.data_loader - INFO - Loading data with {'good_file': '/Users/robinsuxdorf/Documents/Uni/FernUni Hagen/4. Semester/Projektpraktikum/github/good.csv', 'promo_file': '/Users/robinsuxdorf/Documents/Uni/FernUni Hagen/4. Semester/Projektpraktikum/github/promotional.csv', 'shuffle': False, 'nrows': 250, 'save': 'loaded_data_binary.csv'}
2025-03-06 21:32:09,512 - src.data_loader - INFO - Loading non-promotional and promotional data for binary classification.
2025-03-06 21:32:09,752 - src.utils - INFO - Data saved to data/intermediary/loaded_data_binary.csv.
2025-03-06 21:32:09,755 - src.preprocessing - INFO - Preprocessing data with {'remove_non_word': True, 'convert_lowercase': Tru

In [5]:
from src.wikipedia_article_dataset import WikipediaArticleDataset
from torch.utils.data.dataloader import DataLoader

# Wrap both splits in the project's Dataset class so DataLoader can batch them.
train_dataset = WikipediaArticleDataset(X_train, y_train)
test_dataset = WikipediaArticleDataset(X_test, y_test)

# Use the notebook-wide BATCH_SIZE constant instead of repeating the literal 16,
# and pass it by keyword so the argument's meaning is visible at the call site.
# NOTE(review): batches are served in a fixed order (no shuffle=True) — confirm this is intended for training.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Two-layer fully connected classifier.

    Despite its name, the network contains no convolutional layers: the input
    is projected to ``num_filters`` hidden units, passed through ReLU and
    dropout, and projected again to ``num_classes`` raw scores.

    Defining this class rebinds the name ``CNN`` that the notebook's first
    cell imports from ``src.models``.

    Args:
        input_dim: Size of each input feature vector.
        num_filters: Number of hidden units.
        num_classes: Number of output scores per sample.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(
        self,
        input_dim: int,
        num_filters: int,
        num_classes: int,
        dropout: float = 0.5
    ) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=num_filters)
        self.fc2 = nn.Linear(in_features=num_filters, out_features=num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the unnormalised class scores for ``x``."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

In [None]:
import torch.optim as optim

# NOTE(review): input_dim=10000 has to match the width of the feature vectors
# served by the dataloaders — confirm against the preprocessing config.
# The training and prediction cells move their inputs to `device`, so the
# weights must live on the same device; without `.to(device)` a CUDA run fails
# with a device-mismatch error.
model = CNN(
    input_dim=10000,
    num_filters=512,
    num_classes=2
).to(device)

# CrossEntropyLoss consumes the raw scores returned by the model together with integer class labels.
criterion = nn.CrossEntropyLoss()
# Created after the move so the optimizer tracks the parameters that live on `device`.
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
def _train_one_epoch(
    train_dataloader: DataLoader, optimizer: optim.Optimizer
) -> float:
    """Run one optimisation pass over ``train_dataloader``.

    Relies on the notebook-level ``model``, ``criterion`` and ``device``.

    Args:
        train_dataloader: Yields ``(inputs, labels)`` batches.
        optimizer: Optimizer that is stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs = batch_inputs.to(device)
        # Labels are cast to int64 before they reach the criterion.
        batch_labels = batch_labels.to(device).long()

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_dataloader)

In [None]:
num_epochs = 5

# Train for a fixed number of passes and report the mean batch loss after each one.
for epoch in range(num_epochs):
    avg_loss = _train_one_epoch(train_dataloader, optimizer)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [None]:
def binary_predict_fn(logits: torch.Tensor) -> torch.Tensor:
    """Return, for every row of ``logits``, the index of its largest entry along dim 1."""
    return logits.argmax(dim=1)

def predict(features) -> list:
    """Predict a class index for every item in ``features``.

    Relies on the notebook-level ``model`` and ``device``. All items are
    stacked into a single batch, and ``model`` is left in evaluation mode.

    Args:
        features: Iterable whose items expose ``.toarray()``
            (presumably rows of a sparse feature matrix — TODO confirm).

    Returns:
        A Python list with one predicted class index per item.
    """
    rows = []
    for article in features:
        dense = article.toarray().squeeze()
        rows.append(torch.tensor(dense, dtype=torch.float, device=device))
    batch = torch.stack(rows)

    model.eval()
    with torch.no_grad():
        predicted = binary_predict_fn(model(batch))

    return predicted.cpu().tolist()

In [None]:
from sklearn.metrics import (
    accuracy_score
)

# Predict on the held-out split and compare against its true labels.
y_pred = predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

# Bare expression: displayed as the cell's output.
accuracy

In [None]:
# NOTE(review): when the notebook runs top to bottom, `model` is the local `CNN`
# (an nn.Module defining only __init__ and forward), which has no `fit` method.
# This cell looks like a leftover from an earlier model wrapper — confirm, then remove or restore it.
model.fit(X_train, y_train, 0.001, 3, BATCH_SIZE)

In [None]:
# NOTE(review): when the notebook runs top to bottom, `model` is the local `CNN`
# nn.Module, which defines no `predict` method (the notebook-level `predict`
# function above is what is used for it) — confirm whether this cell is stale.
predictions = model.predict(X_test)

In [None]:
# NOTE(review): evaluate_model comes from src.evaluation; presumably it expects a
# src.models wrapper rather than the local nn.Module bound to `model` here — confirm.
evaluate_model(model, X_test, y_test)

## Multilabel Classification

In [None]:
# Reload the same "just-load" config and read the data in multilabel mode.
config = load_config("just-load")
df = load_data(config["data_loader"], "multilabel")
# Bare expression: displays the loaded frame as the cell's output.
df

In [None]:
# Rebinds `model`: from here on it refers to the multilabel model, not the binary one.
model = MultilabelCNNModel(
    EMBEDDING_DIM,
    NUM_FILTERS,
    FILTER_SIZES,
    MAX_LENGTH
)

# Raw article texts, one list entry per row of df.
texts = df["text"].tolist()

# One target column per label; each row of `labels` holds the values of these columns in this order.
columns = ["advert", "coi", "fanpov", "pr", "resume"]
labels = df[columns].values.tolist()

In [None]:
# Reuse the notebook-wide BATCH_SIZE constant (as the binary fit call does)
# instead of a second hard-coded 16.
model.fit(texts, labels, learning_rate=0.01, num_epochs=3, batch_size=BATCH_SIZE)

In [None]:
# NOTE(review): these are the same `texts` that were passed to model.fit, so the
# reports computed from `predictions` describe training-set performance, not held-out performance.
predictions = model.predict(texts)

In [None]:
# Print one classification report per label column, slicing position i out of
# every prediction row and every ground-truth row.
for i, col in enumerate(columns):
    pred_col = [predicted_row[i] for predicted_row in predictions]
    label_col = [true_row[i] for true_row in labels]

    print(f"Classification Report for {col}:")
    print(classification_report(label_col, pred_col))