In [None]:
########## BERT ##########

!pip install transformers datasets evaluate accelerate -q #Installs everything we might need
from google.colab import drive
print("Starting...")
drive.mount('/content/drive')
print("Drive mounted successfully.")

In [None]:



import pandas as pd
import os

CSV_PATH = "/content/drive/MyDrive/CS Projects/Winter Project 2526/Data/raw/bbc_data.csv" #Sets the path to the CSV file

# Fail fast when the file is missing. Printing and carrying on would either
# leave `df` undefined for the cells below or let them silently reuse a stale
# `df` left in the kernel by an earlier run.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(
        f"ERROR: CSV file not found at: {CSV_PATH}\n"
        "Please check your Google Drive path and file name."
    )

df = pd.read_csv(CSV_PATH) #Loads the CSV file
df = df.rename(columns={'data': 'text', 'labels': 'label_name'}) #Renames the columns

# rename() silently ignores source columns that do not exist, so confirm that
# the columns the rest of the notebook relies on are really there.
missing_columns = {'text', 'label_name'} - set(df.columns)
if missing_columns:
    raise KeyError(f"CSV is missing expected column(s): {sorted(missing_columns)}")

print(f"Total documents loaded: {len(df)}")
print("\nFirst 5 rows (Check names!):")
print(df.head())
print("\nClass distribution:")
print(df['label_name'].value_counts())



In [None]:
!pip install transformers datasets

from transformers import AutoTokenizer

# Load the pretrained BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Smoke test: tokenize the first article, but only if the dataframe was
# actually loaded and contains rows.
if 'df' not in globals() or df.empty:
    print("DataFrame 'df' not found or is empty. Cannot test tokenizer.")
else:
    sample_text = df['text'].iloc[0]
    tokenized_output = tokenizer(sample_text, padding=True, truncation=True)

    print("\n--- Tokenizer Check Successful ---")
    print("Sample Text:", sample_text)
    # Only the first ten ids are shown to keep the output short.
    print("Input IDs:", tokenized_output['input_ids'][:10])

In [None]:
# Distinct category names found in the label column.
unique_labels = df['label_name'].unique().tolist()
print(f"Original Unique Labels: {unique_labels}")

# Integer id -> category name, and the inverse name -> id table that is used
# to build the numeric training target.
id_to_label = dict(enumerate(unique_labels))
label_map = {name: idx for idx, name in id_to_label.items()}

print(f"Created Label-to-ID Mapping: {label_map}")

# Add the numeric target column next to the original string label.
df['label'] = df['label_name'].map(label_map)

# Sanity check: show a few name/id pairs and the count per id.
print("\nVerification of New Labels:")
print(df[['label_name', 'label']].head())
print(df['label'].value_counts())

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Wrap the dataframe in a Hugging Face Dataset without carrying over the pandas index.
hf_dataset = Dataset.from_pandas(df, preserve_index=False)

# 80/10/10 split with a fixed seed: hold out 20% first, then cut that
# held-out part in half to obtain the validation and test sets.
train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Gather the three splits under their conventional names.
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': test_valid_split['train'],
    'test': test_valid_split['test']
})

print("\n--- Dataset Split Information ---")
print(dataset_dict)
for split_name, split in dataset_dict.items():
    print(f"{split_name.capitalize()} samples: {len(split)}")


# Load the pretrained BERT tokenizer used by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level `tokenizer`.

    Args:
        examples: Mapping with a 'text' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts, with truncation
        enabled and padding set to 'max_length'.
    """
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Tokenize every split in batches, then keep only what the model consumes:
# drop the raw string columns and expose the integer target as 'labels'.
tokenized_datasets = (
    dataset_dict
    .map(tokenize_function, batched=True)
    .remove_columns(['text', 'label_name'])
    .rename_column("label", "labels")
)

# Have the datasets hand back PyTorch tensors.
tokenized_datasets.set_format("torch")

print("\n--- Final Tokenized Dataset (Ready for Training) ---")
print(tokenized_datasets)
print(tokenized_datasets['train'].features)

In [None]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A (predictions, labels) pair. The predicted class is taken
            as the argmax of `predictions` over its last axis.

    Returns:
        dict with the keys 'accuracy' and 'f1_weighted'.
    """
    logits, true_ids = eval_pred

    # Per-class scores -> predicted class id.
    predicted_ids = np.argmax(logits, axis=-1)

    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1_weighted': f1_score(true_ids, predicted_ids, average='weighted'),
    }

# Derive the class count and the id<->name tables from the label mapping built
# earlier instead of hard-coding 5, so the classification head always matches
# the data and the saved config carries readable class names.
num_labels = len(label_map)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_map,
)

num_train_epochs = 3
train_batch_size = 8
grad_accum_steps = 2

# Size the warmup relative to the run length. The previous fixed 500 warmup
# steps correspond to 500 * 16 = 8,000 training examples at this effective
# batch size; if the training split holds fewer than roughly 2,700 documents,
# three epochs finish before warmup does and the configured learning rate is
# never reached. The step count below is an approximation for a single device.
steps_per_epoch = max(1, len(tokenized_datasets['train']) // (train_batch_size * grad_accum_steps))
total_train_steps = steps_per_epoch * num_train_epochs
warmup_steps = int(0.1 * total_train_steps)

training_args = TrainingArguments( #Training configuration
    output_dir='./results',
    num_train_epochs=num_train_epochs,

    #Memory optimization: small per-device batches, accumulated over 2 steps
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=grad_accum_steps,

    dataloader_num_workers=4, #4 worker processes prepare upcoming batches in parallel

    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to='none',
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",

    # Keep the checkpoint with the highest validation weighted F1.
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
    greater_is_better=True,
    learning_rate=2e-5,
    fp16=True #Enables faster training on the T4 GPU
)


trainer = Trainer( #Trainer Initialization
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)

print("\nStarting Fine-Tuning on T4 GPU...")
trainer.train()


model_dir = "/content/drive/MyDrive/CS Projects/Winter Project 2526/Models/Bert Model" #Saves the final best model
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(model_dir, exist_ok=True)

trainer.save_model(model_dir)
print(f"\nBERT Model saved to Drive at: {model_dir}")

In [None]:
########## LOGISTIC REGRESSION ##########

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef
import joblib
import os


# Prepare the data. Whole columns are read at once instead of iterating the
# dataset row by row; list() yields plain Python lists whether the column
# comes back as a list or as a lazy column object.
train_texts = list(dataset_dict['train']['text'])
test_texts = list(dataset_dict['test']['text'])
train_labels = list(dataset_dict['train']['label'])
test_labels = list(dataset_dict['test']['label'])

# TF-IDF features limited to 5000 terms; the vocabulary is fitted on the
# training texts only and then applied to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

simple_model = LogisticRegression(max_iter=1000) #Trains the model
simple_model.fit(X_train_tfidf, train_labels)


simple_preds = simple_model.predict(X_test_tfidf) #Evaluates the model
simple_acc = accuracy_score(test_labels, simple_preds)
simple_mcc = matthews_corrcoef(test_labels, simple_preds)

print("Results for Logistic Regression Model:")
print(f"Accuracy: {simple_acc:.4f}")
print(f"MCC: {simple_mcc:.4f}\n")

# Report per-class scores under the category names rather than bare integer
# ids; passing `labels` pins the row order to the id order.
label_ids = sorted(id_to_label)
print(classification_report(
    test_labels,
    simple_preds,
    labels=label_ids,
    target_names=[str(id_to_label[i]) for i in label_ids],
))

model_dir = '/content/drive/MyDrive/CS Projects/Winter Project 2526/Models/LogReg/' #Saves to Google Drive
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(model_dir, exist_ok=True)

# The fitted vectorizer is stored next to the classifier.
joblib.dump(simple_model, os.path.join(model_dir, 'logreg_model.pkl'))
joblib.dump(vectorizer, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))

print(f"\nLogistic Regression Model saved to Drive at: {model_dir}")

In [None]:
########## LSTM ##########

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

max_words = 10000 #Vocabulary size kept by the tokenizer and the embedding
max_len = 200 #Each article is padded or truncated to 200 tokens

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# shuffling are repeatable, in line with the seeded dataset splits.
tf.keras.utils.set_random_seed(42)

# Use a dedicated name for the Keras tokenizer. Rebinding the global
# `tokenizer` would replace the Hugging Face tokenizer that
# tokenize_function() looks up at call time and break any later call to it.
lstm_tokenizer = Tokenizer(num_words=max_words)
lstm_tokenizer.fit_on_texts(train_texts)

# pad_sequences is left on its defaults, which pad and truncate at the front,
# so an over-long article keeps its last `max_len` tokens.
X_train_seq = pad_sequences(lstm_tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
X_test_seq = pad_sequences(lstm_tokenizer.texts_to_sequences(test_texts), maxlen=max_len)


model_lstm = Sequential([ #Builds the LSTM model architecture
    # `input_length` is a deprecated Embedding argument and is not needed:
    # the sequence length is taken from the data on the first fit.
    Embedding(max_words, 128),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per class, derived from the label mapping rather than hard-coded.
    Dense(len(label_map), activation='softmax')
])

# Integer class ids are used as targets, hence the sparse categorical loss.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Training the LSTM...") #Trains the model
model_lstm.fit(X_train_seq, np.array(train_labels), epochs=5, batch_size=32, verbose=1)

lstm_loss, lstm_acc = model_lstm.evaluate(X_test_seq, np.array(test_labels)) #Evaluates the model
print(f"LSTM Accuracy: {lstm_acc:.4f}")

# Local HDF5 copy, written to the current working directory.
model_lstm.save('my_lstm_model.h5')

In [None]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import os

# Convert the per-class probabilities into class predictions.
raw_preds = model_lstm.predict(X_test_seq)
lstm_preds = raw_preds.argmax(axis=1)

# Score the predictions against the held-out test labels.
lstm_mcc = matthews_corrcoef(test_labels, lstm_preds)
lstm_acc = accuracy_score(test_labels, lstm_preds)

print("\nResults for LSTM Model:")
print(f"Accuracy: {lstm_acc:.4f}")
print(f"MCC: {lstm_mcc:.4f}")

# Target folder on Google Drive; create it on first use.
model_dir = "/content/drive/MyDrive/CS Projects/Winter Project 2526/Models/LSTM/"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the model in the .keras format inside that folder.
file_dir = model_dir + "lstm_model.keras"
model_lstm.save(file_dir)
print(f"\nLSTM Model saved to Drive at: {file_dir}")