In [188]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# Display names for the three classes; the evaluation helpers in this notebook
# pass this list as `target_names` to `classification_report`.
# NOTE(review): presumably listed in ascending `category_code` order so that
# each name lines up with the right class — confirm against the pickled data.
target_names = [
    "Big Tech & Startups",
    "Science & Futuristic Technology",
    "Programming, Design & Data Science",
]

In [189]:
def load_datasets(prefix):
    """Load the three pickled splits stored under ``data/`` for ``prefix``.

    Args:
        prefix: File-name prefix of the split files (the notebook uses
            "articles" and "sentences").

    Returns:
        A ``(train, validation, test)`` tuple holding whatever
        ``pd.read_pickle`` returns for each file, read in that order.
    """
    # NOTE: unpickling executes arbitrary code; only point this at trusted files.
    pickle_paths = (
        f'data/{prefix}_training.pkl',
        f'data/{prefix}_validation.pkl',
        f'data/{prefix}_test.pkl',
    )
    train_df, validation_df, test_df = map(pd.read_pickle, pickle_paths)
    return train_df, validation_df, test_df

In [190]:
# Read the pickled train/validation/test frames for both granularities.
articles_train_df, articles_validation_df, articles_test_df = load_datasets("articles")
sentences_train_df, sentences_validation_df, sentences_test_df = load_datasets("sentences")


def _texts_and_labels(frame):
    """Return the ``text`` and ``category_code`` columns of ``frame`` as two lists."""
    return frame['text'].tolist(), frame['category_code'].tolist()


# Article-level inputs and targets, one (data, labels) pair per split.
articles_training_data, articles_training_labels = _texts_and_labels(articles_train_df)
articles_test_data, articles_test_labels = _texts_and_labels(articles_test_df)
articles_validation_data, articles_validation_labels = _texts_and_labels(articles_validation_df)

# Sentence-level inputs and targets, one (data, labels) pair per split.
sentences_training_data, sentences_training_labels = _texts_and_labels(sentences_train_df)
sentences_test_data, sentences_test_labels = _texts_and_labels(sentences_test_df)
sentences_validation_data, sentences_validation_labels = _texts_and_labels(sentences_validation_df)

In [191]:
def train_and_evaluate_multinomial_nb(training_data, training_labels, test_data, test_labels):
    """Train a bag-of-words Multinomial Naive Bayes model and report on the test set.

    Args:
        training_data: Documents used to fit the ``CountVectorizer`` and the classifier.
        training_labels: Targets aligned with ``training_data``.
        test_data: Documents to predict on; vectorized with the fitted vocabulary.
        test_labels: Ground-truth targets aligned with ``test_data``.

    Returns:
        The result of ``classification_report`` for the test predictions,
        labelled with the module-level ``target_names``.
    """
    bag_of_words = CountVectorizer()
    # The vocabulary is learned from the training documents only.
    x_train = bag_of_words.fit_transform(training_data)
    x_test = bag_of_words.transform(test_data)

    predicted = MultinomialNB().fit(x_train, training_labels).predict(x_test)

    return classification_report(test_labels, predicted, target_names=target_names)

In [192]:
def train_and_evaluate_logistic_regression(training_data, training_labels, test_data, test_labels):
    """Train a scaled bag-of-words logistic regression and report on the test set.

    Args:
        training_data: Documents used to fit the ``CountVectorizer`` and the model.
        training_labels: Targets aligned with ``training_data``.
        test_data: Documents to predict on; vectorized with the fitted vocabulary.
        test_labels: Ground-truth targets aligned with ``test_data``.

    Returns:
        The result of ``classification_report`` for the test predictions,
        labelled with the module-level ``target_names``.
    """
    bag_of_words = CountVectorizer()
    x_train = bag_of_words.fit_transform(training_data)
    x_test = bag_of_words.transform(test_data)

    # Counts are rescaled by MaxAbsScaler before they reach the classifier.
    scaled_model = make_pipeline(MaxAbsScaler(), LogisticRegression())
    predicted = scaled_model.fit(x_train, training_labels).predict(x_test)

    return classification_report(test_labels, predicted, target_names=target_names)

In [193]:
def train_and_evaluate_tfidf_logistic_regression(training_data, training_labels, test_data, test_labels):
    """Train a TF-IDF logistic regression and report on the test set.

    Args:
        training_data: Documents used to fit the ``TfidfVectorizer`` and the model.
        training_labels: Targets aligned with ``training_data``.
        test_data: Documents to predict on; vectorized with the fitted vocabulary.
        test_labels: Ground-truth targets aligned with ``test_data``.

    Returns:
        The result of ``classification_report`` for the test predictions,
        labelled with the module-level ``target_names``.
    """
    tfidf = TfidfVectorizer()
    x_train = tfidf.fit_transform(training_data)
    x_test = tfidf.transform(test_data)

    predicted = LogisticRegression().fit(x_train, training_labels).predict(x_test)

    return classification_report(test_labels, predicted, target_names=target_names)

In [194]:
def rename_labels(dataset):
    """Rename the ``category_code`` column to ``label`` in every split, in place.

    Each of the "train", "validation" and "test" splits is handled
    independently, so a problem with one split never prevents the others from
    being renamed. A split is left untouched when it is absent from
    ``dataset``, has no ``category_code`` column, or already has a ``label``
    column (i.e. it was renamed earlier).

    Args:
        dataset: Mapping from split name to an object exposing ``column_names``
            and ``rename_column(old, new)``; entries are replaced with the
            renamed split.

    Returns:
        None. ``dataset`` is mutated.
    """
    # A fixed tuple (rather than a set) keeps the processing order deterministic.
    for split in ("train", "validation", "test"):
        if split not in dataset:
            continue
        columns = dataset[split].column_names
        # Skip splits with nothing to rename or that are already renamed,
        # instead of relying on a blanket `except` to hide the failure.
        if "category_code" not in columns or "label" in columns:
            continue
        dataset[split] = dataset[split].rename_column("category_code", "label")

In [195]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [196]:
from transformers import AutoTokenizer

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    The global ``tokenizer`` must have been assigned before this is called
    (``train_and_evaluate_bert`` does so). Inputs are padded to the maximum
    length and truncated.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [197]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [198]:
import pyarrow as pa
import datasets
from datasets import Dataset

def create_dataset(training_df, validation_df, test_df, *,
                   train_size=300, validation_size=300, test_size=275):
    """Build a ``DatasetDict`` from (optionally truncated) pandas frames.

    Each frame is cut to its first ``*_size`` rows with ``DataFrame.head``,
    converted to an Arrow table and wrapped in a ``Dataset``. The
    ``category_code`` column is then renamed via ``rename_labels``.

    Args:
        training_df: Frame for the "train" split.
        validation_df: Frame for the "validation" split.
        test_df: Frame for the "test" split.
        train_size: Maximum number of leading rows kept for training;
            ``None`` keeps the whole frame. Defaults to 300.
        validation_size: Same, for validation. Defaults to 300.
        test_size: Same, for test. Defaults to 275.

    Returns:
        A ``datasets.DatasetDict`` with "train", "validation" and "test" keys.
    """
    def _first_rows(frame, limit):
        # `head` takes rows in their stored order; no shuffling is done here.
        return frame if limit is None else frame.head(limit)

    dataset = datasets.DatasetDict({
        "train": Dataset(pa.Table.from_pandas(_first_rows(training_df, train_size))),
        "validation": Dataset(pa.Table.from_pandas(_first_rows(validation_df, validation_size))),
        "test": Dataset(pa.Table.from_pandas(_first_rows(test_df, test_size))),
    })
    rename_labels(dataset)
    return dataset

In [199]:
import numpy as np

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            arg-max of ``logits`` along the last axis.

    Returns:
        The result of ``metric.compute`` for those predictions against ``labels``.
        The global ``metric`` must have been assigned beforehand
        (``train_and_evaluate_bert`` does so).
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [200]:
from transformers import Trainer

def get_trainer(model, training_args, tokenized_dataset):
    """Create a ``Trainer`` wired to the train/validation splits.

    Args:
        model: Model handed to ``Trainer``.
        training_args: Passed to ``Trainer`` as ``args``.
        tokenized_dataset: Mapping with "train" and "validation" entries, used
            as the training and evaluation datasets respectively.

    Returns:
        The constructed ``Trainer``, using ``compute_metrics`` for evaluation.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)

In [201]:
from transformers import pipeline

def get_text_classifier(tokenizer, pretrained_model):
    """Build a text-classification pipeline from the locally saved model.

    Args:
        tokenizer: Tokenizer handed to the pipeline.
        pretrained_model: Base model name; the model is loaded from the
            relative directory ``./<pretrained_model>_tldr``.

    Returns:
        The object returned by ``pipeline``.
    """
    saved_model_dir = f'./{pretrained_model}_tldr'
    return pipeline(task="text-classification", model=saved_model_dir, tokenizer=tokenizer)

In [202]:
from sklearn.metrics import classification_report

def bert_evaluate(tokenized_dataset, text_classifier, full=True, sample_size=20, seed=77):
    """Classify the test split with ``text_classifier`` and print a report.

    Args:
        tokenized_dataset: Mapping whose "test" entry yields examples with
            ``'text'`` and ``'label'`` fields.
        text_classifier: Callable taking a string and returning a list whose
            first item has a ``'label'`` of the form ``<prefix>_<int>``.
        full: When True (default) every test example is evaluated. When False
            a shuffled sample is evaluated instead.
        sample_size: Number of examples in the sample when ``full`` is False;
            clamped to the size of the test split. Defaults to 20.
        seed: Shuffle seed used when ``full`` is False. Defaults to 77.

    Returns:
        None. The ``classification_report`` is printed.
    """
    test_subset = tokenized_dataset["test"]

    if not full:
        # Clamp so a small test split does not make `select` go out of range.
        sample_size = min(sample_size, len(test_subset))
        test_subset = test_subset.shuffle(seed=seed).select(range(sample_size))

    y_pred = []
    y_true = []

    for example in test_subset:
        # Only the first 512 characters of the text are sent to the classifier.
        preds = text_classifier(example['text'][0:512])
        # The predicted class id is the integer after the last underscore.
        y_pred.append(int(preds[0]['label'].split('_')[-1]))
        y_true.append(int(example['label']))

    print(classification_report(y_true, y_pred, target_names=target_names))

In [203]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [204]:
import evaluate
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments


def train_and_evaluate_bert(training_data, validation_data, test_data):
    """Fine-tune multilingual BERT on the given frames and print a test report.

    Steps: load the tokenizer, build and tokenize the dataset, load the
    accuracy metric, fine-tune the classifier, save it to
    ``bert-base-multilingual-uncased_tldr``, reload it as a pipeline and
    evaluate it on the test split with ``bert_evaluate``.

    Side effects:
        Assigns the module-level ``tokenizer`` and ``metric`` globals, which
        ``tokenize_function`` and ``compute_metrics`` read.

    Args:
        training_data: Frame passed to ``create_dataset`` as the training split.
        validation_data: Frame passed as the validation split.
        test_data: Frame passed as the test split.

    Returns:
        None. The report is printed by ``bert_evaluate``.
    """
    global tokenizer, metric

    checkpoint = "bert-base-multilingual-uncased"
    saved_model_path = checkpoint + "_tldr"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    splits = create_dataset(training_data, validation_data, test_data)
    tokenized_splits = splits.map(tokenize_function, batched=True)
    metric = evaluate.load("accuracy")

    # NOTE(review): num_labels=4 while `target_names` lists three classes —
    # confirm against the actual range of `category_code` values.
    classifier_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)
    trainer = get_trainer(
        classifier_model,
        TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch"),
        tokenized_splits,
    )
    trainer.train()
    trainer.save_model(saved_model_path)

    # The pipeline is built from the directory written by save_model above.
    bert_evaluate(tokenized_splits, get_text_classifier(tokenizer, checkpoint))

In [205]:
# The sklearn helpers return their classification report, so it is printed here.
print("MultinomialNB for articles:")
print(train_and_evaluate_multinomial_nb(articles_training_data, articles_training_labels, articles_test_data, articles_test_labels))

print("Logistic Regression for articles:")
print(train_and_evaluate_logistic_regression(articles_training_data, articles_training_labels, articles_test_data, articles_test_labels))

print("TF-IDF Logistic Regression for articles:")
print(train_and_evaluate_tfidf_logistic_regression(articles_training_data, articles_training_labels, articles_test_data, articles_test_labels))

print("Bert for articles:")
# train_and_evaluate_bert prints its own report and returns None, so wrapping
# it in print() would only add a stray "None" line to the output.
train_and_evaluate_bert(articles_train_df, articles_validation_df, articles_test_df)

MultinomialNB for articles:
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.89      0.96      0.92        75
   Science & Futuristic Technology       0.99      0.86      0.92        78
Programming, Design & Data Science       0.96      0.99      0.98       123

                          accuracy                           0.95       276
                         macro avg       0.94      0.94      0.94       276
                      weighted avg       0.95      0.95      0.95       276

Logistic Regression for articles:
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.89      0.89      0.89        75
   Science & Futuristic Technology       0.93      0.83      0.88        78
Programming, Design & Data Science       0.92      0.98      0.95       123

                          accuracy                           0.92       276
                     

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.283822,0.938182
2,No log,0.208001,0.934545
3,No log,0.239922,0.930909


275
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.80      0.95      0.87        75
   Science & Futuristic Technology       0.95      0.77      0.85        77
Programming, Design & Data Science       0.98      0.98      0.98       123

                          accuracy                           0.91       275
                         macro avg       0.91      0.90      0.90       275
                      weighted avg       0.92      0.91      0.91       275

None


In [206]:
# The sklearn helpers return their classification report, so it is printed here.
print("MultinomialNB for sentences:")
print(train_and_evaluate_multinomial_nb(sentences_training_data, sentences_training_labels, sentences_test_data, sentences_test_labels))

print("Logistic Regression for sentences:")
print(train_and_evaluate_logistic_regression(sentences_training_data, sentences_training_labels, sentences_test_data, sentences_test_labels))

print("TF-IDF Logistic Regression for sentences:")
print(train_and_evaluate_tfidf_logistic_regression(sentences_training_data, sentences_training_labels, sentences_test_data, sentences_test_labels))

print("Bert for sentences:")
# train_and_evaluate_bert prints its own report and returns None, so wrapping
# it in print() would only add a stray "None" line to the output.
train_and_evaluate_bert(sentences_train_df, sentences_validation_df, sentences_test_df)

MultinomialNB for sentences:
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.83      0.86      0.85       406
   Science & Futuristic Technology       0.88      0.83      0.86       415
Programming, Design & Data Science       0.90      0.92      0.91       541

                          accuracy                           0.88      1362
                         macro avg       0.87      0.87      0.87      1362
                      weighted avg       0.88      0.88      0.88      1362

Logistic Regression for sentences:
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.83      0.81      0.82       406
   Science & Futuristic Technology       0.84      0.79      0.81       415
Programming, Design & Data Science       0.87      0.92      0.89       541

                          accuracy                           0.85      1362
                   

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.78217,0.703333
2,No log,0.548378,0.8
3,No log,0.497528,0.813333


275
                                    precision    recall  f1-score   support

               Big Tech & Startups       0.70      0.86      0.78        80
   Science & Futuristic Technology       0.89      0.82      0.86        80
Programming, Design & Data Science       0.90      0.81      0.85       115

                          accuracy                           0.83       275
                         macro avg       0.83      0.83      0.83       275
                      weighted avg       0.84      0.83      0.83       275

None
