In [None]:
#!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw airline-tweets CSV, then show its column index and (rows, cols)
# shape before previewing the first rows.
dfraw = pd.read_csv('AirlineTweets.csv')
print(dfraw.columns, dfraw.shape, sep='\n')
dfraw.head()

In [None]:
# Keep only the sentiment label and the tweet text; the copy keeps later
# column additions from touching dfraw.
df = dfraw.loc[:, ['airline_sentiment', 'text']].copy()
df.head()

In [None]:
# Plot how many tweets fall into each sentiment value to check class balance.
df['airline_sentiment'].hist()

In [None]:
target_map = {'negative': 0, 'neutral': 2, 'positive': 1}
df['target'] = df.airline_sentiment.map(target_map)

df2 = df[['text', 'target']].copy()
df2.columns = ['sentence', 'label']
df2.to_csv('data.csv', index=False)
!head data.csv

In [None]:
from datasets import load_dataset

# `load_metric` is not used by any cell visible in this notebook, and recent
# `datasets` releases no longer ship it, so a hard import would abort this
# cell on a fresh install. Keep the name bound where it still exists.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

# Read the exported CSV; the rows are accessed below as raw_dataset['train'].
raw_dataset = load_dataset('csv', data_files='data.csv')
raw_dataset

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the partition
# the same from run to run.
split = raw_dataset['train'].train_test_split(seed=42, test_size=0.3)
split

In [None]:
# if you have multiple csv files
# load_dataset('csv', data_files=['file1.csv', 'file2.csv'])

# if you already have a train-test split:
# load_dataset(
#     'csv',
#     data_files={'train': ['train1.csv', 'train2.csv'],
#                 'test': 'test.csv'
#     }
# )

In [None]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint; the same value is reused when the
# classification model is loaded, so tokenizer and model stay matched.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'sentence' entries of a batch of examples.

    Args:
        examples: mapping that is indexed with the key 'sentence'.

    Returns:
        Whatever the module-level `tokenizer` returns when called on those
        sentences with truncation enabled.
    """
    sentences = examples['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded
# Apply the tokenizer to both splits; batched=True hands tokenize_function a
# batch of examples per call instead of a single row.
tokenized_datasets = split.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
# Load `checkpoint` as a sequence-classification model with three output
# labels, one per id in target_map.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
from torchinfo import summary
# Display torchinfo's summary of the loaded model (last expression of the
# cell, so the notebook renders it).
summary(model)

In [None]:
import inspect

# The "evaluate every epoch" switch is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Passing a
# spelling the installed version does not know raises a TypeError, so pick
# whichever keyword this install's TrainingArguments actually accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

# Evaluate and save a checkpoint once per epoch, for three epochs; all output
# goes under 'training_dir'.
training_args = TrainingArguments(
    output_dir='training_dir',
    save_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    **{_eval_strategy_arg: 'epoch'},
)

In [None]:
def compute_metrics(logits_and_labels):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        logits_and_labels: a pair that unpacks as (logits, labels).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    scores, y_true = logits_and_labels
    # The predicted class id is the position of the largest score on the last axis.
    y_pred = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}

In [None]:
# Wire the model, training configuration, tokenized splits and metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model. With the arguments configured for this run, evaluation
# and checkpoint saving both happen once per epoch.
trainer.train()

In [None]:
# List the output directory to see which checkpoint folders training wrote.
!ls training_dir

In [None]:
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

# Checkpoint folders are named after the global training step, which changes
# with dataset size, batch size and device count. Look up the newest one
# instead of hard-coding its step number.
last_checkpoint = get_last_checkpoint('training_dir')
if last_checkpoint is None:
    raise FileNotFoundError(
        "no checkpoint-* folder found in 'training_dir'; run trainer.train() first"
    )

# Use the first GPU when one is visible; otherwise fall back to CPU (-1) so
# the cell also runs on machines without CUDA.
savemodel = pipeline(
    'text-classification',
    model=last_checkpoint,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Inspect the held-out split that the reloaded model is scored on below.
split['test']

In [None]:
# Run the reloaded pipeline over every held-out sentence, then report how many
# results came back and show the first one to see its structure.
test_pred = savemodel(split['test']['sentence'])
print(len(test_pred))
test_pred[0]

In [None]:
def get_label(d):
    """Extract the integer class id from one prediction dict.

    Args:
        d: mapping whose 'label' value is a string ending in an integer after
            its last underscore (e.g. 'LABEL_2'); a string with no underscore
            is parsed as a whole.

    Returns:
        The parsed integer.

    Raises:
        ValueError: if the trailing piece is not a valid integer.
    """
    _, _, suffix = d['label'].rpartition('_')
    return int(suffix)

# Replace the pipeline's result dicts with plain integer class ids so they can
# be compared against split['test']['label'].
# NOTE(review): this rebinds test_pred from dicts to ints, so re-running this
# cell without first re-running the prediction cell fails inside get_label.
test_pred = [get_label(d) for d in test_pred]

In [None]:
# Accuracy of the reloaded checkpoint on the held-out split.
print("acc:", accuracy_score(split['test']['label'], test_pred))

In [None]:
# Macro-averaged F1 of the reloaded checkpoint on the held-out split.
print("f1:", f1_score(split['test']['label'], test_pred, average='macro'))

In [None]:
def plot_cm(cm):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cm: 3x3 array-like of confusion-matrix values; it is labelled with the
            three sentiment names on both axes.

    Returns:
        None. The heatmap is drawn with the x axis labelled 'Predicted' and
        the y axis labelled 'True'.
    """
    # Names are listed in label-id order 0, 1, 2 as assigned by target_map,
    # where 'positive' is 1 and 'neutral' is 2.
    labels = ['negative', 'positive', 'neutral']
    frame = pd.DataFrame(cm, index=labels, columns=labels)
    heatmap_ax = sn.heatmap(frame, annot=True, fmt='g')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('True')
    
# Confusion matrix of true vs. predicted ids on the held-out split.
# normalize='true' normalizes over the true labels, so each row reads as
# per-class fractions rather than raw counts.
cm = confusion_matrix(split['test']['label'], test_pred, normalize='true')
plot_cm(cm)