In [1]:
import os
import pandas as pd
import  seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data/Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
print(df[['airline_sentiment', 'text']].head())
print(df['airline_sentiment'].value_counts())

  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


## Text Cleaning

In [4]:
import re
import nltk
import emoji
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = emoji.demojize(text) # Convert emojis to text
    text = re.sub(r'http\S+|@\S+|#\S+', '', text) # Remove URLs, mentions, and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)       # Remove special characters
    text = text.lower()
    tokens = text.split()
    return ' '.join([word for word in tokens if word not in stop_words])

df['text'] = df['text'].apply(preprocess_text)
df['airline_sentiment'] = df['airline_sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})
print(df[['airline_sentiment', 'text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sprua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   airline_sentiment                                               text
0                  1                                               said
1                  2      plus youve added commercials experience tacky
2                  1       didnt today must mean need take another trip
3                  0  really aggressive blast obnoxious entertainmen...
4                  0                               really big bad thing


## Traditional ML

### Vectorization using TF-VDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['text']).toarray()
y = df['airline_sentiment']


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
GS = GridSearchCV(lr, param_grid={'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}, cv=3)
GS.fit(X_train, y_train)

print("Best parameters found: ", GS.best_params_)

model = GS.best_estimator_
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

lr_cnf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(lr_cnf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Logistic Regression Confusion Matrix')

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)

RS = RandomizedSearchCV(rf, param_distributions={'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}, n_iter=5, cv=3)
RS.fit(X_train, y_train)
print("Best parameters found: ", RS.best_params_)
rf_model = RS.best_estimator_
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))
rf_cnf_mat = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(rf_cnf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Random Forest Confusion Matrix')

In [None]:
from sklearn.svm import SVC

svc = SVC(random_state=42, class_weight='balanced')

RS = RandomizedSearchCV(svc, param_distributions={'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']}, n_iter=5, cv=3)
RS.fit(X_train, y_train)
print("Best parameters found: ", RS.best_params_)
svc_model = RS.best_estimator_
svc_model.fit(X_train, y_train)

y_pred_svc = svc_model.predict(X_test)
print(classification_report(y_test, y_pred_svc))
svc_cnf_mat = confusion_matrix(y_test, y_pred_svc)
sns.heatmap(svc_cnf_mat, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Random Forest Confusion Matrix')

#### BERT

In [13]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from datasets import Dataset
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['airline_sentiment'], random_state=42)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset = Dataset.from_pandas(train_df[['text', 'airline_sentiment']])
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.rename_column('airline_sentiment', 'labels')
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

test_dataset = Dataset.from_pandas(test_df[['text', 'airline_sentiment']])
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.rename_column('airline_sentiment', 'labels')
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/11712 [00:00<?, ? examples/s]

Map:   0%|          | 0/2928 [00:00<?, ? examples/s]

In [14]:
train_dataset, test_dataset

(Dataset({
     features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 11712
 }),
 Dataset({
     features: ['text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 2928
 }))

In [15]:
class_counts = torch.tensor([9178, 3099, 2363], dtype=torch.float)
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum()

class WeightedBERT(DistilBertForSequenceClassification):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

model = WeightedBERT.from_pretrained("distilbert-base-uncased", num_labels=3)

os.makedirs('./results', exist_ok=True)
os.makedirs('./logs', exist_ok=True)
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy = "epoch",
    save_strategy = "epoch",
    logging_dir='./logs',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)


Some weights of WeightedBERT were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    roc_auc = roc_auc_score(labels, preds, multi_class='ovr')
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=collator
)

trainer.train()


Epoch,Training Loss,Validation Loss


: 

In [None]:
results = trainer.evaluate()
print(results)
