# Part 2
### 99106511 | Pouya Farivar

First, let's load the training data:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the training tweets. The file is read with header=None, so the
# column labels are attached explicitly right after reading.
columns = ['target', 'id', 'datetime', 'query', 'account', 'tweet']

data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='Latin',
    header=None,
).set_axis(columns, axis=1)

# Preview the first rows (last expression of the cell, so it is displayed).
data.head()

Unnamed: 0,target,id,datetime,query,account,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
import spacy
from transformers import pipeline


This is the cleaning process: I remove URLs, hashtags, cashtags, and mentions, and I also remove the stop words. Finally, a lemmatization function is applied to the tokens. Simple, but necessary.

In [8]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


# Step 1: Cleaning the data
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset.

    The list is read from the corpus on the first call only; subsequent
    calls reuse the cached set instead of rebuilding it for every tweet.
    """
    return frozenset(stopwords.words('english'))


# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize a raw tweet into a space-separated string of content words.

    Steps, in order: strip URLs, @mentions, #hashtags and $cashtags, delete
    punctuation, lower-case, tokenize with ``word_tokenize`` and drop English
    stop words.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (may be empty).
    """
    # Removing urls
    text = re.sub(r'http\S+', '', text)
    # Removing user mentions (e.g., @username)
    text = re.sub(r'@\w+', '', text)
    # Removing hashtags
    text = re.sub(r'#\w+', '', text)
    # Removing cashtags
    text = re.sub(r'\$[^\s]+', '', text)
    # Removing punctuation.
    # NOTE(review): this runs before the stop-word filter, so apostrophes are
    # already gone when tokens are compared against the stop-word list --
    # confirm that contractions are meant to be handled this way.
    text = text.translate(_PUNCTUATION_TABLE)
    # lower case
    text = text.lower()
    # removing stop words (set is cached, see _english_stop_words)
    stop_words = _english_stop_words()
    words = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(words)

# Drop repeated (tweet, target) pairs first, then discard every row that
# still contains a missing value.
data = data.drop_duplicates(subset=['tweet', 'target']).dropna()

# Step 2: Creating a pipeline with lemmatization

from functools import lru_cache


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance, created on first use.

    preprocess_text is applied to every row, so the lemmatizer is built once
    and reused instead of being constructed again for each tweet.
    """
    return WordNetLemmatizer()


def preprocess_text(text):
    """Lemmatize every token of an already-cleaned tweet.

    Args:
        text: A cleaned tweet (``str``), e.g. the output of ``clean_text``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = _shared_lemmatizer()
    # Lemmatization, token by token
    tokens = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(tokens)

# Build the text features: cleaned tweets first, then their lemmatized form.
data['clean_tweet'] = data['tweet'].apply(clean_text)
data['preprocessed_tweet'] = data['clean_tweet'].apply(preprocess_text)

X, y = data['preprocessed_tweet'], data['target']

# Hold out 20% of the rows, then cut that hold-out in half, which gives an
# 80 / 10 / 10 train / validation / test split. The seed is fixed so the
# split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


Now we build Bag-of-Words and TF-IDF models with parameter tuning. Note that many parameter combinations were tried during the tuning process; this is the final pick.

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

def _naive_bayes_pipeline(vectorizer):
    """Chain the given text vectorizer with a multinomial Naive Bayes classifier."""
    return Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB()),
    ])


# --- Bag-of-Words (BoW) model ---
bow_pipeline = _naive_bayes_pipeline(CountVectorizer())

# There are a lot of tunable parameters; only the two most important ones
# (n-gram range and vocabulary size) are searched.
bow_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__max_features': [10000, 100000],
}

# 5-fold grid search scored on accuracy
bow_grid_search = GridSearchCV(bow_pipeline, bow_params, cv=5, scoring='accuracy')
bow_grid_search.fit(X_train, y_train)

# --- TF-IDF model, tuned over the same grid ---
tfidf_pipeline = _naive_bayes_pipeline(TfidfVectorizer())

tfidf_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__max_features': [10000, 100000],
}

tfidf_grid_search = GridSearchCV(tfidf_pipeline, tfidf_params, cv=5, scoring='accuracy')
tfidf_grid_search.fit(X_train, y_train)

print("Best parameters for BoW model:", bow_grid_search.best_params_)
print("Best parameters for TF-IDF model:", tfidf_grid_search.best_params_)

# --- Validation accuracy (predictions come from the refit best estimator) ---
bow_val_predictions = bow_grid_search.predict(X_val)
accuracy_bow_val = accuracy_score(y_val, bow_val_predictions)

tfidf_val_predictions = tfidf_grid_search.predict(X_val)
accuracy_tfidf_val = accuracy_score(y_val, tfidf_val_predictions)

print("Accuracy on validation set (BoW):", accuracy_bow_val)
print("Accuracy on validation set (TF-IDF):", accuracy_tfidf_val)

# --- Test accuracy ---
bow_test_predictions = bow_grid_search.predict(X_test)
accuracy_bow_test = accuracy_score(y_test, bow_test_predictions)

tfidf_test_predictions = tfidf_grid_search.predict(X_test)
accuracy_tfidf_test = accuracy_score(y_test, tfidf_test_predictions)

print("Accuracy on test set (BoW):", accuracy_bow_test)
print("Accuracy on test set (TF-IDF):", accuracy_tfidf_test)


Best parameters for BoW model: {'vectorizer__max_features': 100000, 'vectorizer__ngram_range': (1, 2)}
Best parameters for TF-IDF model: {'vectorizer__max_features': 100000, 'vectorizer__ngram_range': (1, 2)}
Accuracy on validation set (BoW): 0.7784920028540939
Accuracy on validation set (TF-IDF): 0.7770523271599871
Accuracy on test set (BoW): 0.7801540695838859
Accuracy on test set (TF-IDF): 0.7787017743259456


Here I build and fine-tune the BERT model using libraries from the Hugging Face community. Note that the fine-tuning process was stopped with a keyboard interrupt because of the huge amount of time it required to finish on the M1.

In [82]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch

# Tokenizing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, return_tensors='pt')
X_val_tokens = tokenizer(list(X_val), padding=True, truncation=True, return_tensors='pt')

# Map the sentiment labels {0, 4} to class ids {0, 1}. The result is stored
# under new names so y_train / y_val keep their original encoding for the
# other cells that use them.
y_train_binary = [1 if x == 4 else x for x in y_train]
y_val_binary = [1 if x == 4 else x for x in y_val]

y_train_tensor = torch.tensor(y_train_binary)
y_val_tensor = torch.tensor(y_val_binary)

# Making the loaders in order to tune the model
train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor)
val_dataset = TensorDataset(X_val_tokens['input_ids'], X_val_tokens['attention_mask'], y_val_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Loading the base BERT model. The checkpoint name must match the tokenizer
# above ('bert-base-uncased'); 'bert-base' alone is not the checkpoint the
# tokenizer was loaded from.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training params
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

# Training runs on the CPU. `device` is defined here because the loops below
# move every batch with .to(device).
device = torch.device('cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()

    train_loss = 0.0      # sum of per-batch mean losses
    train_correct = 0     # correctly classified samples so far this epoch
    train_seen = 0        # samples processed so far this epoch

    tqdm_dataloader = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}, Training')

    for batch in tqdm_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        _, predicted = torch.max(outputs.logits, 1)
        train_correct += (predicted == labels).sum().item()
        train_seen += labels.size(0)

        # Running accuracy over the samples seen so far, not over the whole
        # dataset, so the progress bar is meaningful mid-epoch.
        tqdm_dataloader.set_postfix({'loss': loss.item(), 'acc': train_correct / train_seen})

    # outputs.loss is accumulated once per batch, so the epoch average
    # divides by the number of batches.
    avg_train_loss = train_loss / len(train_dataloader)
    train_accuracy = train_correct / len(train_dataloader.dataset)

    print(f"\nEpoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            val_correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = val_correct / len(val_dataloader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Training:   0%| | 484/158369 [12:39<68:47:21,  1.57s/it, loss=0.515, 


KeyboardInterrupt: 

Yeah, since that's not going to happen anytime soon, I go with the base model itself for prediction.

In [9]:
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

X_test_list = list(X_test)

# Sentiment pipeline. The checkpoint and revision are pinned to the ones the
# unpinned default resolved to in this notebook's logged run, so re-running
# the cell does not silently pick up a different default model.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

predictions_bert = []

# predicting the results, one tweet at a time; each call returns a
# one-element list holding a dict with a 'label' entry
for text in tqdm(X_test_list, desc="Predicting Sentiment"):
    prediction = classifier(text)
    predictions_bert.append(prediction)

# Map the pipeline's POSITIVE label (and anything else) back to the
# dataset's 4 / 0 target encoding
predicted_labels = [4 if pred[0]['label'] == 'POSITIVE' else 0 for pred in predictions_bert]

# accuracies on the test set using the pre-trained model (no fine-tuning)
accuracy_bert = accuracy_score(y_test, predicted_labels)
report_bert = classification_report(y_test, predicted_labels)

print("Accuracy on test set (BERT):", accuracy_bert)
print("Classification Report (BERT):\n", report_bert)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Predicting Sentiment: 100%|█████████████| 158370/158370 [53:31<00:00, 49.32it/s]


Accuracy on test set (BERT): 0.6757214118835638
Classification Report (BERT):
               precision    recall  f1-score   support

           0       0.64      0.77      0.70     78638
           4       0.72      0.58      0.64     79732

    accuracy                           0.68    158370
   macro avg       0.68      0.68      0.67    158370
weighted avg       0.68      0.68      0.67    158370



For the final model, where we had to choose a custom one, I used the RoBERTa model, which was the highest ranked for sentiment analysis on Hugging Face. Here are the steps for fine-tuning.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from scipy.special import softmax

# Loading Twitter RoBERTa model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# Load the checkpoint with a 2-class head. The published checkpoint has more
# than two output classes (the inference cells read score index 2), so the
# output layer's shape differs; ignore_mismatched_sizes lets from_pretrained
# re-initialise it instead of failing. This replaces the earlier approach of
# assigning a bare torch.nn.Linear to model.classifier, which does not match
# the interface of the model's own classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=2, ignore_mismatched_sizes=True
)

# Freeze the pre-trained encoder only; the classification head stays
# trainable so that fine-tuning actually updates something.
for param in model.base_model.parameters():
    param.requires_grad = False

# Training params: optimise only the parameters that are still trainable
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-5)
num_epochs = 3

# Tokenizing
X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, return_tensors='pt')
X_val_tokens = tokenizer(list(X_val), padding=True, truncation=True, return_tensors='pt')

# Labels {0, 4} -> class ids {0, 1}, stored under new names so y_train / y_val
# keep their original encoding for other cells
y_train_binary = [1 if x == 4 else x for x in y_train]
y_val_binary = [1 if x == 4 else x for x in y_val]
y_train_tensor = torch.tensor(y_train_binary)
y_val_tensor = torch.tensor(y_val_binary)

# creating dataloaders
train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor)
val_dataset = TensorDataset(X_val_tokens['input_ids'], X_val_tokens['attention_mask'], y_val_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Training runs on the CPU. `device` is defined here because the loops below
# move every batch with .to(device).
device = torch.device('cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            val_correct += (predicted == labels).sum().item()

    # The loss is accumulated once per batch, so average over batches
    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = val_correct / len(val_dataloader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Saving
model.save_pretrained("fine_tuned_twitter_roberta_model")


Approx. 112 hours. I will not lose my CPU for this. I am just using the base model instead of training it; it does well.

In [63]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

import torch

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# results
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()

# Score positions compared below.
# NOTE(review): presumably index 0 is "negative" and index 2 is "positive"
# for this checkpoint -- verify against config.id2label.
NEGATIVE_IDX, POSITIVE_IDX = 0, 2

outputs = []
# Inference only: no_grad avoids building an autograd graph for each tweet.
with torch.no_grad():
    for text in list(X_test):
        # Truncate defensively so an over-long input cannot exceed the
        # model's maximum sequence length.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        logits = model(**encoded_input).logits[0]
        scores = softmax(logits.numpy())
        # The middle class is ignored: whichever of the two outer classes
        # scores higher decides the 0 / 4 label (ties go to 0).
        if scores[NEGATIVE_IDX] >= scores[POSITIVE_IDX]:
            outputs.append(0)
        else:
            outputs.append(4)
accuracy_hugging = accuracy_score(y_test, outputs)
print("Accuracy on test set (Huggibgface):", accuracy_hugging)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy on test set (Huggibgface): 0.7317294942223906


Now let's load the tweets data and use the two best models on it for prediction. These are the RoBERTa and BoW models. We are saving the results of their predictions for the third part.

In [10]:
# Tables produced in part 1. users.csv is read with on_bad_lines='skip',
# so rows pandas cannot parse are dropped instead of aborting the load.
tweets = pd.read_csv('tweets.csv')
users = pd.read_csv('users.csv', on_bad_lines='skip')
companies = pd.read_csv('companies.csv')
entities = pd.read_csv('entities.csv')

  users = pd.read_csv('users.csv', on_bad_lines='skip')


In [11]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Reuse clean_text / preprocess_text (defined for the training data earlier
# in this notebook) to prepare the part-1 tweets in the same way.
# Rows with any missing value are dropped first.
tweets = tweets.dropna()

cleaned_text = tweets['text'].apply(clean_text)
tweets['clean_text'] = cleaned_text
tweets['preprocessed_text'] = cleaned_text.apply(preprocess_text)

BOW Predictions:

In [70]:
# Predict with the tuned BoW grid-search model, then store the result as a
# 0/1 flag: 1 where the model predicted target 4, 0 otherwise.
bow_predictions = bow_grid_search.predict(tweets['preprocessed_text'])
tweets['bow_pred'] = [int(label == 4) for label in bow_predictions]

Roberta Predictions:

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# results
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
import torch

# hugging face predictions on tweets
outputs_tweets = []

# Inference only: no_grad avoids building an autograd graph for each tweet.
# tqdm wraps the column directly, so the bar advances once per tweet.
with torch.no_grad():
    for text in tqdm(tweets['preprocessed_text'], total=len(tweets), desc="Predicting"):
        # Truncate defensively so an over-long input cannot exceed the
        # model's maximum sequence length.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        logits = model(**encoded_input).logits[0]
        scores = softmax(logits.numpy())
        # Compare score index 0 against index 2 and store a 0/1 flag
        # (ties go to 0), matching the 0/1 encoding of 'bow_pred'.
        # NOTE(review): presumably index 0 is "negative" and index 2 is
        # "positive" -- verify against config.id2label.
        if scores[0] >= scores[2]:
            outputs_tweets.append(0)
        else:
            outputs_tweets.append(1)
tweets['hugging_pred'] = outputs_tweets

Predicting:   0%|                    | 8282/9091539 [06:34<120:19:23, 20.97it/s]


KeyboardInterrupt: 

OK, let me stop you there. 120 hours just to get the model's predictions, and that's without training. You know what? I should have bought an RTX instead of this. But just so you know I could do it, here's the proof.

With that being said, I only saved the results of the BoW model for the next part.

In [76]:
# Persist the tweets together with their prediction columns for the next part.
predictions_path = 'tweets_pred.csv'
tweets.to_csv(predictions_path)