### Setup

In [5]:
%%capture
!pip install transformers textblob
!pip install nltk
!pip install accelerate -U
!pip install torch


In [1]:
import re
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
dataset_path = "dataset - modified - equal - v3.csv"

In [3]:
import pandas as pd
df = pd.read_csv(dataset_path)
df

Unnamed: 0,hate_speech_count,tweet
0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
1,1,""" @rhythmixx_ :hobbies include: fighting Maria..."
2,1,""" bitch get up off me """
3,1,""" bitch who do you love """
4,1,""" these hoes like niggas that spend money not ..."
...,...,...
10041,0,&#8220;@chloeonvine: Are. You. Kidding. Me. Ri...
10042,0,&#8220;@chloeonvine: im just a sarcastic lil b...
10043,0,&#8220;@chrisbrown: These hoes ain't loyal [i ...
10044,0,&#8220;@clzcdxx: im just a bitch ass elf&#8221...


### **Dataset-preprocessing**

In [4]:
#Data Cleaning
def remove_unwanted_text(content):
    '''
    Removes unwanted text from content using regex
    Input
    content: A string
    Output
    final: the final parsed string
    '''
    #\s is white space, ^ is a negation of the charecter set [], + means followed by any charecter specified
    handle = re.sub('@[^\s]+', '', content)
    link = re.sub('http[^\s]+', '', handle)
    link = re.sub('www[^\s]+', '', link)
    ht = re.sub('#[^\s]+', '', link)
    final = re.sub('&[^\s]+', '', ht)
    return final

def remove_punctuations(words):
    '''
    Input
    words: A list of words to be processed
    Output
    returns a list of words that punctuations and numbers have been removed.
    '''
    new_words = []
    for w in words:
        l = re.sub('[^A-Za-z ]+', '', w)
        if l != '':
            new_words.append(l)
    return new_words

def remove_stop_words(words):
    '''
    Input
    words: Words to be processed
    Output
    returns a list of words without English stopwords
    '''
    sw = stopwords.words("english") # English Stop Words
    #Make sure that the stopwords also dont have punctioations
    sw = remove_punctuations(sw)
    return [w for w in words if w.lower() not in sw]

def stem_words(words):
    '''
    Input
    words: A list of words to be stemmed
    Output
    returns a list of stemmed words
    '''
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_words(words):
    '''
    Input
    words: A list of words to be lemmatized
    Output
    returns a list of lemmatized words
    '''
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

def clean_data(content):


    '''
    Cleans the incoming data
    Input
    content: a DataFrame series
    '''
    cleaned_data = []
    for i in range(len(content)):
        #by iterating, we are using every row in clean_tweet as a string and we pass it to the cleaning functions
        tweet = remove_unwanted_text(content[i].lower())
        tb = TextBlob(tweet) #Tokenize the tweet after removing unwanted text
        #tb is a textblob object, we use tb.words to access the tokenized words in it as a list of words
        words = remove_punctuations(tb.words)
        # stemmed_words = stem_words(words)  # Stem the words
        lemmatized_words = lemmatize_words(words)
        final_words = remove_stop_words(lemmatized_words)
        cleaned_data.append(" ".join(final_words)) #join the words as a string and append it
    return cleaned_data

In [5]:
# Clean the 'tweet' column and add it in a new row "clean_tweet"
df['clean_tweet'] = clean_data(df['tweet'])
df['clean_tweet'][9]

'ppl talk bad ghettohood kid growing nigga funnnnnnn'

### **Converting to a hugging face dataset**

In [6]:
%%capture
!pip install datasets

In [7]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['hate_speech_count', 'tweet', 'clean_tweet'],
    num_rows: 10046
})

In [12]:
dataset[1]

{'hate_speech_count': 1,
 'tweet': '" @rhythmixx_ :hobbies include: fighting Mariam"\n\nbitch',
 'clean_tweet': 'hobby include fighting mariam bitch'}

### **Tokenizing the dataset**

In [13]:
from transformers import AutoTokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [14]:
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["clean_tweet"], padding=True, truncation=True, return_tensors="pt")
    tokenized_inputs["label"] = examples["hate_speech_count"]
    return tokenized_inputs
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10046 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
tokenized_dataset[1]

{'hate_speech_count': 1,
 'tweet': '" @rhythmixx_ :hobbies include: fighting Mariam"\n\nbitch',
 'clean_tweet': 'hobby include fighting mariam bitch',
 'input_ids': [0,
  298,
  27825,
  680,
  2190,
  4401,
  6009,
  32594,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'label': 1}

### **Splitting the dataset**

In [38]:
train_val_split = tokenized_dataset.train_test_split(test_size=0.2)
test_dataset = train_val_split['test']
train_val_dataset = train_val_split['train']

# Further split the train_val_dataset into train and validation sets
train_val_split = train_val_dataset.train_test_split(test_size=0.25)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

In [39]:
# Print to verify splits
print(f"Train dataset length: {len(train_dataset)}")
print(f"Validation dataset length: {len(val_dataset)}")
print(f"Test dataset length: {len(test_dataset)}")

Train dataset length: 6027
Validation dataset length: 2009
Test dataset length: 2010


### **Loading the model**

In [40]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2, ignore_mismatched_sizes=True)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

### **Training the model**

In [41]:
from transformers import TrainingArguments, DataCollatorWithPadding, AdamW
import torch
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.3
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate= 2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=14,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='precision',
)



In [42]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall':recall,
        'f1': f1,
    }

In [43]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  
    compute_metrics=compute_metrics,
    optimizers=(optimizer,None),  
)


In [44]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.635438,0.74216,0.759635,0.74216,0.738497
2,0.569200,0.5091,0.769537,0.772642,0.769537,0.769133
3,0.487400,0.478878,0.792434,0.792926,0.792434,0.792245
4,0.461500,0.504087,0.792434,0.792524,0.792434,0.792446
5,0.461500,0.490903,0.797909,0.800254,0.797909,0.797319
6,0.436900,0.525475,0.794425,0.794787,0.794425,0.794275
7,0.421200,0.532209,0.784968,0.788267,0.784968,0.784581
8,0.400000,0.50932,0.792932,0.792955,0.792932,0.792893
9,0.400000,0.488849,0.794425,0.797596,0.794425,0.793644
10,0.386900,0.497639,0.800398,0.800693,0.800398,0.800274


TrainOutput(global_step=5278, training_loss=0.4223957791749435, metrics={'train_runtime': 1042.5865, 'train_samples_per_second': 80.931, 'train_steps_per_second': 5.062, 'total_flos': 2287501904185680.0, 'train_loss': 0.4223957791749435, 'epoch': 14.0})

In [35]:
# Evaluate the model on the validation dataset
evaluation_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation Results:", evaluation_results)

# Optionally, evaluate the model on the test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

Validation Results: {'eval_loss': 0.8049112558364868, 'eval_accuracy': 0.8242906918865107, 'eval_precision': 0.8251956179511837, 'eval_recall': 0.8242906918865107, 'eval_f1': 0.8242522912568999, 'eval_runtime': 5.6822, 'eval_samples_per_second': 353.56, 'eval_steps_per_second': 22.174, 'epoch': 5.0}
Test Results: {'eval_loss': 0.8430566787719727, 'eval_accuracy': 0.8393034825870647, 'eval_precision': 0.8398028620336193, 'eval_recall': 0.8393034825870647, 'eval_f1': 0.8392367487313183, 'eval_runtime': 5.7312, 'eval_samples_per_second': 350.712, 'eval_steps_per_second': 21.985, 'epoch': 5.0}


In [36]:
# Save the trained model
model.save_pretrained('./trained_roBERTa_model')
tokenizer.save_pretrained('./trained_roBERTa_model')

('./trained_roBERTa_model/tokenizer_config.json',
 './trained_roBERTa_model/special_tokens_map.json',
 './trained_roBERTa_model/vocab.json',
 './trained_roBERTa_model/merges.txt',
 './trained_roBERTa_model/added_tokens.json',
 './trained_roBERTa_model/tokenizer.json')

In [33]:

# Load the saved tokenizer
tokenizer = AutoTokenizer.from_pretrained('./trained_roBERTa_model')

# Load the saved model
model = AutoModelForSequenceClassification.from_pretrained('./trained_roBERTa_model')

# Example usage: Tokenize a new input and make a prediction
inputs = tokenizer("This is very hateful.", return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print("Predicted class:", predictions.item())


Predicted class: 1


In [45]:
import shutil

# Path to the directory you want to delete
directory_path = './results'

# Delete the directory
try:
    shutil.rmtree(directory_path)
    print(f'Directory {directory_path} has been deleted successfully.')
except FileNotFoundError:
    print(f'Directory {directory_path} does not exist.')
except Exception as e:
    print(f'An error occurred: {e}')



Directory ./results has been deleted successfully.
