<a href="https://colab.research.google.com/github/StefaniaGutu/OffensiveLanguageClassification/blob/main/NLP_Project_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
import string
import spacy
import pickle
from os.path import exists
from gensim.models import Word2Vec as w2v
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report
import math

Citire dataset

In [15]:
df = pd.read_csv('sample_data/labeled_data.csv')
dataset_labels = df["class"]
dataset_tweets = df["tweet"]
print(dataset_tweets[:5])
print(dataset_labels[:5])

0    !!! RT @mayasolovely: As a woman you shouldn't...
1    !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2    !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3    !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4    !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
Name: tweet, dtype: object
0    2
1    1
2    1
3    1
4    1
Name: class, dtype: int64


Preprocesare

In [16]:
def preprocess(dataset):
    # eliminare @user
    tweets_no_user = [re.sub(r'@[\w]*', '', tweet) for tweet in dataset_tweets]

    # eliminare URL
    tweets_no_links = [re.sub(r'http\S+', '', tweet) for tweet in tweets_no_user]

    # eliminare hashtags
    tweets_no_hashtags = [re.sub(r'#[a-zA-Z0-9_]+','', tweet) for tweet in tweets_no_links]

    # eliminare &_
    tweets_no_ampersand = [re.sub(r'&[a-zA-Z0-9_]+','', tweet) for tweet in tweets_no_hashtags]

    # eliminare caractere speciale
    tweets_no_punct = [tweet.translate(str.maketrans('', '', string.punctuation)) for tweet in tweets_no_ampersand]

    # eliminare cifre
    tweets_no_digits = [re.sub(r'\d+', '', tweet) for tweet in tweets_no_punct]

    # transformare caractere in litere mici
    tweets_lower = [tweet.lower() for tweet in tweets_no_digits]

    # inlocuire secvente repetitive cu aceeasi litera cu 2 doua repetitii a acesteia
    # e.g. 'wooooow' -> 'woow'
    tweets_no_repetitions = [re.sub(r'(.)\1+', r'\1\1', tweet) for tweet in tweets_lower]

    # inlocuire multiple caractere albe cu un singur caracter alb
    tweets_no_spaces = [re.sub(r'\s+', ' ', tweet) for tweet in tweets_no_repetitions]

    # lematizare
    nlp = spacy.load('en_core_web_sm')
    all_words_lemma = [[word.lemma_ for word in nlp(sent)] for sent in tweets_no_spaces]

    # eliminare stopwords
    stop_words_spacy = nlp.Defaults.stop_words
    all_words_without_stops = [[word for word in sent if word not in stop_words_spacy] for sent in all_words_lemma]

    # eliminare cuvinte cu mai putin de doua caractere
    words_no_shortWords = [[word for word in tweet if len(word)>2] for tweet in all_words_without_stops]
    tweets_no_shortWords = [word for word in words_no_shortWords]

    return tweets_no_shortWords

In [17]:
preprocessed_dataset = preprocess(dataset_tweets)
preprocessed_dataset[:15]

[['woman', 'complain', 'clean', 'house', 'man', 'trash'],
 ['boy', 'dat', 'coldtyga', 'dwn', 'bad', 'cuffin', 'dat', 'hoe', 'place'],
 ['dawg', 'fuck', 'bitch', 'start', 'cry', 'confuse', 'shit'],
 ['look', 'like', 'tranny'],
 ['shit', 'hear', 'true', 'faker', 'bitch', 'tell'],
 ['shit', 'blow', 'meclaim', 'faithful', 'somebody', 'fuck', 'hoe'],
 ['sit', 'hate', 'bitch', 'shit'],
 ['cause', 'tired', 'big', 'bitch', 'come', 'skinny', 'girl'],
 ['bitch'],
 ['hobby', 'include', 'fight', 'mariam', 'bitch'],
 ['keek', 'bitch', 'curve', 'lol', 'walk', 'conversation', 'like', 'smh'],
 ['murda', 'gang', 'bitch', 'gang', 'land'],
 ['hoe', 'smoke', 'loser', 'yea'],
 ['bad', 'bitch', 'thing', 'like'],
 ['bitch']]

In [18]:
# salvare date preprocesate
pck = open("./pickles/preprocessed_dataset.pck", "wb")
pickle.dump({
    'preprocessed_dataset': preprocessed_dataset,
    'dataset_labels': dataset_labels
}, pck)

In [20]:
# incarcare date preprocesate
dataset_tweets = []
dataset_labels = []

pck = open("./pickles/preprocessed_dataset.pck", "rb")
result = pickle.load(pck)
dataset_tweets = result['preprocessed_dataset']
dataset_labels = result['dataset_labels']

Extragere features

In [21]:
# Label-uri originale
# 0 - hate speech 
# 1 - offensive language 
# 2 - neither

# Transformare New Labels
# 0 - hate speech & offensive language 
# 1 - normal speech

new_dataset_labels = []
for i in dataset_labels:
  if i == 1:
    new_dataset_labels.append(0)
  elif i == 2:
    new_dataset_labels.append(1)
  else:
    new_dataset_labels.append(i)

hate_speech = [data for data in new_dataset_labels if data == 0]
print('No. of hate speech tweets: ', len(hate_speech))
print('No. of normal speech tweets: ', len(new_dataset_labels) - len(hate_speech))

No. of hate speech tweets:  20620
No. of normal speech tweets:  4163


Vectorizare tweets

In [22]:
w = w2v(
    dataset_tweets,
    min_count=3,
    sg=1,  # folosire skip-gram
    window=7
)

emb_df = (
    pd.DataFrame(
        [w.wv.get_vector(str(n)) for n in w.wv.key_to_index],
        index=w.wv.key_to_index
    )
)

print("Snippet of vectorized dataset:")
emb_df.head()

Snippet of vectorized dataset:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
bitch,-0.253815,0.363812,0.082159,0.250147,0.144317,-0.289422,0.25678,0.570614,-0.387734,-0.183745,...,-0.012797,-0.013084,0.179199,0.052755,0.701132,0.023203,-0.260612,-0.138701,0.016032,0.231703
hoe,-0.235722,0.233416,0.030501,0.266416,0.244366,-0.395659,0.10144,0.444944,-0.364026,-0.217979,...,-0.040352,0.066378,0.190805,0.045151,0.574508,0.006402,-0.057906,-0.124974,-0.199931,0.184724
like,-0.153898,0.061575,-0.253794,-0.110359,0.124142,-0.20906,0.06355,0.518869,-0.42641,-0.081765,...,0.002953,0.258107,0.308775,-0.1391,0.486553,0.304533,0.130024,0.088588,0.171412,0.030224
pussy,-0.362276,0.19924,-0.030405,0.229053,0.186614,-0.283849,0.039137,0.390505,-0.212662,-0.269934,...,0.033874,0.068764,0.067731,-0.06614,0.721412,0.010455,0.08976,-0.27373,-0.179517,0.028367
fuck,-0.072211,0.071446,-0.162912,0.033709,0.084598,-0.340934,0.131103,0.515549,-0.251116,-0.126571,...,0.252354,0.161649,0.032366,0.085612,0.463884,0.111749,-0.046471,-0.014765,0.005055,0.066733


In [23]:
vectorized_sentences = [] 
vectorized_labels = []

for i, sentence in enumerate(dataset_tweets):
    sentence_vectors = []
    for word in sentence:
        try:
            word_vector = w.wv.get_vector(word)
            word_vector = word_vector[~np.isnan(word_vector)]
            sentence_vectors.append(np.mean(word_vector))
        except KeyError:
            # If a word is not in the vocabulary, ignore it
            pass
    try:
      vectorized_sentence = sentence_vectors
      vectorized_sentences.append(vectorized_sentence)
      vectorized_labels.append(new_dataset_labels[i])
    except ZeroDivisionError:
      pass

print("Word vectorization for every sentence:")
print(vectorized_sentences[:5])
print("Labels for the above sentences: ", vectorized_labels[:5])

Word vectorization for every sentence:
[[0.015738519, 0.00538588, 0.0068075173, 0.005401041, 0.012162268, -0.010240559], [0.0008741776, 0.003449244, 0.012828689, 0.005233385, 0.003449244, 0.0010927176, 0.004908672], [0.005889101, 0.0012296453, -0.0026917816, 0.0024945466, 0.00858413, 0.005145441, -0.008688549], [0.0024914788, 0.009757325, 0.0034156698], [-0.008688549, 0.006457069, 0.0073863934, 0.004063918, -0.0026917816, 0.010578543]]
Labels for the above sentences:  [1, 0, 0, 0, 0]


Adaugare padding

In [24]:
# padding every tweet to the longest sentence
padded_tweets = pad_sequence([torch.tensor(tweet) for tweet in vectorized_sentences], batch_first=True, padding_value=0)

Separarea datelor in date de antrenare, testare si validare

In [25]:
X_train, X_val_test, y_train, y_val_test = train_test_split(padded_tweets, vectorized_labels, test_size=0.30, random_state=42)
X_test, X_validation, y_test, y_validation = train_test_split(X_val_test, y_val_test, test_size=0.50, random_state=42)

print("No. of train tweets: ", len(X_train))
print("No. of test tweets: ", len(X_test))
print("No. of validation tweets: ", len(X_validation))

No. of train tweets:  17348
No. of test tweets:  3717
No. of validation tweets:  3718


Model

In [None]:
# Convertire date in tensori PyTorch

X_train_tensor = torch.tensor(X_train)
X_val_tensor = torch.tensor(X_validation)
X_test_tensor = torch.tensor(X_test)
y_train_tensor = torch.tensor(y_train)
y_val_tensor = torch.tensor(y_validation)
y_test_tensor = torch.tensor(y_test)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

In [27]:
# dataloaders
batch_size = 50  
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [35]:
# definire model RNN
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(x.shape[0], 1, -1)  # reshape la (batch_size, sequence_length, input_size)
        out, _ = self.rnn(x)
        out = out[:, -1, :]  # get the last output of the sequence 
        out = self.fc(out)
        return out

model = RNN(len(X_train[0]), 32, 2)  # clasificare binara (hate_speech & offensive_language vs. normal_speech)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

Antrenare si validare model

In [36]:
epochs_no = 100
for epoch in range(epochs_no):  # antrenare model in epoci 
    # antrenare
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

    # validare
    model.eval()
    
    val_losses = []
    for inputs, labels in val_loader:
        with torch.no_grad():
            val_outputs = model(inputs.float())
            val_loss = criterion(val_outputs, labels.long())
        val_losses.append(val_loss.item())
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {sum(val_losses)/len(val_losses)}')

Epoch 1, Loss: 0.4505457580089569, Val Loss: 0.453272541364034
Epoch 2, Loss: 0.45080772042274475, Val Loss: 0.4558661961555481
Epoch 3, Loss: 0.2904714345932007, Val Loss: 0.4557308852672577
Epoch 4, Loss: 0.5444226861000061, Val Loss: 0.4570409647623698
Epoch 5, Loss: 0.45028868317604065, Val Loss: 0.4542970883846283
Epoch 6, Loss: 0.40892985463142395, Val Loss: 0.45400386571884155
Epoch 7, Loss: 0.48473212122917175, Val Loss: 0.45503403425216676
Epoch 8, Loss: 0.38624823093414307, Val Loss: 0.45175528864065806
Epoch 9, Loss: 0.5118669271469116, Val Loss: 0.45222827792167664
Epoch 10, Loss: 0.3394610583782196, Val Loss: 0.4548978014787038
Epoch 11, Loss: 0.4445559084415436, Val Loss: 0.4510106384754181
Epoch 12, Loss: 0.5202204585075378, Val Loss: 0.4524947182337443
Epoch 13, Loss: 0.5469512343406677, Val Loss: 0.4514547634124756
Epoch 14, Loss: 0.4007445275783539, Val Loss: 0.4520122786362966
Epoch 15, Loss: 0.4480588436126709, Val Loss: 0.45232248028119404
Epoch 16, Loss: 0.3798673

Salvare model

In [37]:
def saveBestModel():
    torch.save(model.state_dict(), "model/best_model.pt")

In [38]:
saveBestModel()

In [32]:
def loadBestModel():
  try:
      model.load_state_dict(torch.load("model/best_model.pt"))
      return True
  except:
      return False

Testare model

In [None]:
loadBestModel()
model.eval()

correct = 0
total = 0
predictions = []
labelsTest = []
for inputs, labels in test_loader:
    with torch.no_grad():
        test_outputs = model(inputs.float())
        _, predicted = torch.max(test_outputs, 1)
        total += labels.size(0)
        predictions.extend(predicted)
        labelsTest.extend(labels.long())
        correct += (predicted == labels.long()).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy}')
print(classification_report(labelsTest, predictions))