In [25]:
import pandas as pd
import numpy as np
import re
import nltk
import subprocess
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import namedtuple
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [16]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [7]:
df = pd.read_csv('twitter_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [9]:
df.groupby("class").agg(hate_speech_count=("hate_speech", "mean"),
        offensive_language_count=("offensive_language", "mean"),
        neither_count=("neither", "mean"),).round(1)

Unnamed: 0_level_0,hate_speech_count,offensive_language_count,neither_count
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.3,0.8,0.1
1,0.2,3.0,0.1
2,0.1,0.3,2.8


In [10]:
df = df.drop(columns=["count", "hate_speech", "offensive_language", "neither"])
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet
0,0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [11]:
df["class"].value_counts(dropna=False, normalize=True).sort_index()

0    0.057701
1    0.774321
2    0.167978
Name: class, dtype: float64

In [12]:
df = df.assign(class_=df["class"].map({
            0:1,
            1:1,
            2:0 })).drop(columns=["class"]).rename(columns={"class_": "class"})


df["class"].value_counts(dropna=False, normalize=True).sort_index()

0    0.167978
1    0.832022
Name: class, dtype: float64

In [13]:
class_map = {
    0: "neutral", 
    1: "offensive and hate speech"
}

## Text Cleaning

In [14]:
def remove_urls(text, replacement_text=""):
    """Remove URLs from string."""
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub(replacement_text, text)


def remove_twitter_handles(text, replacement_text=""):
    """Remove twitter handles from string."""
    pattern = re.compile(r"@[\w]+")
    return pattern.sub(replacement_text, text)


def remove_twitter_rt(text, replacement_text=""):
    """Remove twitter RTs from string."""
    pattern = re.compile(r"^RT|\s+RT\s+")
    return pattern.sub(replacement_text, text)


def remove_alphanumerics(text, replacement_text=" "):
    """Remove alphanumerics from string but leave single quote be."""
    pattern = re.compile(r"[^A-Za-z0-9']+")
    return pattern.sub(replacement_text, text)


def remove_multiple_whitespaces(text, replacement_text=" "):
    """Remove multiple whitespaces from string."""
    pattern = re.compile(r"\s{2,}")
    return pattern.sub(replacement_text, text)


def decode_html_character_references(text):
    """Decode HTML chacarters in string, e.g. &#38; and &amp;."""
    import html
    return html.unescape(text)


df = (df
    .assign(
        tweet_clean=lambda df_: (df_["tweet"]
            .apply(decode_html_character_references)                                 
            .apply(remove_twitter_handles)
            .apply(remove_twitter_rt)
            .apply(remove_urls)
            .apply(remove_alphanumerics)
            .apply(remove_multiple_whitespaces)
            .str.strip()
        )
    )
)

df.tail()

Unnamed: 0.1,Unnamed: 0,tweet,class,tweet_clean
24778,25291,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1,you's a muthaf in lie right His TL is trash No...
24779,25292,"you've gone and broke the wrong heart baby, an...",0,you've gone and broke the wrong heart baby and...
24780,25294,young buck wanna eat!!.. dat nigguh like I ain...,1,young buck wanna eat dat nigguh like I aint fu...
24781,25295,youu got wild bitches tellin you lies,1,youu got wild bitches tellin you lies
24782,25296,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...,0,Ruffled Ntac Eileen Dahlia Beautiful color com...


## Pre-processing

In [17]:
def tokenize(doc):
    return word_tokenize(doc)


def remove_stopwords(doc):
    stops = set(stopwords.words("english"))
    stops.add("rt")
    return [token for token in doc if token not in stops]


def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"ain\'t", "are not", phrase)
    phrase = re.sub(r"shan\'t", "shall not", phrase)
    phrase = re.sub(r"ma\'am", "maam", phrase)
    phrase = re.sub(r"y\'all", "you all", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

lemmatizer = WordNetLemmatizer()

df = (df
    .assign(
        tweet_preprocessed=lambda df_: (
            df_["tweet_clean"]
            .str.lower()
            .apply(lambda doc: [decontracted(word) for word in doc.split(" ")])
            .apply(lambda doc: [lemmatizer.lemmatize(word) for word in doc])
            .apply(lambda doc: " ".join(doc))
            .apply(word_tokenize)
            .apply(remove_stopwords)
       )
    )
)

df.sample(5)

Unnamed: 0.1,Unnamed: 0,tweet,class,tweet_clean,tweet_preprocessed
19081,19506,RT @hawkblogger: The AFC was trash last season...,0,The AFC was trash last season especially the A...,"[afc, wa, trash, last, season, especially, afc..."
21289,21750,Tattoos are about expression not meaning. One ...,0,Tattoos are about expression not meaning One m...,"[tattoo, expression, meaning, one, man, trash,..."
9175,9431,Freak hoes got several,1,Freak hoes got several,"[freak, hoe, got, several]"
13894,14233,Power bomb dat hoe,1,Power bomb dat hoe,"[power, bomb, dat, hoe]"
15566,15931,RT @IINKY_chiefwuk: @ChiefKeef u a bitch short...,1,u a bitch shorty TURNUP OTF,"[u, bitch, shorty, turnup, otf]"


## Train Test Split

In [18]:
y = df["class"].values
x = df["tweet_preprocessed"].values
x = [(" ").join(doc) for  doc in x]

x = np.asarray(x)
y = np.asarray(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.1, random_state=8)

In [19]:
def return_score(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0.0)
    recall = recall_score(y_true, y_pred, average="weighted")
    matrix = confusion_matrix(y_true, y_pred, normalize="true")
    Scores = namedtuple("Scores", ["acc", "f1", "precision", "recall", "matrix"])
    return Scores(acc, f1, precision, recall, matrix)

In [26]:
xgb = XGBClassifier()
vectorizer = CountVectorizer()
ros = RandomOverSampler(random_state=42)

# resample
x_train_res, y_train_res = ros.fit_resample(x_train.reshape(-1, 1), y_train)
x_train_res = x_train_res.flatten()

# vectorize
vectorizer.fit(x_train_res)
x_train_vectorized = vectorizer.transform(x_train_res)
x_test_vectorized = vectorizer.transform(x_test)

# fit and predict
xgb.fit(x_train_vectorized, y_train_res)
y_pred = xgb.predict(x_test_vectorized)
scores = return_score(y_test, y_pred)

print(f"Acc: {scores.acc: .5f} | F1: {scores.f1: .5f} | Precision : {scores.precision: .5f} | Recall: {scores.recall: .5f}")

Acc:  0.94917 | F1:  0.95127 | Precision :  0.95774 | Recall:  0.94917
