In [1]:
import numpy as np 
import pandas as pd 

import os
from pathlib import Path

import string
import nltk
from nltk.corpus import stopwords

import scipy.io
import scipy.linalg
from scipy.sparse import csr_matrix, vstack, lil_matrix 
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import plotly.express as px
import plotly.figure_factory as ff
from yellowbrick.text import TSNEVisualizer

In [16]:
!apt-get install python3.10

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libpython3.10-minimal libpython3.10-stdlib python3.10-minimal
Suggested packages:
  python3.10-venv binfmt-support
The following NEW packages will be installed:
  libpython3.10-minimal libpython3.10-stdlib python3.10 python3.10-minimal
0 upgraded, 4 newly installed, 0 to remove and 49 not upgraded.
Need to get 5,086 kB of archives.
After this operation, 19.5 MB of additional disk space will be used.
Get:1 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 libpython3.10-minimal amd64 3.10.5-1+bionic1 [823 kB]
Get:2 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/main amd64 python3.10-minimal amd64 3.10.5-1+bionic1 [1,967 kB]
Get:3 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic/ma

In [19]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def get_data():
    data_path = "/content/drive/MyDrive/Hime project/labeled_data.csv"
    df = pd.read_csv(data_path, index_col=0) 
    df = df.sample(frac=1).reset_index(drop=True)  
    return df 

In [3]:
tdf = get_data()
tdf.head(10)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,3,0,1,u can tell yo bitch we some yung rich homies
1,3,0,3,0,1,' Somebody come look at this niggah right here...
2,3,0,0,3,2,RT @Herman_NYRBlog: Yankees shouldn't even bot...
3,3,0,3,0,1,Subtweettweettweettweet cuz skinny niggah but ...
4,3,0,3,0,1,I hate pumpkin flavored items you other white ...
5,3,0,3,0,1,Ugh I'm such a little bitch &#128514;
6,6,0,6,0,1,@LadyVodkax keep them around so new bitches kn...
7,3,1,2,0,1,Friend zone ass nigguh
8,6,0,6,0,1,&#8220;@_TheCrownedHead: what if I tell y'all ...
9,3,0,3,0,1,"Aint #WCW jus ""bitches i wan fck"" day?"


In [4]:
# Remove stop words, special chars 
# stem the word tokens
# re.sub(r'^https?:\/\/.*[\r\n]*', '', text)
def clean_tweet(sent):
    stemmer = nltk.PorterStemmer()        
    tknzr = nltk.RegexpTokenizer(r'[a-zA-Z0-9]+')

    exclp = list(string.punctuation)     
    exclc = [
        "'re", "n't", "'m", "'s", "n't", "'s", 
        "``", "''", "'ve", "'m", "'ll", "'ve", 
        "...", "http", "https"]    
    sw = set(stopwords.words("english") + exclp + exclc)    

    tokens = tknzr.tokenize(sent.lower())
    words = [stemmer.stem(token) for token in tokens if not token in sw]
    return " ".join(words)     

In [5]:
def clean_tweet_column(df):
    df["tweet"] = df["tweet"].apply(lambda t: clean_tweet(t))
    return df

In [10]:
tdf = clean_tweet_column(tdf)
tdf.head(10)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,3,0,1,u tell yo bitch yung rich homi
1,3,0,3,0,1,somebodi come look niggah right kevin hart voi...
2,3,0,0,3,2,rt herman nyrblog yanke even bother show bosto...
3,3,0,3,0,1,subtweettweettweettweet cuz skinni niggah dick...
4,3,0,3,0,1,hate pumpkin flavor item white bitch crazi nas...
5,3,0,3,0,1,ugh littl bitch 128514
6,6,0,6,0,1,ladyvodkax keep around new bitch know could ne...
7,3,1,2,0,1,friend zone ass nigguh
8,6,0,6,0,1,8220 thecrownedhead tell tri turn bayou classi...
9,3,0,3,0,1,aint wcw ju bitch wan fck day


In [11]:
def get_summary(df):   

    content = df["tweet"].values        
    word_tok = [word.lower() for item in content for word in nltk.word_tokenize(item)]    
    st_words = set(word_tok)   
    
    fact = {
        "TotalCount": len(content),
        "TotalWords": len(word_tok),        
        "TotalUniqueWords": len(st_words),
        "MeanWordsPerTweet": len(word_tok) / len(content),
    }

    return fact, df.describe()

In [20]:
f, s = get_summary(tdf)
s

Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0
mean,3.243473,0.280515,2.413711,0.549247,1.110277
std,0.88306,0.631851,1.399459,1.113299,0.462089
min,3.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,2.0,0.0,1.0
50%,3.0,0.0,3.0,0.0,1.0
75%,3.0,0.0,3.0,0.0,1.0
max,9.0,7.0,9.0,9.0,2.0


In [21]:
f

{'MeanWordsPerTweet': 9.648791510309486,
 'TotalCount': 24783,
 'TotalUniqueWords': 31134,
 'TotalWords': 239126}

In [22]:
def show_wfreq_plot(df, label, labelDescr = ""):   
    xdf = df[df["class"] == label]
    content = xdf["tweet"].values        
    word_tok = [word.lower() for item in content for word in nltk.word_tokenize(item)]    
    st_words = set(word_tok)   
    freq_dist = nltk.FreqDist(word_tok)    
    ls_freq = [(word, frequency) for word, frequency in freq_dist.most_common(20)]
    twdf = pd.DataFrame(ls_freq, columns=["Word", "Frequency"])
    tfig = px.bar(twdf, x="Word", y="Frequency", title="Top 20 most frequent words - " + labelDescr)
    tfig.show()  

In [23]:
show_wfreq_plot(tdf, 0, "hate speech")

In [24]:
show_wfreq_plot(tdf, 1, "offensive")

In [25]:
show_wfreq_plot(tdf, 2, "neither")

In [26]:
def show_tsne_plot(df):   
    tknzr = nltk.RegexpTokenizer(r'[a-zA-Z0-9]+')
    sents = df["tweet"].values
    labels = np.array(df["class"].values)
    vcrz = TfidfVectorizer(lowercase=True,stop_words='english',                      
                        analyzer="word",        
                        max_features=5000,                
                        tokenizer = tknzr.tokenize)     
    sents_vals = vcrz.fit_transform(sents)
    tsne = TSNEVisualizer(labels=[0,1,2])
    tsne.fit(sents_vals, labels)
    tsne.show()

In [27]:
show_tsne_plot(tdf)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



KeyboardInterrupt: ignored

In [28]:
def get_model_tfidf(df, fset=["c", "w"], max_feats=5000, ngram_range = (1,3)):   
    tknzr = nltk.RegexpTokenizer(r'[a-zA-Z0-9]+')
    sents = df["tweet"].values
    labels = np.array(df["class"].values)
    features = []

    if "w" in fset:
        wvcrz = TfidfVectorizer(lowercase=True,stop_words='english',
                            ngram_range = ngram_range,
                            analyzer="word",        
                            max_features=max_feats,                
                            tokenizer = tknzr.tokenize)        
        features.append(('wvect_features', Pipeline([("wvect", wvcrz)])))  
        
    if "c" in fset:
        cvcrz = TfidfVectorizer(lowercase=True,stop_words='english',
                            ngram_range = ngram_range,
                            analyzer="char",        
                            max_features=max_feats,                
                            tokenizer = tknzr.tokenize)        
        features.append(('cvect_features', Pipeline([("cvect", cvcrz)])))         
         
    merger = FeatureUnion(features)                    
    sents_vals = merger.fit_transform(sents)    
    sents_vals_lil = lil_matrix(sents_vals) 
    
    return sents_vals_lil, labels

In [29]:
def show_sim_plot(df):   
    
    lil_mat, labels = get_model_tfidf(df, fset=["w"])
    hate_vals = lil_mat[labels == 0,:]
    off_vals = lil_mat[labels == 1,:] 
    neit_vals = lil_mat[labels == 2,:]    

    sim_matrices = [
        ("Hate", 'In Class', cosine_similarity(hate_vals, hate_vals)),
        ("Offensive", 'In Class', cosine_similarity(off_vals, off_vals)),
        ("Neither",'In Class', cosine_similarity(neit_vals, neit_vals)),
        ("Hate v Offensive",'Other Class', cosine_similarity(hate_vals, off_vals)),
        ("Offensive v Neither",'Other Class', cosine_similarity(off_vals, neit_vals)),
        ("Neither v Hate",'Other Class', cosine_similarity(hate_vals, neit_vals)),        
    ]

    scores = []

    for lab, group, score_matrix in sim_matrices:
        sdf = pd.DataFrame(score_matrix)
        sdf.replace(0, np.nan, inplace=True)
        sdf.replace(1, np.nan, inplace=True)
        sdf["max"] = sdf.max(axis=1)
        score = sdf["max"].mean()
        scores.append((lab, group, score))

    simdf = pd.DataFrame(scores, columns=["Label", "Grouping", "Score"])
    tfig = px.bar(simdf[simdf["Grouping"] == "In Class"], x="Label", y="Score", color="Label", title="In class similarities")
    tfig.show()      
    
    tfig = px.bar(simdf[simdf["Grouping"] == "Other Class"], x="Label", y="Score", color="Label", title="Out of class similarities")
    tfig.show()     

In [None]:
show_sim_plot(tdf)

In [31]:
def get_train_test(df, fset=["c", "w"], tsize=0.25):
    
    x_values, y_values = get_model_tfidf(df, fset=fset, max_feats=10000, ngram_range = (1,3))
    
    sel_mod = SelectFromModel(
        LogisticRegression(penalty='l2', solver='saga', multi_class='multinomial', random_state=1), 
        threshold=-np.inf)
    sel_feats = sel_mod.fit_transform(x_values, y_values) 
    
    x_train, x_test, y_train, y_test = train_test_split(
        sel_feats, y_values, test_size=tsize, random_state=1, stratify=y_values
    )      
    return x_train, x_test, y_train, y_test  

In [32]:
def run_logreg(df):     
    x_train, x_test, y_train, y_test = get_train_test(df)     
    
    clf = LogisticRegression(penalty="none", random_state=0, solver='saga')      
    clf.fit(x_train, y_train)  
    predicted = clf.predict(x_test)           
    return clf, predicted, y_test

def run_linsvm(df):     
    x_train, x_test, y_train, y_test = get_train_test(df)     
    
    clf = LinearSVC()       
    clf.fit(x_train, y_train)  
    predicted = clf.predict(x_test)           
    return clf, predicted, y_test

def run_mlptron(df):     
    x_train, x_test, y_train, y_test = get_train_test(df)     
    
    clf = MLPClassifier()       
    clf.fit(x_train, y_train)  
    predicted = clf.predict(x_test)           
    return clf, predicted, y_test

In [33]:
def show_results(predicted, y_test, labels=[0,1,2],  label_descr=["Hate", "Offensive", "Neither"]):
    clsr = classification_report(y_test, predicted, target_names=labels, output_dict=True)
    cm = confusion_matrix(y_test, predicted, labels=labels)   
    
    cr_df = pd.DataFrame(clsr).transpose()    
    print(cr_df)
    
    fig = ff.create_annotated_heatmap(cm, x=label_descr, y=label_descr)
    fig.update_layout(title_text='Confusion Matrix')
    fig.show()

In [34]:
clf, pred, y_test = run_logreg(tdf)
show_results(pred, y_test)

              precision    recall  f1-score      support
0              0.410359  0.288515  0.338816   357.000000
1              0.924536  0.944769  0.934543  4798.000000
2              0.840691  0.841499  0.841095  1041.000000
accuracy       0.889606  0.889606  0.889606     0.889606
macro avg      0.725195  0.691594  0.704818  6196.000000
weighted avg   0.880823  0.889606  0.884518  6196.000000



The max_iter was reached which means the coef_ did not converge



In [35]:
clf, pred, y_test = run_linsvm(tdf)
show_results(pred, y_test)

              precision    recall  f1-score      support
0              0.492537  0.277311  0.354839   357.000000
1              0.925844  0.954981  0.940187  4798.000000
2              0.843212  0.847262  0.845232  1041.000000
accuracy       0.897837  0.897837  0.897837     0.897837
macro avg      0.753864  0.693185  0.713419  6196.000000
weighted avg   0.886994  0.897837  0.890507  6196.000000


In [36]:
clf, pred, y_test = run_mlptron(tdf)
show_results(pred, y_test)

              precision    recall  f1-score      support
0              0.411321  0.305322  0.350482   357.000000
1              0.917680  0.943310  0.930319  4798.000000
2              0.829830  0.796350  0.812745  1041.000000
accuracy       0.881859  0.881859  0.881859     0.881859
macro avg      0.719610  0.681661  0.697849  6196.000000
weighted avg   0.873745  0.881859  0.877156  6196.000000
