## REFERENCES:

https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py - emoticons, their meanings and unicodes
https://www.kaggle.com/code/sndpkirwai/cyberbully-detection-text-classification - code example

## UNZIPPING DATASETS

In [1]:
zip_path = r'archive.zip'
path = r'archive/'
import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(path)

## LIBRARIES

In [25]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report

import torch
from torch.utils.data import TensorDataset, random_split

import spacy
nlp = spacy.load("en_core_web_sm")

import time

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv(r'archive/twitter_parsed_dataset.csv')
df

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165E+017,5.74948705591165E+017,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393E+017,5.71917888690393E+017,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601E+017,3.90255841338601E+017,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916E+017,5.68208850655916E+017,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373E+017,5.75596338802373E+017,#mkr No No No No No No,none,0.0
...,...,...,...,...,...
16846,5.75606766236475E+017,5.75606766236475E+017,"Feeling so sorry for the girls, they should be...",none,0.0
16847,5.72333822886326E+017,5.72333822886326E+017,#MKR 'pretty good dishes we're happy with' - O...,none,0.0
16848,5.72326950057845E+017,5.72326950057845E+017,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0
16849,5.74799612642357E+017,5.74799612642357E+017,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16851 entries, 0 to 16850
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       16851 non-null  object 
 1   id          16850 non-null  object 
 2   Text        16850 non-null  object 
 3   Annotation  16848 non-null  object 
 4   oh_label    16848 non-null  float64
dtypes: float64(1), object(4)
memory usage: 658.4+ KB


In [5]:
df.columns = df.columns.str.lower()
df["text"] = df["text"].astype(str)
df.drop(["index","id"], axis=1, inplace=True)

In [6]:
df

Unnamed: 0,text,annotation,oh_label
0,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,#mkr No No No No No No,none,0.0
...,...,...,...
16846,"Feeling so sorry for the girls, they should be...",none,0.0
16847,#MKR 'pretty good dishes we're happy with' - O...,none,0.0
16848,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0
16849,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0


## DATA CLEANING

In [7]:
def read_dict_from_file(filename, file):
    filename = {}
    with open(file, 'r', encoding = 'utf8') as f:
        for line in f:
            key, value = line.strip().split('\t')
            filename[key.strip()] = value.strip()
    return filename

In [8]:
emo = {}
uni = {}
read_dict_from_file(emo, 'emo.txt')
read_dict_from_file(uni, 'uni.txt')

{':1st_place_medal:': '🥇',
 ':2nd_place_medal:': '🥈',
 ':3rd_place_medal:': '🥉',
 ':AB_button_(blood_type):': '🆎',
 ':ATM_sign:': '🏧',
 ':A_button_(blood_type):': '🅰',
 ':Afghanistan:': '🇦🇫',
 ':Albania:': '🇦🇱',
 ':Algeria:': '🇩🇿',
 ':American_Samoa:': '🇦🇸',
 ':Andorra:': '🇦🇩',
 ':Angola:': '🇦🇴',
 ':Anguilla:': '🇦🇮',
 ':Antarctica:': '🇦🇶',
 ':Antigua_&_Barbuda:': '🇦🇬',
 ':Aquarius:': '♒',
 ':Argentina:': '🇦🇷',
 ':Aries:': '♈',
 ':Armenia:': '🇦🇲',
 ':Aruba:': '🇦🇼',
 ':Ascension_Island:': '🇦🇨',
 ':Australia:': '🇦🇺',
 ':Austria:': '🇦🇹',
 ':Azerbaijan:': '🇦🇿',
 ':BACK_arrow:': '🔙',
 ':B_button_(blood_type):': '🅱',
 ':Bahamas:': '🇧🇸',
 ':Bahrain:': '🇧🇭',
 ':Bangladesh:': '🇧🇩',
 ':Barbados:': '🇧🇧',
 ':Belarus:': '🇧🇾',
 ':Belgium:': '🇧🇪',
 ':Belize:': '🇧🇿',
 ':Benin:': '🇧🇯',
 ':Bermuda:': '🇧🇲',
 ':Bhutan:': '🇧🇹',
 ':Bolivia:': '🇧🇴',
 ':Bosnia_&_Herzegovina:': '🇧🇦',
 ':Botswana:': '🇧🇼',
 ':Bouvet_Island:': '🇧🇻',
 ':Brazil:': '🇧🇷',
 ':British_Indian_Ocean_Territory:': '🇮🇴',
 ':British_Virgin_Is

In [9]:
def convert_emoticons(text):
    for emot in emo:
        text = re.sub(u'('+emot+u')', "_".join(emo[emot].replace(",","").split()), text)
    return text

def convert_emojis(text):
    for emot in uni:
        text = re.sub(r'('+emot+')', "_".join(uni[emot].replace(",","").replace(":","").split()), text)
    return text

with open('abbr.txt', 'r') as f:
    abbrs = f.read()

In [10]:
abbr_dict = {}
abbr_list = []
for line in abbrs.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        abbr_list.append(cw)
        abbr_dict[cw] = cw_expanded
abbr_list = set(abbr_list)

def abbr_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in abbr_list:
            new_text.append(abbr_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [11]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [12]:
def cleaning(data):
    data = convert_emoticons(data)
    data = convert_emojis(data)
    data = abbr_conversion(data)
    data = data.lower()
    data = re.sub(r"\S*https?:\S*", '', data)
    data = re.sub('\w*\d\w', '', data)
    data = re.sub(r'\n', ' ', data)
    data = re.sub("[''""...“”‘’…@]", '', data)
    data =" ".join([word for word in str(data).split() if word not in stop_words])
    data = lemmatize_words(data)
    return data

In [13]:
df['cleaned'] = df['text'].apply(lambda x: cleaning(x))
# df['cleaned'] = df['cleaned'].apply(lambda x: nltk.word_tokenize(x))
df

Unnamed: 0,text,annotation,oh_label,cleaned
0,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,halalflaws biebervalue greenlinerzjm read cont...
1,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,shreyabafna3 idiot claim people try stop becom...
2,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,"rt mooseoftorment call sexist, go auto place, ..."
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,"sipsquirrelx wrong, isi follow example mohamme..."
4,#mkr No No No No No No,none,0.0,#mkr
...,...,...,...,...
16846,"Feeling so sorry for the girls, they should be...",none,0.0,"feel sorry girls, safe kat andre go home #mkr"
16847,#MKR 'pretty good dishes we're happy with' - O...,none,0.0,"#mkr pretty good dish happy - ok, well im neve..."
16848,RT @colonelkickhead: Deconstructed lemon tart!...,none,0.0,rt colonelkickhead: deconstruct lemon tart!can...
16849,@versacezaynx @nyazpolitics @greenlinerzjm You...,none,0.0,versacezaynx nyazpolitics greenlinerzjm stupid...


In [19]:
df.drop(["text"], axis=1, inplace=True)
df.dropna(inplace=True)
df.to_csv('data.csv', index = False)

## LOGREG

In [21]:
X = df['cleaned']
y = df['oh_label']

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
clf = LogisticRegression(max_iter = 1000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [23]:
y_pred = clf.predict(X_test)
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [26]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.86      0.93      0.89      2306
         1.0       0.82      0.67      0.74      1064

    accuracy                           0.85      3370
   macro avg       0.84      0.80      0.82      3370
weighted avg       0.85      0.85      0.84      3370

