### Imports

In [1]:
### Reproducibility
import numpy as np
import pandas as pd
### Single seed value for the notebook; it is applied to NumPy's global RNG below
random_seed = 42
np.random.seed(random_seed)

### Plotting
import matplotlib.pyplot as plt

### Feature Extractors
from sklearn.feature_extraction.text import TfidfVectorizer

### Models
from sklearn.svm import LinearSVC

### Pipelining
from sklearn.pipeline import Pipeline

### Hyperparameter tuning & Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

### Persistence
from joblib import dump, load
import json

### nlp
import emoji
import regex as re
import spacy
import nltk as n
from nltk.corpus import wordnet

 
### Shared NLP resources used by the pre-processing helpers below
lemmatizer = n.stem.WordNetLemmatizer()

### English stop words plus the two placeholder tags that clean_text inserts
### for URLs and @mentions, so those tags are filtered out with the stop words
stop_words = set(n.corpus.stopwords.words('english')) | {"url_tag", "person_tag"}

### spaCy English pipeline
nlp = spacy.load("en_core_web_sm")

### Pre-Processing

In [2]:
### Convert POS treebank to POS wordnet (Borrowed from StackOverflow)
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (including
    an empty string) falls back to noun.

    Parameters
    ----------
    treebank_tag : str
        POS tag as produced by the tagger, e.g. ``"JJ"`` or ``"VBD"``.

    Returns
    -------
    The corresponding ``wordnet.ADJ`` / ``VERB`` / ``NOUN`` / ``ADV`` value.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    ### Unrecognised tags are treated as nouns
    return wordnet.NOUN

In [3]:
### Clean Input Text
def clean_text(text):
    """Normalise one raw post before tokenisation.

    Steps, in order:

    1. Emoji are replaced by their textual names, padded with spaces.
    2. In a single left-to-right regex pass: URLs become ``url_tag``,
       @mentions become ``person_tag``, ``#`` signs are dropped (the hashtag
       word itself is kept) and every remaining underscore becomes a space,
       so emoji names and snake_case hashtags are split into separate words.
    3. Runs of whitespace are collapsed to one space and the ends stripped.

    URLs and mentions are matched *before* underscores are turned into
    spaces. Replacing underscores first would cut ``@john_doe`` or
    ``https://t.co/a_b`` at the underscore and leave stray fragments behind.

    The text is NOT lowercased here; case is left untouched.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text.
    """
    ### Replacement for each named alternative of the pattern below
    replacements = {
        "url": "url_tag",
        "mention": "person_tag",
        "hash": "",
        "underscore": " ",
    }
    ### Alternatives are tried in this order at each position, so a URL or a
    ### mention swallows any '#' / '_' it contains before those are handled.
    pattern = (
        r"(?P<url>https?:\S*)"
        r"|(?P<mention>@[A-Za-z0-9_]+)"
        r"|(?P<hash>#)"
        r"|(?P<underscore>_)"
    )

    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(pattern, lambda match: replacements[match.lastgroup], text)
    return re.sub(r"\s+", " ", text).strip()

In [4]:
### Tokenise, Handle abbreviations, POS tag & Lemmatise, Remove Stop Words

### Load the abbreviation -> expansion lookup consumed by get_tokens.
### JSON text is UTF-8 by specification, so decode it explicitly instead of
### relying on the platform's default encoding.
with open('abbreviation.json', 'r', encoding='utf-8') as openfile:
    abbre = json.load(openfile)

def get_tokens(text, abbre):
    """Turn cleaned text into a flat list of ``lemma_POSTAG`` tokens.

    For every sentence of ``text``:

    1. the sentence is word-tokenised;
    2. each word found in ``abbre`` is replaced by the whitespace-separated
       words of its expansion;
    3. the resulting words are POS-tagged and lemmatised using that tag;
    4. the lemma is lowercased and dropped if it is a stop word;
    5. surviving lemmas are emitted as ``<lowercased lemma>_<POS tag>``.

    The stop-word test is applied to the *lowercased* lemma. Testing the
    cased lemma let capitalised stop words such as "The" slip through the
    filter and show up as ``the_DT``.

    Parameters
    ----------
    text : str
        Text to tokenise.
    abbre : dict
        Maps an abbreviation to its expansion string. The lookup uses the
        token exactly as tokenised (case-sensitive).
        NOTE(review): confirm the key casing in the abbreviation file matches
        how abbreviations appear in the data.

    Returns
    -------
    list of str
        Tokens of all sentences, in order of appearance.
    """
    all_tokens = []
    for sentence in n.sent_tokenize(text):
        ### Expand abbreviations before tagging so the tagger sees real words
        expanded = []
        for word in n.word_tokenize(sentence):
            if word in abbre:
                expanded.extend(abbre[word].strip().split())
            else:
                expanded.append(word)

        for word, tag in n.pos_tag(expanded):
            lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag)).lower()
            if lemma not in stop_words:
                all_tokens.append(lemma + "_" + tag)
    return all_tokens

In [5]:
### Read Data
### NOTE(review): "PATH_TO_CSV.csv" looks like a placeholder -- point it at the
### real dataset. The next cell reads an "org_text" column from this frame.
data = pd.read_csv("PATH_TO_CSV.csv")

In [6]:
### Cleaned text derived from the original text
data["text"] = data["org_text"].apply(clean_text)
### Per-row list of lemma_POSTAG tokens; abbre is forwarded as the second argument
data["tokens"] = data["text"].apply(get_tokens, args=(abbre,))
### Tokens re-joined into one space-separated string per row
data["tokens_combined"] = data["tokens"].apply(" ".join)