In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

# Lookup from a POS-tag prefix to the WordNet part-of-speech constant handed
# to the lemmatizer. Any prefix that is not listed falls back to NOUN.
# NOTE(review): preprocessing() indexes this table with tag[0], a single
# character, so the two-character 'AS' key is never reached from there.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {
        'J': wn.ADJ,
        'V': wn.VERB,
        'R': wn.ADV,
        'AS': wn.ADJ_SAT,
    },
)

# Earlier versions of the training file, kept for reference; only the last
# (uncommented) path is actually loaded.
# filepath = "finalized_8K_accounts.csv"
# filepath = "finalized_8K_accounts_emojis_replaced.csv"
# filepath = "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_REPLACED.csv"
filepath = "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_REPLACED_w_DICT_LABELS.csv"

# Name of the column holding the hand-assigned class label.
hand_label = "hand.label_simplified"

df = pd.read_csv(filepath)

# Keep only rows whose hand label is one of the five simplified classes.
# This removes all the "-int" (international, non-English) descriptions.
# .copy() makes the result an independent frame: a new column is assigned to
# `df` right after preprocessing() is defined, and doing that on a
# boolean-mask slice of the loaded frame raises SettingWithCopyWarning.
df = df[df[hand_label].isin(['media', 'tourbiz', 'acad', 'gov', 'other'])].copy()

# df = df[['username', 'description', hand_label, 'max.class']]  # keep only relevant columns

lemmatizer = WordNetLemmatizer()
# Tokens listed here bypass the lemmatizer and are kept verbatim.
words_not_changed = ['media']
# Lemmatization (preprocessing)
def preprocessing(row):
    """Lower-case, tokenize and lemmatize one description.

    Parameters
    ----------
    row : object
        A single cell of the description column. It is converted with
        ``str()``; a value whose string form is exactly ``"nan"`` is
        treated as missing.

    Returns
    -------
    str
        ``""`` for a missing value. Otherwise the ``str()`` of the list of
        lemmas, i.e. a Python list literal such as ``"['a', 'b']"``.

    Notes
    -----
    * Each token is lemmatized using the WordNet POS looked up from the first
      character of its POS tag; tokens in ``words_not_changed`` are kept as is.
    * A lemma in which the regex finds a standalone run of digits is replaced
      by ``""``; its slot stays in the list.
    * NOTE(review): because the result is the repr of a list, only missing
      values produce ``""``. Input that yields no tokens produces ``"[]"``.
      Confirm that downstream code expects this format.
    """
    text = str(row)
    if text == "nan":
        return ""

    cleaned = []
    for token, tag in pos_tag(word_tokenize(text.lower())):
        if token in words_not_changed:
            lem = token
        else:
            # Lemmatize according to the token's part of speech.
            lem = lemmatizer.lemmatize(token, tag_map[tag[0]])
        # Blank out numeric lemmas but keep their position in the list.
        cleaned.append("" if re.search(r'\b[0-9]+\b\s*', lem) else lem)
    return str(cleaned)


# Add the lemmatized form of every description as a new column.
df = df.assign(description_lemmatized=df['description'].apply(preprocessing))

# Drop rows whose lemmatized description is the empty string.
# NOTE(review): preprocessing() returns "" only for missing descriptions;
# anything else comes back as the repr of a list, so only those rows are
# removed here.
df = df.loc[df['description_lemmatized'].ne("")]

# Scaler instance; it is not fitted or applied anywhere in this cell.
scaler = StandardScaler()

In [47]:
# Score the 'single.label' column against the hand labels over the whole
# filtered frame (no train/test split at this point).
y_test = df.loc[:, 'hand.label_simplified']
DICT_y_pred_test = df.loc[:, 'single.label']

print(metrics.classification_report(y_true=y_test, y_pred=DICT_y_pred_test))

              precision    recall  f1-score   support

        acad       0.58      0.55      0.56       538
         gov       0.17      0.28      0.21       128
       media       0.59      0.83      0.69      1624
       other       0.93      0.86      0.89      9439
     tourbiz       0.46      0.29      0.36       194

    accuracy                           0.83     11923
   macro avg       0.54      0.56      0.54     11923
weighted avg       0.85      0.83      0.83     11923



In [49]:

# X is the 'single.label' column itself (a label, not a feature matrix), so
# the split below pairs each hand label with its 'single.label' value.
X = df.loc[:, 'single.label']
y_labels = df.loc[:, 'hand.label_simplified']

# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_labels,
)


In [50]:
# Same report as for the full frame, restricted to the training partition:
# hand labels (y_train) versus the 'single.label' values (X_train).
print(metrics.classification_report(y_true=y_train, y_pred=X_train))

              precision    recall  f1-score   support

        acad       0.59      0.54      0.57       430
         gov       0.19      0.30      0.23       103
       media       0.59      0.84      0.70      1299
       other       0.93      0.86      0.89      7551
     tourbiz       0.43      0.30      0.35       155

    accuracy                           0.83      9538
   macro avg       0.55      0.57      0.55      9538
weighted avg       0.85      0.83      0.84      9538

