In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

# Lookup from the leading letter of a part-of-speech tag to the WordNet POS
# constant handed to the lemmatizer. Any key not listed falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map.update({
    'J': wn.ADJ,
    'V': wn.VERB,
    'R': wn.ADV,
    # NOTE(review): preprocessing() indexes this map with tag[0], a single
    # character, so this two-character key looks unreachable -- confirm.
    'AS': wn.ADJ_SAT,
})

# Input file for this run. Alternative inputs used previously:
#   "finalized_8K_accounts.csv"
#   "UNLABELED_accounts_emojis_replaced.csv"
filepath = "FINALIZED_Unlabeled_Data_ALL_Available_Descriptions_EMOJIS_UNCHANGED.csv"

# Name of the hand-label column and the short codes of three of its classes.
hand_label, government, academia, tourBiz = "hand.label", "gov", "acad", "tourbiz"

# Load the file and keep only the relevant columns.
# A filter on df[hand_label] that restricted rows to the 'media', tourBiz,
# academia, government and 'other' labels used to be applied here; it is
# disabled in this run.
df = pd.read_csv(filepath)[['username', 'description']]

lemmatizer = WordNetLemmatizer()
words_not_changed = ['media']  # tokens that preprocessing() leaves un-lemmatized


def preprocessing(row):
    """Lower-case, tokenize and lemmatize a single account description.

    Parameters
    ----------
    row : object
        One value from the ``description`` column. ``None``, any value whose
        ``str()`` is ``"nan"``, and blank / whitespace-only text are all
        treated as an empty description.

    Returns
    -------
    str
        ``""`` for an empty description. Otherwise the ``str()`` of the list
        of lemmatized tokens. A token containing a standalone run of digits
        is replaced by ``""`` (not dropped), so the list keeps one entry per
        token.
    """
    if row is None or str(row) == "nan":
        return ""

    text = str(row).lower()
    if not text.strip():
        # Nothing to tokenize. Return "" rather than the "[]" that an empty
        # token list would stringify to, so callers filtering on `!= ""`
        # also drop blank descriptions.
        return ""

    # Lemmatize each token according to its part of speech; tokens listed in
    # words_not_changed are passed through untouched.
    lemma = [
        token if token in words_not_changed
        else lemmatizer.lemmatize(token, tag_map[tag[0]])
        for token, tag in pos_tag(word_tokenize(text))
    ]
    # Blank out tokens that contain a standalone number.
    lemma = ["" if re.search(r'\b[0-9]+\b\s*', lem) else lem for lem in lemma]
    return str(lemma)


df['description_lemmatized'] = df['description'].apply(preprocessing)


In [38]:
# Preview only: compare the frame's shape before and after dropping empty
# descriptions. df itself is not modified in this cell.
print(df.shape)
has_description = df['description_lemmatized'] != ""
print(df[has_description].shape)

(14599, 3)
(13242, 3)


In [39]:
# Keep only the rows whose lemmatized description is non-empty.
has_description = df['description_lemmatized'] != ""
df = df[has_description]
print(df.shape)

(13242, 3)


In [41]:
# Re-indexing the remaining observations: the row filter above keeps the
# original index labels, and the later pd.concat(..., axis=1) with the
# probability frame aligns rows by index.
df = df.reset_index(drop=True)

In [42]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, selected by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The raw descriptions are encoded here, not the lemmatized column.
descriptions = df['description'].tolist()
embeddings = model.encode(descriptions)

In [43]:
# Model file for this run. Previously used:
#   'SVM_BOW_unweighted_enhanced_model.pickle'
filename = 'SVM_BERT_unweighted_enhanced_model_full(1, 2).pickle'

# SECURITY: unpickling can execute arbitrary code contained in the file, so
# only load model files that come from a trusted source.
# The `with` block closes the file handle even if unpickling fails.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

X_test = embeddings

# Predicted label per row of X_test. The "bag_of_words" prefix is kept because
# later cells refer to this name, even though the input here is embeddings.
bag_of_words_y_pred_test = loaded_model.predict(X_test)

# Per-class probabilities for the same rows.
pred_prob = loaded_model.predict_proba(X_test)



In [44]:
# Show the predicted labels next to the class probabilities.
predicted_labels = pd.DataFrame(bag_of_words_y_pred_test)
class_probabilities = pd.DataFrame(pred_prob)
pd.concat([predicted_labels, class_probabilities], axis=1)

Unnamed: 0,0,0.1,1,2,3,4
0,other,0.000585,0.001464,0.001155,0.992605,0.004191
1,other,0.004342,0.000639,0.003219,0.982458,0.009342
2,other,0.005846,0.015085,0.049567,0.928598,0.000904
3,other,0.001891,0.001057,0.002662,0.994019,0.000370
4,other,0.006474,0.001169,0.005799,0.983785,0.002772
...,...,...,...,...,...,...
13237,other,0.001823,0.010809,0.064314,0.785241,0.137813
13238,other,0.007914,0.001513,0.048221,0.941749,0.000602
13239,other,0.492738,0.001781,0.002790,0.502624,0.000067
13240,other,0.012996,0.000962,0.000605,0.984303,0.001134


In [45]:
# Shape check of the probability array; the next cell labels it with five
# probability columns, so the second dimension is expected to be 5.
pred_prob.shape

(13242, 5)

In [46]:
# NOTE(review): these names assume the columns of pred_prob are ordered
# acad, gov, media, other, tourbiz -- confirm against loaded_model.classes_.
probability_columns = ['acad_prob', 'gov_prob', 'media_prob', 'other_prob', 'tourbiz_prob']
pred_prob_df = pd.DataFrame(pred_prob, columns=probability_columns)

# Attach the predicted label to df, then place the per-class probabilities
# alongside it (rows are matched by index).
df['hand.label_simplified'] = bag_of_words_y_pred_test
df1 = pd.concat([df, pred_prob_df], axis=1)

df1.shape

(13242, 9)

In [47]:
# Display the combined frame: account columns, predicted label, and the
# per-class probability columns.
df1

Unnamed: 0,username,description,description_lemmatized,hand.label_simplified,acad_prob,gov_prob,media_prob,other_prob,tourbiz_prob
0,LeChatNoire4,#VOTE BLUE 2022 🌊🇺🇸🌊 #BuyARepublicanToday! no ...,"['#', 'vote', 'blue', '', '🌊🇺🇸🌊', '#', 'buyare...",other,0.000585,0.001464,0.001155,0.992605,0.004191
1,SethPlatt,Creator Collector Cultivator Art Web3 ENS AI S...,"['creator', 'collector', 'cultivator', 'art', ...",other,0.004342,0.000639,0.003219,0.982458,0.009342
2,eco_voice,"A non-partisan, independent, volunteer run org...","['a', 'non-partisan', ',', 'independent', ',',...",other,0.005846,0.015085,0.049567,0.928598,0.000904
3,Corn4Harvick,*Flo-Grown* 🇺🇸 🇺🇸 Jesus sent me back to straig...,"['*', 'flo-grown', '*', '🇺🇸', '🇺🇸', 'jesus', '...",other,0.001891,0.001057,0.002662,0.994019,0.000370
4,memorabiliaddy,Healthcare Professional * Dad to Two * MSU Alu...,"['healthcare', 'professional', '*', 'dad', 'to...",other,0.006474,0.001169,0.005799,0.983785,0.002772
...,...,...,...,...,...,...,...,...,...
13237,EvergreenZephyr,"Wichita, Kansas, United (sic) States. Parody a...","['wichita', ',', 'kansa', ',', 'united', '(', ...",other,0.001823,0.010809,0.064314,0.785241,0.137813
13238,johntfox,Madeleine & Marin's Dad | Gin Enthusiast | Twe...,"['madeleine', '&', 'marin', ""'s"", 'dad', '|', ...",other,0.007914,0.001513,0.048221,0.941749,0.000602
13239,SeGreene,Cranky former nurse and current plant patholog...,"['cranky', 'former', 'nurse', 'and', 'current'...",other,0.492738,0.001781,0.002790,0.502624,0.000067
13240,CherylLasse,"Passionate about the environment, science and ...","['passionate', 'about', 'the', 'environment', ...",other,0.012996,0.000962,0.000605,0.984303,0.001134


In [48]:
# Write the predictions to disk; index=False leaves the row index out of the file.
output_path = r'SVM_BERT_unweighted_UNLABELED_PREDICTED_accounts_W_PROBABILITIES_emojis_unchanged.csv'
df1.to_csv(output_path, index=False)