In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

# Lookup from the leading letter of a part-of-speech tag to the WordNet POS constant
# handed to the lemmatizer; any key without an entry falls back to noun.
# NOTE(review): the only lookup visible in this notebook indexes with tag[0] (a single
# character), so the two-character 'AS' key looks unreachable -- confirm before relying on it.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {
        'J': wn.ADJ,
        'V': wn.VERB,
        'R': wn.ADV,
        'AS': wn.ADJ_SAT,
    },
)

# Input CSV of accounts to classify. Alternative inputs kept for reference:
# filepath = "finalized_8K_accounts.csv"
# filepath = "UNLABELED_accounts_emojis_replaced.csv"
filepath = "FINALIZED_Unlabeled_Data_ALL_Available_Descriptions_EMOJIS_UNCHANGED.csv"

# Label-column name and class codes; in the visible cells they are referenced only by
# the disabled filter further down.
hand_label = "hand.label"
government, academia, tourBiz = "gov", "acad", "tourbiz"

# Disabled: restrict a hand-labelled file to the five known classes.
# df = df[((df[hand_label] == 'media') | (df[hand_label] == tourBiz) |(df[hand_label] == academia) | (df[hand_label] == government) | (
#        df[hand_label] == 'other'))]

# Read the file and keep only the two columns used by the rest of the notebook.
df = pd.read_csv(filepath).loc[:, ['username', 'description']]

# Single WordNet lemmatizer instance shared by every preprocessing() call.
lemmatizer = WordNetLemmatizer()
# Tokens that preprocessing() passes through unchanged instead of lemmatizing.
words_not_changed = ['media']


def preprocessing(row):
    """Lower-case, tokenize and lemmatize one description value.

    Parameters
    ----------
    row : object
        Raw description. Any value whose ``str()`` is exactly ``"nan"`` is
        treated as missing.

    Returns
    -------
    str
        ``""`` for a missing value; otherwise ``str()`` of the list of lemmas
        (i.e. the textual form of a Python list, not a space-joined sentence).
        Tokens listed in ``words_not_changed`` are kept as-is, and any lemma
        containing a stand-alone run of digits is replaced by ``""``.
    """
    text = str(row)
    if text == "nan":
        # Missing description: hand back an empty string.
        return ""

    lemmas = []
    for token, tag in pos_tag(word_tokenize(text.lower())):
        if token in words_not_changed:
            lem = token
        else:
            # The first letter of the POS tag selects the WordNet part of speech.
            lem = lemmatizer.lemmatize(token, tag_map[tag[0]])
        # Blank out numeric tokens rather than dropping them from the list.
        lemmas.append("" if re.search(r'\b[0-9]+\b\s*', lem) else lem)
    return str(lemmas)


# Run preprocessing() over every description and store the result as a new column.
df['description_lemmatized'] = [preprocessing(text) for text in df['description']]


In [None]:
# Remove the rows whose description was missing (preprocessing() returns "" for those).
# The index is reset because the later prediction outputs are positional: the probability
# frame is built with a fresh 0..n-1 index, so leaving gaps here would make the
# column-wise pd.concat further down misalign rows and pad them with NaN.
df = df[df['description_lemmatized'] != ""].reset_index(drop=True)

In [2]:
# filename = 'SVM_BOW_unweighted_enhanced_model.pickle'
filename = 'SVM_BOW_unweighted_enhanced_model_full(1, 1).pickle'
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [3]:
# Model input: the lemmatized description strings.
X_test = df['description_lemmatized']

In [4]:
# Predicted class label for every entry of X_test.
bag_of_words_y_pred_test = loaded_model.predict(X_test)

In [5]:
# Display the predicted labels.
bag_of_words_y_pred_test

array(['other', 'other', 'other', ..., 'other', 'other', 'other'],
      dtype=object)

In [6]:
# Per-class probability estimates for every entry of X_test.
# NOTE(review): if the final estimator is an SVC fitted with probability=True, scikit-learn
# documents that predict_proba can disagree with predict -- confirm which one downstream
# consumers should trust.
pred_prob = loaded_model.predict_proba(X_test)

In [7]:
# Per-class probabilities as a frame. Reusing X_test's index keeps every probability row
# attached to its own account when this frame is concatenated to df column-wise, even if
# df's index is not 0..n-1 (e.g. after the empty-description filter cell has been run).
# NOTE(review): the column names assume loaded_model.classes_ is ordered
# acad, gov, media, other, tourbiz -- confirm against the fitted model.
pred_prob_df = pd.DataFrame(
    pred_prob,
    index=X_test.index,
    columns=['acad_prob', 'gov_prob', 'media_prob', 'other_prob', 'tourbiz_prob'],
)

In [8]:
# Number of predictions produced.
bag_of_words_y_pred_test.size

14599

In [9]:
# Attach the predicted label to each account (the lemmatized text column is kept),
# then put the per-class probability columns next to it.
df['hand.label_simplified'] = bag_of_words_y_pred_test
df1 = pd.concat((df, pred_prob_df), axis='columns')

In [10]:
# Display the combined frame of accounts, predicted labels and probabilities.
df1

Unnamed: 0,username,description,description_lemmatized,hand.label_simplified,acad_prob,gov_prob,media_prob,other_prob,tourbiz_prob
0,LeChatNoire4,#VOTE BLUE 2022 🌊🇺🇸🌊 #BuyARepublicanToday! no ...,"['#', 'vote', 'blue', '', '🌊🇺🇸🌊', '#', 'buyare...",other,0.007508,0.002201,0.031050,0.956922,0.002318
1,SethPlatt,Creator Collector Cultivator Art Web3 ENS AI S...,"['creator', 'collector', 'cultivator', 'art', ...",other,0.000589,0.000454,0.000105,0.998610,0.000241
2,eco_voice,"A non-partisan, independent, volunteer run org...","['a', 'non-partisan', ',', 'independent', ',',...",other,0.003138,0.011350,0.051886,0.932396,0.001230
3,Corn4Harvick,*Flo-Grown* 🇺🇸 🇺🇸 Jesus sent me back to straig...,"['*', 'flo-grown', '*', '🇺🇸', '🇺🇸', 'jesus', '...",other,0.002120,0.006770,0.004492,0.985610,0.001008
4,memorabiliaddy,Healthcare Professional * Dad to Two * MSU Alu...,"['healthcare', 'professional', '*', 'dad', 'to...",other,0.002768,0.003740,0.014110,0.978101,0.001281
...,...,...,...,...,...,...,...,...,...
14594,insonifier,,,other,0.011411,0.002842,0.050542,0.931984,0.003222
14595,johntfox,Madeleine & Marin's Dad | Gin Enthusiast | Twe...,"['madeleine', '&', 'marin', ""'s"", 'dad', '|', ...",other,0.009045,0.003301,0.023513,0.962778,0.001362
14596,SeGreene,Cranky former nurse and current plant patholog...,"['cranky', 'former', 'nurse', 'and', 'current'...",other,0.024843,0.002913,0.023220,0.948067,0.000957
14597,CherylLasse,"Passionate about the environment, science and ...","['passionate', 'about', 'the', 'environment', ...",other,0.183329,0.002451,0.020496,0.792738,0.000985


In [11]:
# Write the labelled accounts and their probabilities to disk, omitting the index column.
df1.to_csv(
    'SVM_BOW_unweighted_UNLABELED_PREDICTED_accounts_W_PROBABILITIES_emojis_unchanged.csv',
    index=False,
)