In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler



In [3]:
# Preprocess the Data: from naivebayes2 (Melvin Adkins work)

lemmatizer = WordNetLemmatizer()

filepath = "finalized_8K_accounts.csv"
hand_label = "hand.label"
government = "gov"
academia = "acad"

df = pd.read_csv(filepath)

df = df[((df[hand_label]=='media') | (df[hand_label]== academia) | (df[hand_label]==government) | (df[hand_label]=='other' ))]


df = df[['username','description',hand_label]] # keep only relevant columns


# Preprocessing step - lemmatization on description column
words_not_changed = ['media']

def preprocessing(row):

    before = []
    after = []
    if str(row) == "nan":
        row = ""
    else:
        row = str(row).lower()          # lowercase (so that upper and lowercase words are treated the same)
        row = word_tokenize(row)        # tokenize  (to perform lemmitization
        row = [lemmatizer.lemmatize(word) if word not in words_not_changed else word for word in row]   # lemmatize

    return str(row)                     # convert back to string


df['description_lemmatized'] = df['description'].apply(preprocessing)


print(df.head())
print()

print(df.head())
print()
print('Number of labels per category:')
print(df[hand_label].value_counts())
print()

       username                                        description hand.label  \
0   Casper30214  Army Civil Service(Retired);Military Ops Resea...      other   
1         enckj  Former EPA Regional Administrator, President o...       acad   
2  nuclearkelly  Scientist at ORNL, DOE Early Career Awardee, F...       acad   
3       stukhan  Dad. Director of the Australian Graduate Schoo...       acad   
4       PatMag7  Podcasting about Feminist Participatory Action...       acad   

                              description_lemmatized  
0  ['army', 'civil', 'service', '(', 'retired', '...  
1  ['former', 'epa', 'regional', 'administrator',...  
2  ['scientist', 'at', 'ornl', ',', 'doe', 'early...  
3  ['dad', '.', 'director', 'of', 'the', 'austral...  
4  ['podcasting', 'about', 'feminist', 'participa...  

       username                                        description hand.label  \
0   Casper30214  Army Civil Service(Retired);Military Ops Resea...      other   
1         enckj  For

In [26]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re


tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tag_map['AS'] = wn.ADJ_SAT


# filepath = "finalized_8K_accounts.csv"
filepath = "finalized_8K_accounts_emojis_replaced.csv"
hand_label = "hand.label"
government = "gov"
academia = "acad"

df = pd.read_csv(filepath)

df = df[((df[hand_label]=='media') | (df[hand_label]== academia) | (df[hand_label]==government) | (df[hand_label]=='other' ))]


df = df[['username','description',hand_label]] # keep only relevant columns

lemmatizer = WordNetLemmatizer()
words_not_changed = ['media']

def preprocessing(row):
    if str(row) == "nan":
        lemma = ""
    else:
        row = str(row).lower()
        row = word_tokenize(row)        # tokenize
        lemma = [lemmatizer.lemmatize(token, tag_map[tag[0]]) if token not in words_not_changed else token for token, tag in pos_tag(row)] # lemmatization, depending on part-of-speech
        lemma = ["" if re.search(r'\b[0-9]+\b\s*', lem) else lem for lem in lemma]  # removing
    return str(lemma)

df['description_lemmatized'] = df['description'].apply(preprocessing)


print(df.head())
print()

print(df.head())
print()
print('Number of labels per category:')
print(df[hand_label].value_counts())
print()

       username                                        description hand.label  \
0   Casper30214  Army Civil Service(Retired);Military Ops Resea...      other   
1         enckj  Former EPA Regional Administrator, President o...       acad   
2  nuclearkelly  Scientist at ORNL, DOE Early Career Awardee, F...       acad   
3       stukhan  Dad. Director of the Australian Graduate Schoo...       acad   
4       PatMag7  Podcasting about Feminist Participatory Action...       acad   

                              description_lemmatized  
0  ['army', 'civil', 'service', '(', 'retire', ')...  
1  ['former', 'epa', 'regional', 'administrator',...  
2  ['scientist', 'at', 'ornl', ',', 'doe', 'early...  
3  ['dad', '.', 'director', 'of', 'the', 'austral...  
4  ['podcast', 'about', 'feminist', 'participator...  

       username                                        description hand.label  \
0   Casper30214  Army Civil Service(Retired);Military Ops Resea...      other   
1         enckj  For

In [27]:
# Enhanced data
filepath = "finalized_BIASED_accounts_ONLY_NON_OTHER.csv"


df2 = pd.read_csv(filepath)
df2 = df2[((df2[hand_label]=='media') | (df2[hand_label]== academia) | (df2[hand_label]==government) | (df2[hand_label]=='other' ))]

df2 = df2[['username','description',hand_label]] # keep only relevant columns


df2['description_lemmatized'] = df2['description'].apply(preprocessing)

print(df2.head())
print()
print('Number of labels per category:')
print(df2[hand_label].value_counts())
print()


          username                                        description  \
0  conserveturtles  STC is the oldest sea turtle conservation orga...   
1  WhySharksMatter  Research associate at @ASU @ASUinDC studying s...   
2     BenDiamondFL  Husband, Father and proud Floridian | Florida ...   
3    NancyRichmond  Speaker 🗣️ | University Professor 📚  | Social ...   
4       stemdotorg  Science 🔬 Technology 🛰 Engineering ⚙️ Math 📐 E...   

  hand.label                             description_lemmatized  
0       acad  ['stc', 'be', 'the', 'old', 'sea', 'turtle', '...  
1       acad  ['research', 'associate', 'at', '@', 'asu', '@...  
2       acad  ['husband', ',', 'father', 'and', 'proud', 'fl...  
3       acad  ['speaker', '🗣️', '|', 'university', 'professo...  
4       acad  ['science', '🔬', 'technology', '🛰', 'engineeri...  

Number of labels per category:
hand.label
media    887
acad     221
gov       42
Name: count, dtype: int64



In [38]:
# split my data into training, and test sets
scaler = StandardScaler()

X = df['description_lemmatized']
y_labels = df[hand_label]

X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.2, random_state=42, stratify=y_labels)

In [41]:
X2 = df2['description_lemmatized']
Y2 = df2[hand_label]

X_train = pd.concat([X_train, X2])
y_train = pd.concat([y_train, Y2])

In [42]:
# print("SHAPE:", X_train.shape)

SHAPE: (7431,)


In [43]:
# print(X.value_counts())

description_lemmatized
                                                                                                                                                                                                                                                       1358
['teacher']                                                                                                                                                                                                                                               3
['she/her']                                                                                                                                                                                                                                               3
['']                                                                                                                                                                                                                         

In [44]:
# print(X_train.value_counts())

description_lemmatized
                                                                                                                                                                                                                                                                                                                                                                  1092
['she/her']                                                                                                                                                                                                                                                                                                                                                          3
['.']                                                                                                                                                                                                                                                              

In [9]:
# print(y_test.value_counts())

hand.label
other    1447
media      80
acad       37
gov         7
Name: count, dtype: int64


In [10]:
# nan_counts = X_train.isna().sum()
# print(nan_counts)

0


In [11]:
# nan_counts = y_train.isna().sum()
# print(nan_counts)

0


In [12]:
tfidf_transformer = TfidfTransformer()
count_vectorizer = CountVectorizer(stop_words="english",ngram_range=(1,2) )

In [13]:

tfidf_pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('transformer', tfidf_transformer),
    ('normalize', StandardScaler(with_mean=False)),
    ('classifier', SVC())
])

tfidf_param_grid = [
    {
        ''
        'vectorizer__min_df': [0.0],
        'transformer__use_idf': [True],
        # 'normalize__with_mean': [False],
        'classifier__C': [1.0e-10, 0.5, 3.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'classifier__class_weight': ["balanced"]
    }
]

In [None]:
count_vectorizer = CountVectorizer(stop_words="english",ngram_range=(1,2) )

In [14]:
# vectorizer = CountVectorizer(ngram_range=(3, 3))
# X = vectorizer.fit_transform(X_train)
# print(len(vectorizer.get_feature_names_out()))
"""
14877 1 1

52688 2 2

61509 3 3
"""

'\n14877 1 1\n\n52688 2 2\n\n61509 3 3\n'

In [15]:
bag_of_words_pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('normalize', StandardScaler(with_mean=False)),
    ('classifier', SVC())
])

bag_of_words_param_grid = [
    {
        'vectorizer__min_df': [0.0],
        # 'normalize__with_mean': [False],
        'classifier__C': [1.0e-10, 0.5, 3.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'classifier__class_weight': ["balanced"]
    }
]


In [None]:
# Grid Search - TF-IDF
tfidf_grid_search = GridSearchCV(estimator=tfidf_pipeline, param_grid=tfidf_param_grid, cv=5, scoring='accuracy', verbose=1, error_score="raise")

tfidf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [None]:
# Bag Of Words
bag_of_words_grid_search = GridSearchCV(estimator=bag_of_words_pipeline, param_grid=bag_of_words_param_grid, cv=5, scoring='accuracy', verbose=1,error_score="raise" )

bag_of_words_grid_search.fit(X_train, y_train)

In [None]:
tfidf_best_hyperparameters = tfidf_grid_search.best_params_
# print("Best TF-IDF SVM ACCURACY:", tfidf_grid_search.best_score_)
# print("Best TF-IDF SVM Hyperparameters:", tfidf_best_hyperparameters)

# print()

In [None]:
bag_of_words_best_hyperparameters = bag_of_words_grid_search.best_params_
# print("Best TF-IDF Bag of Words ACCURACY:", bag_of_words_grid_search.best_score_)
# print("Best TF-IDF Bag of Words Hyperparameters:", bag_of_words_best_hyperparameters)

# print()

In [None]:
# print(tfidf_grid_search.cv_results_)

In [None]:
tfidf_best_SVM_model = tfidf_grid_search.best_estimator_
tfidf_pipeline.set_params(**tfidf_grid_search.best_params_)
tfidf_pipeline.fit(X_train, y_train)


In [None]:
bag_of_words_best_SVM_model = bag_of_words_grid_search.best_estimator_
bag_of_words_pipeline.set_params(**bag_of_words_grid_search.best_params_)
bag_of_words_pipeline.fit(X_train, y_train)

In [None]:
# Cross validate
y_pred_tfidf = cross_val_predict(tfidf_best_SVM_model, X_train, y_train, cv=5)

In [None]:
# Cross validate

y_pred_bag_of_words = cross_val_predict(bag_of_words_best_SVM_model, X_train, y_train, cv=5)


In [None]:
# cm = confusion_matrix(y_train, y_pred_tfidf, normalize='true')

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['academia', 'government', 'media', 'other'])
# disp.plot()

# plt.title("TF-IDF Train")
# plt.show()

In [None]:
# cm = confusion_matrix(y_train, y_pred_bag_of_words, normalize='true')

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['academia', 'government', 'media', 'other'])
#disp.plot()

# plt.title("Bag Of Words Train")
# plt.show()

In [None]:
# print("TF-IDF Classification Report:")
# print(metrics.classification_report(y_train, y_pred_tfidf))

In [None]:
# print("Bag of Words Classification Report:")
# print(metrics.classification_report(y_train, y_pred_bag_of_words))

In [None]:
tfidf_y_pred_test = tfidf_pipeline.predict(X_test)

In [None]:
bag_of_words_y_pred_test = bag_of_words_pipeline.predict(X_test)

In [None]:
print("TF-IDF Classification Report TEST:")
print(metrics.classification_report(y_test, tfidf_y_pred_test))
print()

print("Bag of Words Classification Report TEST:")
print(metrics.classification_report(y_test, bag_of_words_y_pred_test))
print()

-----------------------------------------------------------------------------------------------

In [None]:
cm = confusion_matrix(y_test, tfidf_y_pred_test, normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['academia', 'government', 'media', 'other'])
disp.plot()

plt.title("TF-IDF TEST SET")
plt.show()

In [None]:
cm = confusion_matrix(y_test, bag_of_words_y_pred_test, normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['academia', 'government', 'media', 'other'])
disp.plot()

plt.title("Bag of Words TEST SET")
plt.show()

In [None]:
print("TF-IDF Classification Report TEST:")
print(metrics.classification_report(y_test, tfidf_y_pred_test))
print()

In [None]:
print("Bag of Words Classification Report TEST:")
print(metrics.classification_report(y_test, bag_of_words_y_pred_test))
print()

In [None]:
# TODO:
"""
For both SVM and NaiveBayes :

Run this on Bigrams:
ngram_range(1,1) (2,2) (3,3)

"""

print("Done.")

In [None]:
"""

No Weights:
---------------------------------------------------------------
(1,1)
Bag of Words Classification Report TEST:
              precision    recall  f1-score   support

        acad       0.88      0.19      0.31        37
         gov       0.00      0.00      0.00         7
       media       0.79      0.46      0.58        80
       other       0.95      0.99      0.97      1447

    accuracy                           0.94      1571
   macro avg       0.65      0.41      0.47      1571
weighted avg       0.93      0.94      0.93      1571

TF-IDF Classification Report TEST:
              precision    recall  f1-score   support

        acad       0.73      0.22      0.33        37
         gov       0.00      0.00      0.00         7
       media       0.88      0.44      0.58        80
       other       0.95      1.00      0.97      1447

    accuracy                           0.94      1571
   macro avg       0.64      0.41      0.47      1571
weighted avg       0.93      0.94      0.93      1571

---------------------------------------------------------------
ngram range Unigrams and Bigrams : (1,2)

Bag of Words Classification Report TEST:
              precision    recall  f1-score   support

        acad       1.00      0.03      0.05        37
         gov       1.00      0.14      0.25         7
       media       1.00      0.12      0.22        80
       other       0.93      1.00      0.96      1447

    accuracy                           0.93      1571
   macro avg       0.98      0.32      0.37      1571
weighted avg       0.93      0.93      0.90      1571

TF-IDF Classification Report TEST:
              precision    recall  f1-score   support

        acad       0.80      0.11      0.19        37
         gov       1.00      0.14      0.25         7
       media       0.96      0.30      0.46        80
       other       0.94      1.00      0.97      1447

    accuracy                           0.94      1571
   macro avg       0.92      0.39      0.47      1571
weighted avg       0.94      0.94      0.92      1571

---------------------------------------------------------------
(2,2)
Bag of Words Classification Report:
              precision    recall  f1-score   support

        acad       0.00      0.00      0.00       150
         gov       0.00      0.00      0.00        29
       media       1.00      0.06      0.11       317
       other       0.92      1.00      0.96      5785

    accuracy                           0.92      6281
   macro avg       0.48      0.26      0.27      6281
weighted avg       0.90      0.92      0.89      6281

TF-IDF Classification Report:
              precision    recall  f1-score   support

        acad       0.47      0.09      0.16       150
         gov       0.00      0.00      0.00        29
       media       0.76      0.28      0.41       317
       other       0.94      0.99      0.96      5785

    accuracy                           0.93      6281
   macro avg       0.54      0.34      0.38      6281
weighted avg       0.91      0.93      0.91      6281


---------------------------------------------------------------



Weighted:

---------------------------------------------------------------
(1,1)
Bag of Words Classification Report:
              precision    recall  f1-score   support

        acad       0.65      0.34      0.45       150
         gov       0.25      0.14      0.18        29
       media       0.77      0.47      0.58       317
       other       0.95      0.99      0.97      5785

    accuracy                           0.94      6281
   macro avg       0.66      0.48      0.54      6281
weighted avg       0.93      0.94      0.93      6281


TF-IDF Classification Report:
              precision    recall  f1-score   support

        acad       0.72      0.26      0.38       150
         gov       0.36      0.14      0.20        29
       media       0.82      0.47      0.60       317
       other       0.95      0.99      0.97      5785

    accuracy                           0.94      6281
   macro avg       0.71      0.47      0.54      6281
weighted avg       0.93      0.94      0.93      6281

---------------------------------------------------------------
(2,2)

Bag of Words Classification Report:
              precision    recall  f1-score   support

        acad       0.00      0.00      0.00       150
         gov       0.00      0.00      0.00        29
       media       1.00      0.06      0.11       317
       other       0.92      1.00      0.96      5785

    accuracy                           0.92      6281
   macro avg       0.48      0.26      0.27      6281
weighted avg       0.90      0.92      0.89      6281


TF-IDF Classification Report:
              precision    recall  f1-score   support

        acad       0.47      0.09      0.16       150
         gov       0.00      0.00      0.00        29
       media       0.76      0.28      0.41       317
       other       0.94      0.99      0.96      5785

    accuracy                           0.93      6281
   macro avg       0.54      0.34      0.38      6281
weighted avg       0.91      0.93      0.91      6281
---------------------------------------------------------------

compare F-1 scores for specifics:
based on just f1- accuracy:
the one with the best F-1 scores in weighted SVM
the one with the best F-1 scores in unweighted SVM



what is best NB or SVM
TF-IDF VS BOW

uni vs bi grams
uni weighted vs uni unweighted


"""

print("Done.")