In [1]:
# Import libraries
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer, BERTTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle


# ---- Shared objects used by the rest of the notebook ----
lemmatizer = WordNetLemmatizer()
nb_clf = MultinomialNB()

# Lookup from the first character of a POS tag to a WordNet part-of-speech
# constant; any character without an entry falls back to noun.
# NOTE(review): preprocessing() indexes this map with tag[0], a single
# character, so the two-character 'AS' key looks unreachable -- confirm.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map.update({
    'J': wn.ADJ,
    'V': wn.VERB,
    'R': wn.ADV,
    'AS': wn.ADJ_SAT,
})

# ---- Input data ----
# Alternative input files used previously:
#   "finalized_8K_accounts.csv"
#   "finalized_8K_accounts_emojis_replaced.csv"
#   "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_REPLACED.csv"
filepath = "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_UNCHANGED.csv"

# Column holding the hand-assigned class label.
hand_label = "hand.label_simplified"

df = pd.read_csv(filepath)

# Keep only rows whose label is one of the five classes below (this removes
# the "-int" -- international, non-English -- descriptions) and only the
# columns that are relevant downstream.
df = df.loc[
    df[hand_label].isin(['media', 'tourbiz', 'acad', 'gov', 'other']),
    ['username', 'description', hand_label],
]

# Tokens that preprocessing() must pass through without lemmatizing.
words_not_changed = ['media']

result = {}
n_gram_range = (1, 1)


def preprocessing(row):
    """Lower-case, tokenize and lemmatize a single description.

    Each token is lemmatized with the WordNet part of speech looked up in
    ``tag_map`` from the first character of its POS tag; tokens listed in
    ``words_not_changed`` are kept verbatim. Tokens in which the regex finds a
    standalone run of digits (e.g. ``"2021"`` or the ``19`` in ``"covid-19"``)
    are dropped.

    Parameters
    ----------
    row : object
        One cell of the description column. Anything whose ``str()`` is
        ``"nan"`` (e.g. a float NaN for a missing value) is treated as missing.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces, or ``""`` when the
        description is missing or nothing survives the filtering.
    """
    # A missing value stringifies to "nan"; there is nothing to lemmatize.
    if str(row) == "nan":
        return ""

    tokens = word_tokenize(str(row).lower())

    lemmas = []
    for token, tag in pos_tag(tokens):
        if token in words_not_changed:
            lemma = token
        else:
            # Lemmatization depends on the part of speech of the token.
            lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])

        # Skip number tokens entirely instead of leaving empty placeholders.
        if re.search(r'\b[0-9]+\b\s*', lemma):
            continue
        lemmas.append(lemma)

    # Join into plain text. str(list) would embed brackets, quotes and escape
    # sequences in the result and would never be "" for an empty token list,
    # which defeats the later "remove empty descriptions" filter.
    return " ".join(lemmas)


# Run preprocessing() on every description and store the result in a new column.
df = df.assign(description_lemmatized=df['description'].apply(preprocessing))

# Drop the rows for which preprocessing() produced an empty string.
df = df.loc[df['description_lemmatized'] != ""]

In [2]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the raw `description` text (not the lemmatized column) of the rows
# that are still in df; model.encode() takes the sentences as a plain list.
embeddings = model.encode(
    df["description"].tolist()
)

In [3]:
filename = 'BERT_df.pickle'
# Persist the filtered DataFrame. The context manager guarantees the file is
# flushed and closed even if pickling fails (a bare open() leaks the handle).
with open(filename, "wb") as pickle_file:
    pickle.dump(df, pickle_file)

In [4]:
filename = 'BERT_embeddings.pickle'
# Persist the embeddings. The context manager guarantees the file is flushed
# and closed even if pickling fails (a bare open() leaks the handle).
with open(filename, "wb") as pickle_file:
    pickle.dump(embeddings, pickle_file)

In [5]:
# Export the filtered DataFrame as CSV, without writing the index column.
df.to_csv(path_or_buf='BERT_df.csv', index=False)

In [8]:
# Write the embeddings to a comma-delimited text file with NumPy.
np.savetxt(fname="BERT_embeddings.csv", X=embeddings, delimiter=",")

In [4]:
# Correlation coefficients between the rows of `embeddings` (np.corrcoef
# treats each row as one variable by default), i.e. an N x N matrix for N rows.
# The former bare `embeddings` statement above this call was a no-op: only the
# last expression of a cell is displayed.
np.corrcoef(embeddings)

array([[ 1.        ,  0.14470369,  0.40523018, ...,  0.1269892 ,
         0.09070657,  0.10037136],
       [ 0.14470369,  1.        ,  0.19039135, ...,  0.10073964,
         0.07227026, -0.04743898],
       [ 0.40523018,  0.19039135,  1.        , ...,  0.16159729,
         0.10458214,  0.0318919 ],
       ...,
       [ 0.1269892 ,  0.10073964,  0.16159729, ...,  1.        ,
         0.11394472,  0.2495047 ],
       [ 0.09070657,  0.07227026,  0.10458214, ...,  0.11394472,
         1.        ,  0.02802738],
       [ 0.10037136, -0.04743898,  0.0318919 , ...,  0.2495047 ,
         0.02802738,  1.        ]])

In [9]:
# Repeated here (also imported in the first cell) so this cell resolves the
# scaler classes on its own.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Train and evaluate a Multinomial Naive Bayes classifier on the sentence
# embeddings: grid-search the hyper-parameters with 10-fold CV on the training
# split, report CV and held-out test metrics, then pickle both the
# train-only model and a model refit on all data.
#
# NOTE(review): the recorded output of this cell shows almost every sample
# predicted as 'other' (recall 0.00 for acad / gov / tourbiz). MultinomialNB is
# designed for count-like features, so it may be a poor match for min-max
# scaled dense embeddings, and plain accuracy rewards the majority class.
# Consider another classifier and/or a macro-averaged score -- to be confirmed.

# Not referenced again in this cell (the pipeline uses MinMaxScaler); kept in
# case other cells rely on the name.
scaler = StandardScaler()

X = embeddings
y_labels = df[hand_label]

# Hold out 20% of the data as a test set, stratified on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

# Left over from the bag-of-words variant; embeddings do not use n-grams, so
# the value only serves as a suffix in result keys and file names.
n_gram_ranges = [(1, 1)]

result = {}
result_cv = {}

for n_gram_range in n_gram_ranges:
    # MultinomialNB rejects negative feature values at fit time, hence the
    # min-max scaling step in front of the classifier.
    nb_BERT_pipeline = Pipeline([
        ('normalize', MinMaxScaler()),
        ('classifier', nb_clf),
    ])

    print("\n")
    print(n_gram_range)

    param_BERT_grid = [
        {
            'classifier__alpha': [1.0e-10, 0.5, 2.0, 5.0, 10.0],
            'classifier__fit_prior': [True, False],
        }
    ]

    # With an integer `cv` and a classifier, scikit-learn stratifies the folds
    # by default.
    grid_search_BERT = GridSearchCV(nb_BERT_pipeline, param_BERT_grid, cv=10, scoring='accuracy', verbose=1)
    grid_search_BERT.fit(X_train, y_train)
    nb_BERT_best_hyperparameters = grid_search_BERT.best_params_

    print("\n")
    print("NAIVE BAYES UNWEIGHTED ENHANCED BEST PARAMS:", nb_BERT_best_hyperparameters)

    nb_BERT_best_model = grid_search_BERT.best_estimator_

    # Refit the pipeline on the training split with the selected parameters and
    # collect out-of-fold predictions for the CV report.
    nb_BERT_pipeline.set_params(**nb_BERT_best_hyperparameters)
    nb_BERT_pipeline.fit(X_train, y_train)
    y_pred_BERT_cross_validation = cross_val_predict(nb_BERT_pipeline, X_train, y_train, cv=10)
    print(y_pred_BERT_cross_validation)

    y_pred_BERT_test = nb_BERT_pipeline.predict(X_test)

    # Row-normalized confusion matrix: each row sums to 1 over the predictions
    # for one true class.
    cm_BERT = confusion_matrix(y_train, y_pred_BERT_cross_validation, normalize='true')

    print("\n\n")
    print("CV confusion matrix of predictions:")
    print()
    print(cm_BERT)

    cv_key = "NB_BERT_unweighted_enhanced_predictions_CV" + str(n_gram_range)
    result_cv[cv_key] = metrics.classification_report(y_train, y_pred_BERT_cross_validation)

    print("\n\n")
    print("CV metrics summary:")
    print(result_cv[cv_key])

    test_key = "NB_BERT_unweighted_enhanced_predictions_testSet" + str(n_gram_range)
    result[test_key] = metrics.classification_report(y_test, y_pred_BERT_test)

    print("\n\n")
    print("Test set metrics summary:")
    print()
    print(result[test_key])

    print("\n\n")

    # Save the model trained on the training split only. The context manager
    # closes the file handle, which a bare open() inside pickle.dump() leaks.
    filename = 'NB_BERT_unweighted_enhanced_model' + str(n_gram_range) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(nb_BERT_pipeline, model_file)

    # Refit on all data (train + test) for the final model; the pipeline
    # still carries the best hyper-parameters set above.
    nb_BERT_pipeline.fit(X, y_labels)

    filename = 'NB_BERT_unweighted_enhanced_model_full' + str(n_gram_range) + '.pickle'
    with open(filename, "wb") as model_file:
        pickle.dump(nb_BERT_pipeline, model_file)
    




(1, 1)
Fitting 10 folds for each of 10 candidates, totalling 100 fits


NAIVE BAYES UNWEIGHTED ENHANCED BEST PARAMS: {'classifier__alpha': 1e-10, 'classifier__fit_prior': True}
['other' 'other' 'other' ... 'other' 'other' 'other']



CV confusion matrix of predictions:

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.21632025e-01 8.78367975e-01
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 9.27029533e-04 9.99072970e-01
  0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))





CV metrics summary:
              precision    recall  f1-score   support

        acad       0.00      0.00      0.00       430
         gov       0.00      0.00      0.00       103
       media       0.96      0.12      0.22      1299
       other       0.80      1.00      0.89      7551
     tourbiz       0.00      0.00      0.00       155

    accuracy                           0.81      9538
   macro avg       0.35      0.22      0.22      9538
weighted avg       0.77      0.81      0.74      9538




Test set metrics summary:

              precision    recall  f1-score   support

        acad       0.00      0.00      0.00       108
         gov       0.00      0.00      0.00        25
       media       0.97      0.09      0.16       325
       other       0.80      1.00      0.89      1888
     tourbiz       0.00      0.00      0.00        39

    accuracy                           0.80      2385
   macro avg       0.35      0.22      0.21      2385
weighted avg       0.77 

  _warn_prf(average, modifier, msg_start, len(result))
