In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
import re
import json
from sklearn.metrics import confusion_matrix
import pickle

tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tag_map['AS'] = wn.ADJ_SAT

# filepath = "finalized_8K_accounts.csv"
# filepath = "finalized_8K_accounts_emojis_replaced.csv"
# filepath = "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_REPLACED.csv"
filepath = "FINALIZED_Training_Data_ALL_Available_Descriptions_EMOJIS_UNCHANGED.csv"

hand_label = "hand.label_simplified"

df = pd.read_csv(filepath)

# Removing all the "-int" (international, non-English, descriptions)
#dict.fromkeys(df[hand_label])
df = df[((df[hand_label] == 'media') | (df[hand_label] == 'tourbiz') |(df[hand_label] == 'acad') | (df[hand_label] == 'gov') | (
        df[hand_label] == 'other'))]

df = df[['username', 'description', hand_label]]  # keep only relevant columns


In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

print(type(['This framework generates embeddings for each input sentence. Sentences are passed as a list of string.',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']))

model.encode(['This framework generates embeddings for each input sentence. Sentences are passed as a list of string.',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.'])

print(df.size)

df = df[df['description'] != ""]

print(df.size)

print(df)


<class 'list'>
47169
47169
              username                                        description  \
0                  CNN  It‚Äôs our job to #GoThere & tell the most diffi...   
1               NatGeo  Taking our understanding and awareness of the ...   
2              FoxNews  Follow America's #1 cable news network, delive...   
3       washingtonpost                         Democracy Dies in Darkness   
4                  ABC  The only official ABC News Twitter account. Do...   
...                ...                                                ...   
15886       OneilRoses                                                NaN   
15887         mmccue56                                                NaN   
15888       montique47                                                NaN   
15889    ScottRedmond3                                                NaN   
15890  Smit46571445Bob                                                NaN   

      hand.label_simplified  
0               

In [3]:
lemmatizer = WordNetLemmatizer()
words_not_changed = ['media']

# Lemmatization (preprocessing)
def preprocessing(row):
    if str(row) == "nan":
        lemma = ""
    else:
        row = str(row).lower()
        row = word_tokenize(row)  # tokenize
        lemma = [lemmatizer.lemmatize(token, tag_map[tag[0]]) if token not in words_not_changed else token for
                 token, tag in pos_tag(row)]  # lemmatization, depending on part-of-speech
        lemma = ["" if re.search(r'\b[0-9]+\b\s*', lem) else lem for lem in lemma]  # removing
    return str(lemma)


df['description_lemmatized'] = df['description'].apply(preprocessing)

# Remove all the empty descriptions
df = df[df['description_lemmatized'] != ""]
#df[hand_label]
#print(df.shape)
#df[df['description_lemmatized'] != ""].shape





from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

print(type(['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']))

#Sentences are encoded by calling model.encode()
# print(type(df[['description_lemmatized']]))
# embeddings = model.encode(df['description_lemmatized'].tolist())



<class 'list'>
<class 'pandas.core.frame.DataFrame'>


KeyboardInterrupt: 

In [17]:
#print(df.size)
#print(df)
print(df['description_lemmatized'].tolist())



In [20]:
print(embeddings)
print(embeddings.shape)

[[-0.04816406 -0.07923298 -0.03724848 ...  0.08019924  0.01455128
   0.0456624 ]
 [ 0.06242061 -0.01987889 -0.00416512 ...  0.11200301 -0.03849842
   0.01369527]
 [-0.02185335 -0.13038395 -0.0307673  ...  0.10546998 -0.02063878
   0.0318699 ]
 ...
 [ 0.07011544  0.04272195  0.03821721 ...  0.04347302 -0.06767345
   0.05230111]
 [-0.04544054 -0.12075313  0.04004586 ...  0.11077183 -0.02724272
  -0.07071622]
 [ 0.02156898  0.06100883  0.04015845 ...  0.11042178 -0.00149351
   0.01260704]]
(11923, 384)


In [4]:
# Enhanced data
# filepath = "finalized_BIASED_accounts_ONLY_NON_OTHER_emojis_replaced.csv"

# df2 = pd.read_csv(filepath)
# df2 = df2[((df2[hand_label] == 'media') | (df[hand_label] == 'tourbiz') | (df2[hand_label] == 'acad') | (df2[hand_label] == 'gov') | (
#         df2[hand_label] == 'other'))]

# df2 = df2[['username', 'description', hand_label]]  # keep only relevant columns

# df2['description_lemmatized'] = df2['description'].apply(preprocessing)

# split my data into training, and test sets
scaler = StandardScaler()

X = df['description_lemmatized']
# X = embeddings
y_labels = df[hand_label]

X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.2, random_state=42, stratify=y_labels)

# X2 = df2['description_lemmatized']
# Y2 = df2[hand_label]

# X_train = pd.concat([X_train, X2])
# y_train = pd.concat([y_train, Y2])



tfidf_transformer = TfidfTransformer()

# n_gram_ranges = [(1,1), (1,2), (2,2)]
n_gram_ranges = [(1,1)]

result = {}
result_cv = {}

for n_gram_range in n_gram_ranges:
    # count_vectorizer = CountVectorizer(stop_words="english", ngram_range=n_gram_range)
    tfidf_pipeline = Pipeline([
       # ('vectorizer', count_vectorizer),
       # ('transformer', tfidf_transformer),
        ('normalize', StandardScaler(with_mean=False)),
        ('classifier', SVC(probability=True))
    ])

    print(n_gram_range)
    
    tfidf_param_grid = [
        {
            #'vectorizer__min_df': [5],
            #'transformer__use_idf': [True],
            
            # 'classifier__C': [0.5, 3.0],
            # 'classifier__kernel': ['linear'],
            # 'classifier__class_weight': ["balanced"]
            
            'classifier__C': [1.0e-10, 0.5, 3.0, 10.0],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__class_weight': ["balanced"]
        }
    ]

    # !!! Does STRATIFICATION BY DEFAULT !!!
    tfidf_grid_search = GridSearchCV(estimator=tfidf_pipeline, param_grid=tfidf_param_grid, cv=3,
                                            scoring='accuracy', verbose=1, error_score="raise")
    tfidf_grid_search.fit(X_train, y_train)
    tfidf_best_hyperparameters = tfidf_grid_search.best_params_
    
    print()
    print()
    print("SVM WEIGHT ENHANCED BEST PARAMS:", tfidf_best_hyperparameters)

    tfidf_best_SVM_model = tfidf_grid_search.best_estimator_
    tfidf_pipeline.set_params(**tfidf_grid_search.best_params_)
    tfidf_pipeline.fit(X_train, y_train)
    y_pred_tfidf_cross_validation = cross_val_predict(tfidf_best_SVM_model, X_train, y_train, cv=3)
    tfidf_y_pred_test = tfidf_pipeline.predict(X_test)

    cm_count = confusion_matrix(y_train, y_pred_tfidf_cross_validation, normalize='true')
    
    print()
    print()
    print()
    print("CV confusion matrix of predictions:")
    print()
    print(cm_count)
    
    # np.savetxt("SVM_TFIDF_weighted_enhanced_cross_validation_confusion_matrix" + str(n_gram_range) + '.txt', cm_count,
    #            delimiter=',', fmt='%f')

    result_cv["SVM_TFIDF_weighted_enhanced_predictions_CV" + str(n_gram_range)] = metrics.classification_report(y_train, y_pred_tfidf_cross_validation)


    print()
    print()
    print()
    print("CV metrics summary:")
    print(result_cv["SVM_TFIDF_weighted_enhanced_predictions_CV" + str(n_gram_range)])
    
    result["SVM_TFIDF_weighted_enhanced_predictions_testSet" + str(n_gram_range)] = metrics.classification_report(y_test, tfidf_y_pred_test)
                         
    print()
    print()
    print()
    print("Test set metrics summary:")
    print()
    print(result["SVM_TFIDF_weighted_enhanced_predictions_testSet" + str(n_gram_range)])
    
    print()
    print()
    print()                                                                                       
                                                                                                                
    # filename = 'SVM_TFIDF_weighted_enhanced_model' + str(n_gram_range) + '.pickle'
    # save model
    # pickle.dump(tfidf_pipeline, open(filename, "wb"))

    #full_x = pd.concat([X_train, X_test])
    #full_y = pd.concat([y_train, y_test])

    # tfidf_grid_search.fit(full_x, full_y)
    tfidf_pipeline.set_params(**tfidf_grid_search.best_params_)
    # tfidf_pipeline.fit(full_x, full_y)
    tfidf_pipeline.fit(X, y_labels)

    # filename = 'SVM_TFIDF_weighted_enhanced_model_full' + str(n_gram_range) + '.pickle'
    # pickle.dump(tfidf_pipeline, open(filename, "wb"))
    
    # def save_dict_to_file(dictionary, filename):
    #	with open(filename, 'w') as file:
    #    	json.dump(dictionary, file)
    #
    # save_dict_to_file(result["SVM_TFIDF_weighted_enhanced_predictions_testSet" + str(n_gram_range)], 'SVM_TFIDF_Weighted_full' + str(n_gram_range) + '.txt')

# print(result)


(1, 1)
Fitting 3 folds for each of 16 candidates, totalling 48 fits


ValueError: could not convert string to float: "['amateur', 'historian', '.']"