In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
import json
import os
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import warnings
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [2]:
# initial dataset stuff
nlp = spacy.load("spacy-twitter") # out of function so you don't load it every time (it takes a while)

# function for glove embeddings
def embed_dataset(dataset_text):
    encoded = np.array([nlp(text).vector for text in dataset_text])
    return encoded.tolist()

# function to load dataset from folder. Also embeds the text.
def get_dataset(name):
    """
    loads a dataset and embeds the text. text must be in column named "text".
    datasets are in the folder datasets/
    name must be a string that's matches the csv file in datasets
    """
    dataset = pd.read_csv(f'datasets/{name}.csv')
    dataset.rename(columns = {"Unnamed: 0":"entry"}, inplace=True) #the entry label never carries over
    dataset['e_text'] = embed_dataset(dataset['text'])
    return dataset

In [3]:
def evaluate_model(model, X_test, y_test):
    pred_y = model.predict(X_test)
    acc_mod = accuracy_score(y_test, pred_y)
    print("Accuracy:", float("{0:.2f}".format(acc_mod*100)), "%")
    f1_mod = f1_score(y_test, pred_y, average="macro")
    print("F1:", float("{0:.2f}".format(f1_mod*100)), "%")
    cm = confusion_matrix(y_test, pred_y)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["true", "false"])
    disp.plot()
    plt.show() 
    
def optimize_model_v2(search_space, objective, evals):
    trials = Trials()
    best_params = fmin(
        fn = objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=evals,
        timeout=120,
        trials=trials
    )
    set_params = space_eval(search_space, best_params)
    score = trials.best_trial['result']['loss']
    return set_params, score

In [4]:
def train_models(dataset_name, train_set, confidence, size_limit, model_list):
    """
    trains a set of models in each category. returns the best model for each category, in the form {'category': [modelscore, modelname, fittedmodel]}

    dataset_name: a string with the name of the training set. used for calling the category file
    train_set: the training set to use
    confidence: the confidence required to consider an entry part of a category
    size_limit: the number of entries needed in a category to consider that category for training
    model_list: the list of models to train. in the form [("model_name1", model1), etc]
    """
    file_name = f"{dataset_name}_cats/{dataset_name}_categories_organised.json"
    f = open(file_name)
    data = json.load(f)
    f.close()
    category_models = {} #this will be returned
    for category in data.keys(): 
        cat_entries = [int(i) for i in data[category].keys() if data[category][i] > confidence]
        
        # skip category if size of category is below limit
        if len(cat_entries) < size_limit:
            print(f"Skipped category: {category} due to low numbers")
            continue
        
        category_data = train_set.filter(axis=0, items=cat_entries)

        #split validation set
        X = category_data.drop('target', axis=1)
        y = category_data["target"]
        try:
            X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
        except:
            print(f"Skipped category: {category} due to class issues")
            continue

        # skip category if split only has one class
        if (len(np.unique(y_train)) <= 1):
            print(f"Skipped category: {category} due to class issues")
            continue

        X_train_text = np.array([text for text in X_train['e_text']])
        X_val_text = np.array([text for text in X_val['e_text']])

        trained_models = []
        # train models from list
        for model_name, search_space, mod in model_list:
            def objective(search_space):
                warnings.filterwarnings('ignore')
                model = mod.set_params(**search_space)
                model.fit(X_train_text, y_train)
                y_pred = model.predict(X_val_text)
                accuracy = accuracy_score(y_val, y_pred)
                return {'loss': -accuracy, 'status': STATUS_OK}
            try:
                best_params, score = optimize_model_v2(search_space, objective, 200)
                mod.set_params(**best_params)
                score *= -1
                trained_models.append((model_name, mod))
                #print(f"Trained {model_name} on {category}")
            except:
                print(f"Error training {model_name} in category {category}, skipping")
                continue
            trained_models.append((model_name, mod))

        #get the best model
        best_model = [0, "x", "x"]
        for name, model in trained_models:
            score = model.score(X_val_text, y_val)
            if score > best_model[0]:
                best_model = [score, name, model]
        
        print(f"Trained models on {category}, added {best_model[1]} to list")
        #add best model to list
        category_models[category] = best_model
    return category_models

In [5]:
# SVM
from sklearn.svm import SVC

SVM_search_space={  
                'C': hp.lognormal('C', 0, 1),
                'kernel':hp.choice('kernel', ["linear", "poly", "rbf", "sigmoid"]),
                'coef0':hp.uniform('coef0', 0.0, 1.0),
                'shrinking':hp.choice('shrinking', [True, False]),
                'tol':hp.loguniform('tol', np.log(1e-5), np.log(1e-2)),
                'degree':hp.choice('degree', [1, 2, 3, 4, 5]),
                'gamma':hp.choice('gamma', ["scale", "auto"]),
                }
#KNN
from sklearn.neighbors import KNeighborsClassifier

KNN_search_space={
                "n_neighbors":hp.choice('n_neighbors', np.arange(1, 16, dtype=int)),
                "algorithm":hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
                "metric": hp.choice("metric", ["cityblock", "l1", "l2", "minkowski", "euclidean", "manhattan"]),
                "p":hp.uniform("p", 1, 5)
                }

# Logistic Regression
from sklearn.linear_model import LogisticRegression
import warnings

LR_search_space={
                'C': hp.lognormal('C', 0, 1),
                'penalty':hp.choice('p_saga',['elasticnet','l1','l2',None]),
                'tol': hp.loguniform('tol',-13,-1),
                'l1_ratio': hp.uniform('l1_ratio',0,1)
                }

#Random Forest
from sklearn.ensemble import RandomForestClassifier

RF_search_space={  'n_estimators':hp.randint('n_estimators',200,1000),
                'max_depth': hp.randint('max_depth',10,200),                      
                'min_samples_split':hp.uniform('min_samples_split',0,1),   
                'min_samples_leaf':hp.randint('min_samples_leaf',1,10),            
                'criterion':hp.choice('criterion',['gini','entropy']),               
                'max_features':hp.choice('max_features',['sqrt', 'log2']) }

# MLP
from sklearn.neural_network import MLPClassifier

MLP_search_space={
                'activation':hp.choice('activation', ["identity","logistic","tanh","relu"]),
                'solver':hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
                'alpha':hp.uniform("alpha", 1e-4, 0.01),
                'learning_rate':hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive']),
                'learning_rate_init':hp.uniform("learning_rate_init", 1e-4, 0.1),
                'power_t':hp.uniform('power_t', 0.1, 0.9),
                'tol':hp.uniform('tol', 1e-4, 0.01),
                'momentum':hp.uniform('momentum', 0.8, 1.0),
                'early_stopping':hp.choice('early_stopping', [True, False]),
                'beta_1':hp.uniform("beta_1", 0.8, 1.0),
                'beta_2':hp.uniform("beta_2", 0.95, 1.0),
                'epsilon':hp.uniform("epsilon", 1e-9, 1e-5)
                }

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

NB_search_space={
                'var_smoothing': 10**-9
                }

# SGD
from sklearn.linear_model import SGDClassifier

SGD_search_space={
                'loss':hp.choice('loss',["hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron", "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]),
                'penalty':hp.choice("penalty", ["l2", "l1", "elasticnet", None]),
                'alpha':hp.loguniform("alpha", np.log(1e-6), np.log(1e-1)),
                "l1_ratio":hp.loguniform("l1_ratio", np.log(1e-7), np.log(1)),
                "tol":hp.loguniform("tol", np.log(1e-5), np.log(1e-2)),
                'learning_rate':hp.choice("learning_rate",  ["optimal", "invscaling", "constant", "adaptive"]),
                'eta0':hp.loguniform("eta0", np.log(1e-5), np.log(1e-1))
                }


In [6]:
model_list_v2 = [
    ("SVM", SVM_search_space, SVC(random_state=42)),
    ("KNN", KNN_search_space, KNeighborsClassifier(n_jobs=-1)),
    ("Logistic Regression", LR_search_space, LogisticRegression(solver="saga", max_iter=1000, random_state=42, n_jobs=-1)),
    ("Random Forest", RF_search_space, RandomForestClassifier()),
    ("MLP", MLP_search_space, MLPClassifier()),
    ("Gaussian NB", NB_search_space, GaussianNB()),
    ("SGD", SGD_search_space, SGDClassifier())
]

In [7]:
pheme = get_dataset("pheme")
X = pheme.drop('target', axis=1)
y = pheme['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
pheme_train = pd.concat([X_train, y_train], axis=1)


In [8]:
def arrange_data(dataset):
    d = get_dataset(dataset)
    X = d.drop('target', axis=1)
    y = d['target']
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
    train_d = pd.concat([X_train, y_train], axis=1)
    val_d = pd.concat([X_val, y_val], axis=1)
    return train_d, val_d

In [9]:
twitter_t, twitter_v = arrange_data("twitter")
weibo_t, weibo_v = arrange_data("weibo")

In [16]:
models = train_models("pheme", pheme_train, 0.5, 5, model_list_v2)

 20%|█▉        | 39/200 [03:54<16:07,  6.01s/trial, best loss: -0.7597254004576659]  
100%|██████████| 200/200 [00:40<00:00,  4.99trial/s, best loss: -0.7574370709382151]
100%|██████████| 200/200 [01:20<00:00,  2.49trial/s, best loss: -0.7665903890160183]
 18%|█▊        | 37/200 [02:00<08:50,  3.26s/trial, best loss: -0.7574370709382151]
100%|██████████| 200/200 [01:08<00:00,  2.91trial/s, best loss: -0.7940503432494279]
100%|██████████| 200/200 [00:01<00:00, 159.30trial/s, best loss: -0.7276887871853547]
100%|██████████| 200/200 [01:09<00:00,  2.87trial/s, best loss: -0.7665903890160183]
Trained models on Sensitive Subjects, added SGD to list
100%|██████████| 200/200 [00:03<00:00, 54.89trial/s, best loss: -0.8035714285714286]
100%|██████████| 200/200 [00:05<00:00, 33.69trial/s, best loss: -0.7857142857142857]
100%|██████████| 200/200 [00:20<00:00,  9.73trial/s, best loss: -0.8214285714285714]
 60%|██████    | 121/200 [02:00<01:18,  1.01trial/s, best loss: -0.75]             
100%|████

In [None]:
models.keys()

dict_keys(['Sensitive Subjects', 'News', 'Arts & Entertainment', 'People & Society', 'Law & Government', 'Online Communities', 'Travel & Transportation'])

In [None]:
twitter = get_dataset("twitter")
twitter_models = train_models("twitter", twitter_t, 0.5, 5, model_list_v2)

100%|██████████| 200/200 [00:03<00:00, 55.15trial/s, best loss: -0.8727272727272727]
100%|██████████| 200/200 [00:04<00:00, 42.63trial/s, best loss: -0.8727272727272727]
100%|██████████| 200/200 [00:53<00:00,  3.76trial/s, best loss: -0.8181818181818182]
 61%|██████    | 122/200 [02:00<01:16,  1.02trial/s, best loss: -0.8363636363636363]
100%|██████████| 200/200 [00:23<00:00,  8.35trial/s, best loss: -0.8909090909090909]
100%|██████████| 200/200 [00:00<00:00, 323.05trial/s, best loss: -0.7636363636363637]
100%|██████████| 200/200 [00:08<00:00, 22.34trial/s, best loss: -0.8727272727272727]
Trained models on People & Society, added KNN to list
100%|██████████| 200/200 [00:06<00:00, 31.43trial/s, best loss: -0.8243243243243243]
100%|██████████| 200/200 [00:06<00:00, 29.65trial/s, best loss: -0.7027027027027027]
100%|██████████| 200/200 [00:59<00:00,  3.35trial/s, best loss: -0.7972972972972973]
 50%|█████     | 101/200 [02:00<01:57,  1.19s/trial, best loss: -0.7432432432432432]
100%|█████

In [None]:
twitter_models

{'People & Society': [0.8727272727272727,
  'KNN',
  KNeighborsClassifier(algorithm='ball_tree', metric='l2', n_jobs=-1,
                       n_neighbors=4, p=4.010199404717891)],
 'Arts & Entertainment': [0.7837837837837838,
  'Logistic Regression',
  LogisticRegression(C=1.1421137890725783, l1_ratio=0.49014451330123276,
                     max_iter=1000, n_jobs=-1, random_state=42, solver='saga',
                     tol=0.0013198619230799782)],
 'Law & Government': [0.821917808219178,
  'KNN',
  KNeighborsClassifier(algorithm='ball_tree', metric='l2', n_jobs=-1,
                       n_neighbors=4, p=4.010199404717891)],
 'News': [0.8378378378378378,
  'KNN',
  KNeighborsClassifier(algorithm='ball_tree', metric='l2', n_jobs=-1,
                       n_neighbors=4, p=4.010199404717891)],
 'Sensitive Subjects': [0.7935483870967742,
  'KNN',
  KNeighborsClassifier(algorithm='ball_tree', metric='l2', n_jobs=-1,
                       n_neighbors=4, p=4.010199404717891)],
 'Online C

In [None]:
weibo = get_dataset("weibo")
weibo_models = train_models("weibo", weibo_t, 0.5, 5, model_list_v2)

100%|██████████| 200/200 [00:09<00:00, 20.29trial/s, best loss: -0.8029197080291971]
100%|██████████| 200/200 [00:28<00:00,  7.06trial/s, best loss: -0.7883211678832117]
100%|██████████| 200/200 [01:04<00:00,  3.12trial/s, best loss: -0.7956204379562044]
 36%|███▌      | 71/200 [02:02<03:43,  1.73s/trial, best loss: -0.8321167883211679]
100%|██████████| 200/200 [00:45<00:00,  4.40trial/s, best loss: -0.8248175182481752]
100%|██████████| 200/200 [00:00<00:00, 268.59trial/s, best loss: -0.635036496350365]
100%|██████████| 200/200 [00:12<00:00, 15.44trial/s, best loss: -0.7956204379562044]
Trained models on Arts & Entertainment, added Random Forest to list
100%|██████████| 200/200 [00:04<00:00, 42.34trial/s, best loss: -0.8953488372093024]
100%|██████████| 200/200 [00:07<00:00, 26.43trial/s, best loss: -0.8837209302325582]
100%|██████████| 200/200 [00:51<00:00,  3.92trial/s, best loss: -0.872093023255814]
 42%|████▏     | 84/200 [02:00<02:46,  1.44s/trial, best loss: -0.8953488372093024]


In [15]:
a = [models, twitter_models, weibo_models]
for k in a:
    with open("optimized_model_parameters.txt", "a") as f:
        for key, value in k.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

In [28]:
for key, value in k.items():
    print(f"\"{key}\", \"{value[1]}\", {value[2]}\n")

"Arts & Entertainment", "SGD", SGDClassifier(alpha=6.041815217486042e-05, eta0=0.03686744686987627,
              l1_ratio=1.762040379810453e-05, learning_rate='constant',
              loss='modified_huber', tol=0.006882240523359017)

"People & Society", "SVM", SVC(C=23.186942440846998, coef0=0.893023010661618, degree=5, kernel='poly',
    random_state=42, shrinking=False, tol=0.006378488923858316)

"Food & Drink", "KNN", KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=9,
                     p=1.75336550705507)

"Health", "SVM", SVC(C=23.186942440846998, coef0=0.893023010661618, degree=5, kernel='poly',
    random_state=42, shrinking=False, tol=0.006378488923858316)

"News", "Logistic Regression", LogisticRegression(C=2.020080503767674, l1_ratio=0.5952769710777521,
                   max_iter=1000, n_jobs=-1, random_state=42, solver='saga',
                   tol=4.926500624596985e-06)

"Sensitive Subjects", "KNN", KNeighborsClassifier(metric='manhattan', n_jobs=-1, n