In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
import json
import os
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import warnings
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [2]:
# initial dataset stuff
nlp = spacy.load("spacy-twitter") # out of function so you don't load it every time (it takes a while)

# function for glove embeddings
def embed_dataset(dataset_text):
    encoded = np.array([nlp(text).vector for text in dataset_text])
    return encoded.tolist()

# function to load dataset from folder. Also embeds the text.
def get_dataset(name):
    """
    loads a dataset and embeds the text. text must be in column named "text".
    datasets are in the folder datasets/
    name must be a string that's matches the csv file in datasets
    """
    dataset = pd.read_csv(f'datasets/{name}.csv')
    dataset.rename(columns = {"Unnamed: 0":"entry"}, inplace=True) #the entry label never carries over
    dataset['e_text'] = embed_dataset(dataset['text'])
    return dataset

In [3]:
def evaluate_model(model, X_test, y_test):
    pred_y = model.predict(X_test)
    acc_mod = accuracy_score(y_test, pred_y)
    print("Accuracy:", float("{0:.2f}".format(acc_mod*100)), "%")
    f1_mod = f1_score(y_test, pred_y, average="macro")
    print("F1:", float("{0:.2f}".format(f1_mod*100)), "%")
    cm = confusion_matrix(y_test, pred_y)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["true", "false"])
    disp.plot()
    plt.show() 
    
def optimize_model_v2(search_space, objective, evals):
    trials = Trials()
    best_params = fmin(
        fn = objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=evals,
        timeout=120,
        trials=trials,
        verbose=False
    )
    set_params = space_eval(search_space, best_params)
    score = trials.best_trial['result']['loss']
    return set_params, score

In [4]:
def train_models(dataset_name, train_set, confidence, size_limit, model_list):
    """
    trains a set of models in each category. returns the best model for each category, in the form {'category': [modelscore, modelname, fittedmodel]}

    dataset_name: a string with the name of the training set. used for calling the category file
    train_set: the training set to use
    confidence: the confidence required to consider an entry part of a category
    size_limit: the number of entries needed in a category to consider that category for training
    model_list: the list of models to train. in the form [("model_name1", model1), etc]
    """
    file_name = f"{dataset_name}_cats/{dataset_name}_categories_organised.json"
    f = open(file_name)
    data = json.load(f)
    f.close()
    category_models = {} #this will be returned
    all_models = {}
    for category in data.keys(): 
        cat_entries = [int(i) for i in data[category].keys() if data[category][i] > confidence]
        
        # skip category if size of category is below limit
        if len(cat_entries) < size_limit:
            print(f"Skipped category: {category} due to low numbers")
            continue
        
        category_data = train_set.filter(axis=0, items=cat_entries)

        #split validation set
        X = category_data.drop('target', axis=1)
        y = category_data["target"]
        try:
            X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
        except:
            print(f"Skipped category: {category} due to class issues")
            continue

        # skip category if split only has one class
        if (len(np.unique(y_train)) <= 1):
            print(f"Skipped category: {category} due to class issues")
            continue

        X_train_text = np.array([text for text in X_train['e_text']])
        X_val_text = np.array([text for text in X_val['e_text']])

        trained_models = []
        all_models[category] = []
        # train models from list
        for model_name, search_space, mod in model_list:
            def objective(search_space):
                warnings.filterwarnings('ignore')
                model = mod.set_params(**search_space)
                model.fit(X_train_text, y_train)
                y_pred = model.predict(X_val_text)
                accuracy = accuracy_score(y_val, y_pred)
                return {'loss': -accuracy, 'status': STATUS_OK}
            try:
                best_params, score = optimize_model_v2(search_space, objective, 200)
                mod.set_params(**best_params)
                score *= -1
                trained_models.append((model_name, mod))
                all_models[category].append((model_name, mod))
                #print(f"Trained {model_name} on {category}")
            except:
                print(f"Error training {model_name} in category {category}, skipping")
                continue
            

        #get the best model
        best_model = [0, "x", "x"]
        for name, model in trained_models:
            score = model.score(X_val_text, y_val)
            if score > best_model[0]:
                best_model = [score, name, model]
        
        print(f"Trained models on {category}, added {best_model[1]} to list")
        #add best model to list
        category_models[category] = best_model
    return category_models, all_models

In [5]:
def baseline(train_set, model_list):
    X = train_set.drop('target', axis=1)
    y = train_set["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.75, random_state=42, stratify=y_train)
    X_train_text = np.array([text for text in X_train['e_text']])
    X_val_text = np.array([text for text in X_val['e_text']])
    trained_models = []
    for model_name, search_space, mod in model_list:
        def objective(search_space):
            warnings.filterwarnings('ignore')
            model = mod.set_params(**search_space)
            model.fit(X_train_text, y_train)
            y_pred = model.predict(X_val_text)
            accuracy = accuracy_score(y_val, y_pred)
            return {'loss': -accuracy, 'status': STATUS_OK}
        try:
            best_params, score = optimize_model_v2(search_space, objective, 200)
            mod.set_params(**best_params)
            score *= -1
            trained_models.append((model_name, mod))
            #print(f"Trained {model_name} on {category}")
        except:
            print(f"Error training {model_name}, skipping")
            continue
    return trained_models
    

In [6]:
# SVM
from sklearn.svm import SVC

SVM_search_space={  
                'C': hp.lognormal('C', 0, 1),
                'kernel':hp.choice('kernel', ["linear", "poly", "rbf", "sigmoid"]),
                'coef0':hp.uniform('coef0', 0.0, 1.0),
                'shrinking':hp.choice('shrinking', [True, False]),
                'tol':hp.loguniform('tol', np.log(1e-5), np.log(1e-2)),
                'degree':hp.choice('degree', [1, 2, 3, 4, 5]),
                'gamma':hp.choice('gamma', ["scale", "auto"]),
                }
#KNN
from sklearn.neighbors import KNeighborsClassifier

KNN_search_space={
                "n_neighbors":hp.choice('n_neighbors', np.arange(1, 16, dtype=int)),
                "algorithm":hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
                "metric": hp.choice("metric", ["cityblock", "l1", "l2", "minkowski", "euclidean", "manhattan"]),
                "p":hp.uniform("p", 1, 5)
                }

# Logistic Regression
from sklearn.linear_model import LogisticRegression
import warnings

LR_search_space={
                'C': hp.lognormal('C', 0, 1),
                'penalty':hp.choice('p_saga',['elasticnet','l1','l2',None]),
                'tol': hp.loguniform('tol',-13,-1),
                'l1_ratio': hp.uniform('l1_ratio',0,1)
                }

#Random Forest
from sklearn.ensemble import RandomForestClassifier

RF_search_space={  'n_estimators':hp.randint('n_estimators',200,1000),
                'max_depth': hp.randint('max_depth',10,200),                      
                'min_samples_split':hp.uniform('min_samples_split',0,1),   
                'min_samples_leaf':hp.randint('min_samples_leaf',1,10),            
                'criterion':hp.choice('criterion',['gini','entropy']),               
                'max_features':hp.choice('max_features',['sqrt', 'log2']) }

# MLP
from sklearn.neural_network import MLPClassifier

MLP_search_space={
                'activation':hp.choice('activation', ["identity","logistic","tanh","relu"]),
                'solver':hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
                'alpha':hp.uniform("alpha", 1e-4, 0.01),
                'learning_rate':hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive']),
                'learning_rate_init':hp.uniform("learning_rate_init", 1e-4, 0.1),
                'power_t':hp.uniform('power_t', 0.1, 0.9),
                'tol':hp.uniform('tol', 1e-4, 0.01),
                'momentum':hp.uniform('momentum', 0.8, 1.0),
                'early_stopping':hp.choice('early_stopping', [True, False]),
                'beta_1':hp.uniform("beta_1", 0.8, 1.0),
                'beta_2':hp.uniform("beta_2", 0.95, 1.0),
                'epsilon':hp.uniform("epsilon", 1e-9, 1e-5)
                }

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

NB_search_space={
                'var_smoothing': 10**-9
                }

# SGD
from sklearn.linear_model import SGDClassifier

SGD_search_space={
                'loss':hp.choice('loss',["hinge", "log_loss", "modified_huber", "squared_hinge", "perceptron", "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"]),
                'penalty':hp.choice("penalty", ["l2", "l1", "elasticnet", None]),
                'alpha':hp.loguniform("alpha", np.log(1e-6), np.log(1e-1)),
                "l1_ratio":hp.loguniform("l1_ratio", np.log(1e-7), np.log(1)),
                "tol":hp.loguniform("tol", np.log(1e-5), np.log(1e-2)),
                'learning_rate':hp.choice("learning_rate",  ["optimal", "invscaling", "constant", "adaptive"]),
                'eta0':hp.loguniform("eta0", np.log(1e-5), np.log(1e-1))
                }


In [7]:
model_list_v2 = [
    ("SVM", SVM_search_space, SVC(random_state=42)),
    ("KNN", KNN_search_space, KNeighborsClassifier(n_jobs=-1)),
    ("Logistic Regression", LR_search_space, LogisticRegression(solver="saga", max_iter=1000, random_state=42, n_jobs=-1)),
    ("Random Forest", RF_search_space, RandomForestClassifier()),
    ("MLP", MLP_search_space, MLPClassifier()),
    ("Gaussian NB", NB_search_space, GaussianNB()),
    ("SGD", SGD_search_space, SGDClassifier())
]

In [8]:
pheme = get_dataset("pheme")
X = pheme.drop('target', axis=1)
y = pheme['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
pheme_train = pd.concat([X_train, y_train], axis=1)


In [9]:
def arrange_data(dataset):
    d = get_dataset(dataset)
    X = d.drop('target', axis=1)
    y = d['target']
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
    train_d = pd.concat([X_train, y_train], axis=1)
    val_d = pd.concat([X_val, y_val], axis=1)
    return train_d, val_d

In [10]:
#a = baseline(pheme, model_list_v2)

In [11]:
twitter = get_dataset("twitter")
#b = baseline(twitter, model_list_v2)

In [12]:
weibo = get_dataset("weibo")
#c = baseline(weibo, model_list_v2)

In [13]:
twitter_t, twitter_v = arrange_data("twitter")
weibo_t, weibo_v = arrange_data("weibo")

In [10]:
models, all_models = train_models("pheme", pheme_train, 0.3, 50, model_list_v2)

 14%|█▎        | 27/200 [00:23<02:51,  1.01trial/s, best loss: -0.7873651771956857]

In [None]:
models.keys()

In [None]:
twitter = get_dataset("twitter")
twitter_models, all_twitter_models = train_models("twitter", twitter_t, 0.3, 50, model_list_v2)

In [26]:
twitter_models

NameError: name 'twitter_models' is not defined

In [None]:
weibo = get_dataset("weibo")
weibo_models, all_weibo_models = train_models("weibo", weibo_t, 0.3, 50, model_list_v2)

In [None]:
a = [models, twitter_models, weibo_models]
for k in a:
    with open("optimized_model_parameters.txt", "a") as f:
        for key, value in k.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

In [10]:
pheme_models3, all_pheme_models3 = train_models("pheme", pheme_train, 0, 100, model_list_v2)
with open("optimized_model_parameters.txt", "a") as f:
        for key, value in pheme_models3.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")
twitter_models3, all_twitter_models3 = train_models("twitter", twitter_t, 0, 100, model_list_v2)
with open("optimized_model_parameters.txt", "a") as f:
        for key, value in twitter_models3.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")
weibo_models3, all_weibo_models3 = train_models("weibo", weibo_t, 0, 100, model_list_v2)
with open("optimized_model_parameters.txt", "a") as f:
        for key, value in weibo_models3.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

 42%|████▎     | 85/200 [02:00<02:42,  1.42s/trial, best loss: -0.8196125907990315]
100%|██████████| 200/200 [01:25<00:00,  2.33trial/s, best loss: -0.7941888619854721]
 53%|█████▎    | 106/200 [02:00<01:46,  1.14s/trial, best loss: -0.7772397094430993]
 12%|█▏        | 24/200 [02:02<15:01,  5.12s/trial, best loss: -0.7699757869249395]
 68%|██████▊   | 137/200 [02:00<00:55,  1.14trial/s, best loss: -0.8280871670702179]
100%|██████████| 200/200 [00:02<00:00, 86.02trial/s, best loss: -0.7336561743341404]
100%|██████████| 200/200 [01:43<00:00,  1.94trial/s, best loss: -0.7784503631961259]
Trained models on Sensitive Subjects, added SVM to list
 44%|████▍     | 88/200 [04:05<05:12,  2.79s/trial, best loss: -0.8403614457831325]  
100%|██████████| 200/200 [01:12<00:00,  2.75trial/s, best loss: -0.802710843373494]
100%|██████████| 200/200 [01:51<00:00,  1.79trial/s, best loss: -0.786144578313253]
 11%|█         | 22/200 [02:04<16:49,  5.67s/trial, best loss: -0.7876506024096386]
100%|████████

In [15]:
confidence_list = [0.1]

for confidence in confidence_list:
    with open("model_parameters_by_c.txt", "a") as f:
        f.write(f"Weibo Models with {confidence} confidence and 50 size\n\n")

    """pheme_models, all_pheme_models = train_models("pheme", pheme_train, confidence, 50, model_list_v2)
    with open("model_parameters_by_c.txt", "a") as f:
        f.write("PHEME")
        for key, value in pheme_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")"""

    """twitter_models, all_twitter_models = train_models("twitter", twitter_t, confidence, 50, model_list_v2)
    with open("model_parameters_by_c.txt", "a") as f:
        f.write("Twitter")
        for key, value in twitter_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")"""
    
    weibo_models, all_weibo_models = train_models("weibo", weibo_t, confidence, 50, model_list_v2)
    with open("model_parameters_by_c.txt", "a") as f:
        f.write("Weibo")
        for key, value in weibo_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

Trained models on Arts & Entertainment, added SVM to list
Trained models on People & Society, added Logistic Regression to list
Trained models on Reference, added KNN to list
Trained models on Food & Drink, added Logistic Regression to list
Trained models on Sports, added SVM to list
Trained models on Games, added KNN to list
Trained models on Travel & Transportation, added Logistic Regression to list
Trained models on Health, added SGD to list
Trained models on Online Communities, added SVM to list
Trained models on News, added SVM to list
Trained models on Science, added KNN to list
Trained models on Sensitive Subjects, added KNN to list
Trained models on Shopping, added Logistic Regression to list
Trained models on Finance, added KNN to list
Skipped category: Real Estate due to low numbers
Trained models on Jobs & Education, added KNN to list
Trained models on Law & Government, added SVM to list
Trained models on Business & Industrial, added Random Forest to list
Trained models on C

In [20]:
size_list = [20, 100, 150, 200]

for size in size_list:
    with open("model_parameters_by_c.txt", "a") as f:
        f.write(f"Weibo Models with 0.3 confidence and {size} size\n\n")

    pheme_models, all_pheme_models = train_models("weibo", weibo, 0.3, size, model_list_v2)
    with open("model_parameters_by_c.txt", "a") as f:
        f.write("Weibo")
        for key, value in pheme_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

In [18]:
for size in size_list:
    with open("model_parameters_by_c.txt", "a") as f:
        f.write(f"Weibo Models with 0.2 confidence and {size} size\n\n")

    pheme_models, all_pheme_models = train_models("weibo", weibo, 0.2, size, model_list_v2)
    with open("model_parameters_by_c.txt", "a") as f:
        f.write("Weibo")
        for key, value in pheme_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

Trained models on Arts & Entertainment, added Random Forest to list
Trained models on People & Society, added MLP to list
Trained models on Reference, added KNN to list
Trained models on Food & Drink, added KNN to list
Trained models on Sports, added MLP to list
Trained models on Games, added KNN to list
Trained models on Travel & Transportation, added MLP to list
Trained models on Health, added SGD to list
Trained models on Online Communities, added Logistic Regression to list
Trained models on News, added Logistic Regression to list
Trained models on Science, added Logistic Regression to list
Trained models on Sensitive Subjects, added KNN to list
Trained models on Shopping, added KNN to list
Trained models on Finance, added SVM to list
Skipped category: Real Estate due to low numbers
Trained models on Jobs & Education, added KNN to list
Trained models on Law & Government, added MLP to list
Trained models on Business & Industrial, added SVM to list
Trained models on Computers & Elect

In [19]:
with open("model_parameters_by_c.txt", "a") as f:
    f.write(f"Models with 0.2 confidence and 150 size\n\n")

twitter_models, all_twitter_models = train_models("twitter", twitter_t, 0.2, 150, model_list_v2)
with open("model_parameters_by_c.txt", "a") as f:
        f.write("Twitter\n")
        for key, value in twitter_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

weibo_models, all_weibo_models = train_models("weibo", weibo_t, 0.2, 150, model_list_v2)
with open("model_parameters_by_c.txt", "a") as f:
        f.write("Weibo\n")
        for key, value in weibo_models.items():
            f.write(f"(\"{key}\", \"{value[1]}\", {value[2]}),\n")
        f.write("\n")

Trained models on People & Society, added SVM to list
Trained models on Arts & Entertainment, added MLP to list
Trained models on Law & Government, added KNN to list
Trained models on News, added KNN to list
Skipped category: Food & Drink due to low numbers
Trained models on Sensitive Subjects, added KNN to list
Trained models on Online Communities, added KNN to list
Skipped category: Internet & Telecom due to low numbers
Skipped category: Computers & Electronics due to low numbers
Skipped category: Health due to low numbers
Skipped category: Pets & Animals due to low numbers
Skipped category: Reference due to low numbers
Skipped category: Adult due to low numbers
Skipped category: Business & Industrial due to low numbers
Skipped category: Books & Literature due to low numbers
Skipped category: Jobs & Education due to low numbers
Skipped category: Shopping due to low numbers
Skipped category: Beauty & Fitness due to low numbers
Skipped category: Autos & Vehicles due to low numbers
Skip

In [None]:
pheme_models1, all_pheme_models1 = train_models("pheme", pheme_train, 0.2, 200, model_list_v2)
pheme_models2, all_pheme_models2 = train_models("pheme", pheme_train, 0.5, 5, model_list_v2)
pheme_models4, all_pheme_models4 = train_models("pheme", pheme_train, 0.3, 50, model_list_v2)

a = [all_pheme_models1, all_pheme_models2, all_pheme_models3, all_pheme_models4]
for k in a:
    with open("optimized_model_parameters.txt", "a") as f:
        f.write("\nPHEME full model set\n")
        for key, value in k.items():
            f.write(f"(\"{key}\", {value}),\n")
            f.write("\n")

 92%|█████████▎| 185/200 [01:12<00:05,  2.57trial/s, best loss: -0.7750677506775068]
Error training Logistic Regression in category Sensitive Subjects, skipping
  4%|▍         | 8/200 [00:31<17:02,  5.32s/trial, best loss: -0.7249322493224932]

In [None]:
twitter_models1, all_twitter_models1 = train_models("twitter", twitter_t, 0.2, 200, model_list_v2)
twitter_models2, all_twitter_models2 = train_models("twitter", twitter_t, 0.5, 5, model_list_v2)
twitter_models4, all_twitter_models4 = train_models("twitter", twitter_t, 0.3, 50, model_list_v2)

a = [all_twitter_models1, all_twitter_models2, all_twitter_models3, all_twitter_models4]
for k in a:
    with open("optimized_model_parameters.txt", "a") as f:
        f.write("\nTwitter full model set\n")
        for key, value in k.items():
            f.write(f"(\"{key}\", {value}),\n")
            f.write("\n")

In [None]:
weibo_models1, all_weibo_models1 = train_models("weibo", weibo_t, 0.2, 200, model_list_v2)
weibo_models2, all_weibo_models2 = train_models("weibo", weibo_t, 0.5, 5, model_list_v2)
weibo_models4, all_weibo_models4 = train_models("weibo", weibo_t, 0.3, 50, model_list_v2)

a = [all_weibo_models1, all_weibo_models2, all_weibo_models3, all_weibo_models4]
for k in a:
    with open("optimized_model_parameters.txt", "a") as f:
        f.write("\nTwitter full model set\n")
        for key, value in k.items():
            f.write(f"(\"{key}\", {value}),\n")
            f.write("\n")