# Dynamically weighted voting model for Rumour Detection
Run all of the cells in order.

## Import packages

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
import json
import os
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
import warnings

## Prep datasets

In [8]:
# initial dataset stuff
nlp = spacy.load("spacy-twitter") # out of function so you don't load it every time (it takes a while)

# function for glove embeddings
def embed_dataset(dataset_text):
    encoded = np.array([nlp(text).vector for text in dataset_text])
    return encoded.tolist()

# function to load dataset from folder. Also embeds the text.
def get_dataset(name):
    """
    loads a dataset and embeds the text. text must be in column named "text".
    datasets are in the folder datasets/
    name must be a string that's matches the csv file in datasets
    """
    dataset = pd.read_csv(f'datasets/{name}.csv')
    dataset.rename(columns = {"Unnamed: 0":"entry"}, inplace=True) #the entry label never carries over
    dataset['e_text'] = embed_dataset(dataset['text'])
    return dataset

## Optimize and Train Models

In [9]:
def get_best_params(search_space, objective, evals):
    trials = Trials()
    best_params = fmin(
        fn = objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=evals,
        timeout=120,
        trials=trials,
        verbose=False
    )
    set_params = space_eval(search_space, best_params)
    score = trials.best_trial['result']['loss']
    return set_params, score

In [10]:
def train_models(dataset_name, train_set, confidence, size_limit, model_list):
    """
    trains a set of pre-optimised models in each category. returns the best model for each category, in the form {'category': [modelscore, modelname, fittedmodel]}

    dataset_name: a string with the name of the training set. used for calling the category file
    train_set: the training set to use
    confidence: the confidence required to consider an entry part of a category
    size_limit: the number of entries needed in a category to consider that category for training
    model_list: the list of models to train. in the form [("category", "model_name1", model1), etc]
    """
    file_name = f"{dataset_name}_cats/{dataset_name}_categories_organised.json"
    f = open(file_name)
    data = json.load(f)
    f.close()
    category_models = {} #this will be returned
    warnings.filterwarnings('ignore')
    for category, model_name, model in model_list:
        cat_entries = [int(i) for i in data[category].keys() if data[category][i] > confidence]
        
        # skip category if size of category is below limit
        if len(cat_entries) < size_limit:
            print(f"Skipped category: {category} due to low numbers")
            continue
        
        category_data = train_set.filter(axis=0, items=cat_entries)

        #split validation set
        X = category_data.drop('target', axis=1)
        y = category_data["target"]
        try:
            X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
        except:
            print(f"Skipped category: {category} due to class issues")
            continue

        # skip category if split only has one class
        if (len(np.unique(y_train)) <= 1):
            print(f"Skipped category: {category} due to class issues")
            continue

        X_train_text = np.array([text for text in X_train['e_text']])
        X_val_text = np.array([text for text in X_val['e_text']])

        model.fit(X_train_text, y_train)
        
        #print(f"Trained models on {category}, added {model_name} to list")
        #add best model to list
        category_models[category] = (0, model_name, model)
    print("training complete")
    return category_models

## Voting

In [11]:
def predict_points(trained_models, test_cat_file, X_test):
    """
    predict points using the trained models. returns an array of the predictions

    trained_models: the models trained in each category, in the form {category: [modelscore, modelname, fittedmodel]}
    test_cat_file: the filepath to the organised category file
    X_test: the test set X values
    """
    #load category data
    f = open(test_cat_file)
    category_data = json.load(f)
    f.close()

    # return arrays
    final_predictions = []
    all_predictions = []
    
    #embedded_text = np.array([text for text in X_test['e_text']])
    for i in tqdm(range(len(X_test))):
        test_point = X_test.iloc[i]
        point_text = np.array(test_point['e_text'])
        point_categories = category_data[str(test_point["entry"])]

        # get weights of each point's topic
        topic_weights = {}
        for category in point_categories:
            main_category = category.split("/")[1]
            if main_category not in trained_models.keys():
                continue
            if main_category in topic_weights:
                topic_weights[main_category] += point_categories[category]
            else:
                topic_weights[main_category] = point_categories[category]
        
        #make topic predictions
        model_predictions = []
        for category in topic_weights.keys():
            modelscore, modelname, model = trained_models[category]
            prediction = model.predict(point_text.reshape(1,-1))
            score = topic_weights[category]
            model_predictions.append((prediction[0], score, modelname, category))
        all_predictions.append(model_predictions)
        # aggregate predictions
        truefalse_scores = {True: 0, False:0}
        for prediction, modelscore, modelname, category in model_predictions:
            truefalse_scores[prediction] += modelscore
        
        #determine final prediction
        if truefalse_scores[True] > truefalse_scores[False]:
            final_predictions.append(True)
        else:
            final_predictions.append(False)
    return final_predictions, all_predictions

# Prepare Classifiers

In [12]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [23]:
# An example list of optimized models. To optimize another set, run the "model_optimizer.ipynb" notebook, which saves optimized models in the "optimized_models.txt" file
# Confidence = 0.2, Size = 150
pheme_list_1 = [("Sensitive Subjects", "MLP", MLPClassifier(activation='tanh', alpha=0.00466175275671903,
              beta_1=0.9510819155734616, beta_2=0.9731437463781671,
              early_stopping=True, epsilon=6.182058335753729e-06,
              learning_rate_init=0.0376602230989045,
              momentum=0.8971674634596085, power_t=0.27369760346373595,
              solver='lbfgs', tol=0.003014654999289671)),
("News", "SVM", SVC(C=1.2545402078985404, coef0=0.6812586058490248, degree=5, kernel='linear',
    random_state=42, tol=0.002011238465192678)),
("Arts & Entertainment", "SVM", SVC(C=1.2545402078985404, coef0=0.6812586058490248, degree=5, kernel='linear',
    random_state=42, tol=0.002011238465192678)),
("People & Society", "KNN", KNeighborsClassifier(algorithm='brute', metric='euclidean', n_jobs=-1,
                     n_neighbors=1, p=3.000450775121091)),
("Law & Government", "MLP", MLPClassifier(activation='tanh', alpha=0.00466175275671903,
              beta_1=0.9510819155734616, beta_2=0.9731437463781671,
              early_stopping=True, epsilon=6.182058335753729e-06,
              learning_rate_init=0.0376602230989045,
              momentum=0.8971674634596085, power_t=0.27369760346373595,
              solver='lbfgs', tol=0.003014654999289671)),
("Online Communities", "SVM", SVC(C=1.2545402078985404, coef0=0.6812586058490248, degree=5, kernel='linear',
    random_state=42, tol=0.002011238465192678)),
("Travel & Transportation", "SVM", SVC(C=1.2545402078985404, coef0=0.6812586058490248, degree=5, kernel='linear',
    random_state=42, tol=0.002011238465192678))]

twitter_list_1 = [("People & Society", "SVM", SVC(C=1.2001176811044414, coef0=0.5565869984730393, degree=2, random_state=42,
    shrinking=False, tol=0.00115409246408022)),
("Arts & Entertainment", "MLP", MLPClassifier(activation='identity', alpha=0.004117964960327768,
              beta_1=0.9307677109652692, beta_2=0.9950328825362118,
              early_stopping=True, epsilon=3.670832286279156e-06,
              learning_rate_init=0.08474815432542147,
              momentum=0.8489053845985096, power_t=0.3161632828087111,
              tol=0.003998948219832486)),
("Law & Government", "KNN", KNeighborsClassifier(algorithm='kd_tree', metric='l2', n_jobs=-1, n_neighbors=4,
                     p=1.111575850498425)),
("News", "KNN", KNeighborsClassifier(algorithm='kd_tree', metric='l2', n_jobs=-1, n_neighbors=4,
                     p=1.111575850498425)),
("Sensitive Subjects", "KNN", KNeighborsClassifier(algorithm='kd_tree', metric='l2', n_jobs=-1, n_neighbors=4,
                     p=1.111575850498425)),
("Online Communities", "KNN", KNeighborsClassifier(algorithm='kd_tree', metric='l2', n_jobs=-1, n_neighbors=4,
                     p=1.111575850498425))]

weibo_list_1 = [("Arts & Entertainment", "SVM", SVC(C=1.0321831404502073, coef0=0.6180534656634313, degree=5, kernel='poly',
    random_state=42, shrinking=False, tol=9.683373991170909e-05)),
("People & Society", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("Food & Drink", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("Travel & Transportation", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("Health", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("News", "SVM", SVC(C=1.0321831404502073, coef0=0.6180534656634313, degree=5, kernel='poly',
    random_state=42, shrinking=False, tol=9.683373991170909e-05)),
("Sensitive Subjects", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("Law & Government", "Logistic Regression", LogisticRegression(C=1.333331807058486, l1_ratio=0.3290616566255845,
                   max_iter=1000, n_jobs=-1, random_state=42, solver='saga',
                   tol=6.797644761695533e-05)),
("Business & Industrial", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229)),
("Autos & Vehicles", "KNN", KNeighborsClassifier(metric='l2', n_jobs=-1, n_neighbors=1, p=3.439128442811229))]

## Evaluate Results

In [14]:
def check_score(test, pred, confusion=False):
    """
    Returns the accuract and F1 of a model. Set confusion = True to see the confusion matrix.
    """
    acc = accuracy_score(test, pred)
    f1 = f1_score(test, pred, average="macro")
    if confusion == True:
        cm = confusion_matrix(test, pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["true", "false"])
        disp.plot()
        plt.show() 
    return float("{0:.2f}".format(acc*100)), float("{0:.2f}".format(f1*100))

# Run Tests

In [15]:
pheme = get_dataset("pheme")
twitter = get_dataset("twitter")
weibo = get_dataset("weibo")

pheme_val = pheme.drop('target', axis=1)
pheme_val_text = np.array([text for text in pheme_val['e_text']])
pheme_target = pheme['target']
twitter_val = twitter.drop('target', axis=1)
twitter_val_text = np.array([text for text in twitter_val['e_text']])
twitter_target = twitter['target']
weibo_val = weibo.drop('target', axis=1)
weibo_val_text = np.array([text for text in weibo_val['e_text']])
weibo_target = weibo['target']

def run_tests(training, training_name, confidence, size, model_list):
    X = training.drop("target", axis=1)
    y = training["target"]
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42) 
    models = train_models(training_name, training, confidence, size, model_list)
    category_file = f"{training_name}_categories.json"
    val_pred = predict_points(models, category_file, X_val)
    val_score = check_score(y_val, val_pred[0])

    pheme_pred = predict_points(models, "pheme_categories.json", pheme_val)
    pheme_score = check_score(pheme_target, pheme_pred[0])
    twitter_pred = predict_points(models, "twitter_categories.json", twitter_val)
    twitter_score = check_score(twitter_target, twitter_pred[0])
    weibo_pred = predict_points(models, "weibo_categories.json", weibo_val)
    weibo_score = check_score(weibo_target, weibo_pred[0])

    return val_score, pheme_score, twitter_score, weibo_score

In [22]:
conf = 0.2
size = 150
dataset = pheme
dataset_name = "pheme"
models = pheme_list_1

results = run_tests(dataset, dataset_name, conf, size, models)
print(f"Scores on {conf} and {size}:")
print(f"{dataset_name} Validation:\n", "Accuracy: ", results[0][0], "F1: ", results[0][1])
print("PHEME Test Results:\n", "Accuracy:", results[1][0], "F1:", results[1][1])
print("Twitter Test Results:\n", "Accuracy:", results[2][0], "F1:", results[2][1])
print("Weibo Test Results:\n", "Accuracy:", results[3][0], "F1:", results[3][1])

training complete


100%|██████████| 1285/1285 [00:02<00:00, 525.50it/s]
100%|██████████| 6425/6425 [00:12<00:00, 529.55it/s] 
100%|██████████| 2308/2308 [00:03<00:00, 609.00it/s]
100%|██████████| 4664/4664 [00:05<00:00, 891.94it/s] 

Scores on 0.2 and 150:
pheme Validation:
 Accuracy:  83.81 F1:  82.43
PHEME Test Results:
 Accuracy: 82.8 F1: 81.36
Twitter Test Results:
 Accuracy: 57.02 F1: 55.67
Weibo Test Results:
 Accuracy: 48.56 F1: 43.85



