In [15]:
from flair.embeddings import FlairEmbeddings, Sentence
from flair.embeddings import DocumentPoolEmbeddings
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import numpy as np
import csv
import time
import re

# FLAIR: https://github.com/zalandoresearch/flair
# LLPOF: https://github.com/thiagorainmaker77/liar_dataset
# 0 == true class, 1 == fake class

In [49]:
# Importing and formatting the data
raw_data = []
raw_labels = []
with open("data/data.tsv", encoding="utf8") as input_file:
    try:
        for row in csv.reader(input_file, delimiter="\t"):
            if row[1] in ["true", "mostly-true", "half-true"]:
                train_label = 0
            else:
                train_label = 1

            train_statement = row[2]
            #train_statement = re.sub(r'[^\w\s]', '', train_statement)

            raw_data.append(train_statement)
            raw_labels.append(train_label)
    except:
        print("A train line was skipped due to an error")

raw_data = np.asarray(raw_data)
raw_labels = np.asarray(raw_labels)

['Says the Annies List political group supports third-trimester abortions on demand.'
 'When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.'
 'Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."'
 'Health care reform legislation is likely to mandate free sex change surgeries.'
 'The economic turnaround started at the end of my term.'
 'The Chicago Bears have had more starting quarterbacks in the last 10 years than the total number of tenured (UW) faculty fired during the last two decades.'
 'Jim Dunnam has not lived in the district he represents for years now.'
 "I'm the only person on this stage who has worked actively just last year passing, along with Russ Feingold, some of the toughest ethics reform since Watergate."
 'However, it took $19.5 million in Oregon Lottery funds for the Port of Newport to eventually land the new NOAA Marine Opera

In [50]:
# Getting embeddings
raw_embeddings = []

# initialize the word embeddings
flair_embedding_forward = FlairEmbeddings('news-forward')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])

total_start_time = time.time()
timestamps = []
total_embeddings = len(raw_data)
progress_bar_width = 80
progress_iterations = total_embeddings

# Iterate over the batches and create embeddings
for i in range(0, total_embeddings):
    if i == 0:
        print(f"\nEmbedding {total_embeddings} sentences")

    start_time = time.time()

    # Training embeddings
    document_embeddings.embed(Sentence(raw_data[i]))
    raw_embeddings += [np.array(
        Sentence(raw_data[i]).get_embedding().detach())]

    elapsed_time = time.time() - start_time
    timestamps.append(elapsed_time)
    
    remaining = sum(timestamps) / len(timestamps)
    remaining = remaining * (total_embeddings - i + 1)
    remaining = round(remaining / 60, 1)

    hash_count = round(progress_bar_width * (i/progress_iterations))
    hashes = '#' * hash_count
    spaces = ' ' * (progress_bar_width - hash_count)

    print(f"\rRemaining time: {remaining} min [{hashes}{spaces}]",
          end='')

total_elapsed_time = time.time() - total_start_time
raw_embeddings = np.asarray(raw_embeddings)

print(f"\nEmbedding completed. Total duration:",
      round(total_elapsed_time / 60, 1),
      "min")

[0. 0. 0.]

Embedding 19 sentences
Remaining time: 0.1 min [#############                                                                   ][array([], dtype=float32)]
Remaining time: 0.0 min [############################################################################    ][array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32), array([], dtype=float32)]

Embedding completed. Total duration: 0.1 min


In [40]:
# Evaluation
def evaluate(true, predicted):
    decimals = 3

    accuracy = round(metrics.accuracy_score(true, predicted), decimals)
    precision = round(metrics.precision_score(true, predicted), decimals)
    recall = round(metrics.recall_score(true, predicted), decimals)
    f1 = round(metrics.f1_score(true, predicted), decimals)

    confusion = metrics.confusion_matrix(true, predicted, labels=[1, 0])
    return [["accuracy: ", accuracy],
            ["precision: ", precision],
            ["recall: ", recall],
            ["f1: ", f1],
            ["confusion matrix: ", confusion]]

In [51]:
sample_size = 10
total_start_time = time.time()
timestamps = []
progress_bar_width = 80
progress_iterations = sample_size
save_location = 'resultss.csv'

for i in range(0, sample_size):
    if i == 0:
        print(f"Starting {sample_size} predictions for each treatment")
        print(f"Results are stored in {save_location}")
    
    start_time = time.time()
    
    # Fetching train and test splits
    shufflesplit = StratifiedShuffleSplit(
        n_splits=1, 
        test_size=round(len(raw_data)*0.2))
    
    samples = shufflesplit.split(raw_embeddings, raw_labels)
   
    for train_index, test_index in samples:
        train_embeddings = raw_embeddings[train_index]
        test_embeddings = raw_embeddings[test_index]
        
        train_data = np.array(raw_data[train_index])
        test_data = np.array(raw_data[test_index])
        
        train_labels = raw_labels[train_index]
        test_labels = raw_labels[test_index]


    # Create TF-IDF DTM for baselines
    # Train
    count_vect = CountVectorizer()
    train_counts = count_vect.fit_transform(train_data)

    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    dense_train_tfidf = train_tfidf.toarray()

    # Test
    test_counts = count_vect.transform(test_data)
    test_tfidf = tfidf_transformer.transform(test_counts)
    dense_test_tfidf = test_tfidf.toarray()

    # Baselines + embeddings
    # Classification tree
    tree_md5 = DecisionTreeClassifier(max_depth=5, random_state=0)
    tree_md5.fit(train_tfidf, train_labels)
    tree_md5_eval = evaluate(test_labels, tree_md5.predict(test_tfidf))
    #print("\nTree md5:", *tree_md5_eval, sep="\n")

    # Gaussian NB
    gnb = GaussianNB()
    gnb.fit(dense_train_tfidf, train_labels)
    gnb_eval = evaluate(test_labels, gnb.predict(dense_test_tfidf))
    #print("\nGaussian NB:", *gnb_eval, sep="\n")

    # Logistic Regression
    lsr = LogisticRegression(solver="lbfgs")
    lsr.fit(train_tfidf, train_labels)
    lsr_eval = evaluate(test_labels, lsr.predict(test_tfidf))
    #print("\nLogistic Regression:", *lsr_eval, sep="\n")

    # SVM Linear
    svm = SVC(kernel='linear')
    svm.fit(train_tfidf, train_labels)
    svm_eval = evaluate(test_labels, svm.predict(test_tfidf))
    #print("\nSVM linear:", *svm_eval, sep="\n")

    with open(save_location, 'a', newline='\n') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow(['Tree', 
                                tree_md5_eval[0][1],
                                tree_md5_eval[1][1],
                                tree_md5_eval[2][1],
                                tree_md5_eval[3][1]])
        newFileWriter.writerow(['NaiveBayes', 
                                gnb_eval[0][1],
                                gnb_eval[1][1],
                                gnb_eval[2][1],
                                gnb_eval[3][1]])
        newFileWriter.writerow(['LR', 
                                lsr_eval[0][1],
                                lsr_eval[1][1],
                                lsr_eval[2][1],
                                lsr_eval[3][1]])
        newFileWriter.writerow(['SVM', 
                                svm_eval[0][1],
                                svm_eval[1][1],
                                svm_eval[2][1],
                                svm_eval[3][1]])

    # Baselines + embeddings
    # Classification Tree
    tree_md5 = DecisionTreeClassifier(max_depth=5, random_state=0)
    tree_md5.fit(train_embeddings, train_labels[:len(train_embeddings)])
    tree_md5_eval = evaluate(test_labels[:len(test_embeddings)],
                             tree_md5.predict(test_embeddings))
    #print('Tree md5:', *tree_md5_eval, sep="\n")

    # Gaussian NB
    gnb = GaussianNB()
    gnb.fit(train_embeddings, train_labels[:len(train_embeddings)])
    gnb_eval = evaluate(test_labels[:len(test_embeddings)],
                        gnb.predict(test_embeddings))
    #print('\nGaussian NB:', *gnb_eval, sep="\n")

    # Logistic Regression
    lsr = LogisticRegression(solver="lbfgs")
    lsr.fit(train_embeddings, train_labels[:len(train_embeddings)])
    lsr_eval = evaluate(test_labels[:len(test_embeddings)],
                        lsr.predict(test_embeddings))
    #print('\nLogistic Regression:', *lsr_eval, sep="\n")

    # SVM Linear
    svm = SVC(kernel='linear')
    svm.fit(train_embeddings, train_labels[:len(train_embeddings)])
    svm_eval = evaluate(test_labels[:len(test_embeddings)],
                        svm.predict(test_embeddings))
    #print('\nSVM linear:', *svm_eval, sep="\n")

    with open(save_location, 'a', newline='\n') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow(['Tree + embeddings', 
                                tree_md5_eval[0][1],
                                tree_md5_eval[1][1],
                                tree_md5_eval[2][1],
                                tree_md5_eval[3][1]])
        newFileWriter.writerow(['NaiveBayes + embeddings', 
                                gnb_eval[0][1],
                                gnb_eval[1][1],
                                gnb_eval[2][1],
                                gnb_eval[3][1]])
        newFileWriter.writerow(['LR + embeddings', 
                                lsr_eval[0][1],
                                lsr_eval[1][1],
                                lsr_eval[2][1],
                                lsr_eval[3][1]])
        newFileWriter.writerow(['SVM + embeddings', 
                                svm_eval[0][1],
                                svm_eval[1][1],
                                svm_eval[2][1],
                                svm_eval[3][1]])
    
    elapsed_time = time.time() - start_time
    timestamps.append(elapsed_time)
    
    remaining = sum(timestamps) / len(timestamps)
    remaining = remaining * (progress_iterations - i + 1)
    remaining = round(remaining / 60, 1)

    hash_count = round(progress_bar_width * ((i+1) / progress_iterations))
    hashes = '#' * hash_count
    spaces = ' ' * (progress_bar_width - hash_count)

    print(f"\rRemaining time: {remaining} min [{hashes}{spaces}]",
          end='')

total_elapsed_time = time.time() - total_start_time

print("")
print("\rPredictions completed. Total duration:",
      round(total_elapsed_time / 60, 1),
      "min")

Starting 10 predictions for each treatment
Results are stored in resultss.csv


TypeError: only integer scalar arrays can be converted to a scalar index