In [1]:
import pandas as pd
import numpy as np
import math
from time import sleep
import re
import string
import random
from collections import Counter
import sys
np.set_printoptions(threshold=sys.maxsize)
from nltk.tokenize import sent_tokenize, word_tokenize 
  
import gensim 
from gensim.models import Word2Vec 
from sklearn.utils import shuffle

unable to import 'smart_open.gcs', disabling that module


In [3]:
#Jesse Bristow (1875955)
#Philani Mpofu (1848751)
#Matthew Kruger (1669326)
#Chloe Smith (1877342)

In [4]:
#Load data from .csv file
def loadData(name):
    return pd.read_csv(name, names = ['ID','Title','Author','Text','Label'])

In [5]:
#Split cleaned data in training, validation, and testing data
def getAllData(data):
    seventy_percent_of_data = int(data.shape[0]*0.7)
    twenty_percent_of_data = int(data.shape[0]*0.2)
    
    train_data = data[:seventy_percent_of_data]
    valid_data = data[seventy_percent_of_data:seventy_percent_of_data+twenty_percent_of_data]
    test_data = data[seventy_percent_of_data+twenty_percent_of_data:]
    data = None
    
    #Get author trustworthyness
    author_trust_scores = getAuthorTrustworthyness(train_data)

    train_data = convertRowsToDataPoints(train_data, testEmbeddingModel, test_model_vocab, 202, author_trust_scores)
    valid_data = convertRowsToDataPoints(valid_data, testEmbeddingModel, test_model_vocab, 202, author_trust_scores)
    test_data = convertRowsToDataPoints(test_data, testEmbeddingModel, test_model_vocab, 202, author_trust_scores)

    train_data_Y = train_data[:,201]
    valid_data_Y = valid_data[:,201]
    test_data_Y = test_data[:,201]

    train_data = np.delete(train_data, 201, 1)
    valid_data = np.delete(valid_data, 201, 1)
    test_data = np.delete(test_data, 201, 1)
    
    return train_data, train_data_Y, valid_data, valid_data_Y, test_data, test_data_Y


In [6]:
#Train and return embedding model used to convert words into vectors
def getEmbeddingModel(allData, dimension_for_word):
    #Store all titles and text
    allTitles = allData[:,1]
    allText = allData[:,3]
    allTitlesAndText = np.concatenate((allText,allTitles))
    allTitles = None
    allText = None
    
    #Store in format(rows are sentences and columns in row are words) used to train the embedding model
    dataForEmbedding = []
    remove = string.punctuation
    
    for currTitleOrText in allTitlesAndText:
        #For each title or text convert to list sentence and add to data
        
        for j in sent_tokenize(currTitleOrText):
            #For each sentence in the title or text
                
            temp = []
            
            for k in word_tokenize(j):
                # tokenize the sentence into words           
                curr_word = k + ''
                #remove all punctuation from word
                curr_word = curr_word.translate(str.maketrans('', '', remove))
                
                #If word is longer than 0 then convert to lowercase and store
                if len(curr_word) > 0:
                    temp.append(curr_word.lower()) 
  
            if len(temp) > 0:
                dataForEmbedding.append(temp) 
            
    
    #Train embedding model with sentences.
    embeddingModel = gensim.models.Word2Vec(dataForEmbedding, min_count = 1,  size = dimension_for_word, window = 5)
    
    return embeddingModel

In [7]:

def getAuthorTrustworthyness(allData):
    allAuthors = allData[:,2]
    allLabels = allData[:,4]
    tmp_author_trust = {};
    
    #For each author
    for i in range(len(allAuthors)):
        #input info into author trust to perform trustworthy score calculations afterward
        curr_author = allAuthors[i]
        curr_label = allLabels[i]
        if curr_author not in tmp_author_trust.keys():
            tmp_author_trust[curr_author] = [0.0, 0.0]
            
        if curr_label == 0.0:
            tmp_author_trust[curr_author][0]+=1.0
        elif curr_label == 1.0:
            tmp_author_trust[curr_author][1]+=1.0
    
    author_trust = {};
    #For each author in dictionary
    for curr_author in tmp_author_trust.keys():
        #calculate trustworthy score of author
        curr_score = (tmp_author_trust[curr_author][0])/(tmp_author_trust[curr_author][0]+tmp_author_trust[curr_author][1])
        author_trust[curr_author] = curr_score/1000.0
    
    return author_trust

In [8]:
#Clean data so that it can be used to train the embedding model/SVM
def getCleanedData(data):
    #Remove Nan, remove punctuation, and new lines
    feature_names = np.array(['ID','Title','Author','Text'])
    
    #Convert NaNs
    for i in range(3):
        string_replacement = ""
        if i==1:
            string_replacement = "-NO AUTHOR-"
        else:
            string_replacement = "NaN"

        for j in range(len(data)):
            if pd.isnull(data[j][i+1]):
                data[j][2] = string_replacement
        
    
    #Store punctuation to remove
    remove = string.punctuation
    remove = remove.replace(".","“")
    remove = remove.replace("!","”")
    remove = remove.replace("?","’")
    remove = remove + '‘'
    remove = remove + '—'
    remove = remove + '–'

    #Remove NaNs
    data = data[np.all(data != "NaN", axis = 1)]
    
    #Remove punctuation(except '.','?','!') and new lines
    for i in range(3):
        for j in range(len(data)):
            data[j][i+1] = data[j][i+1].replace("\n","").translate(str.maketrans('', '', remove))
    
    #Return feature names and the data
    return feature_names, data

In [9]:
#Convert all datapoints into a form that can be used to train the SVM
def convertRowsToDataPoints(data, embedding_model, model_vocab, num_expected_columns, author_trust_scores): 
    data_points_with_labels_list = []
    for row in data:
        convertedRow = convertRowToDataPoint(row, embedding_model, model_vocab, author_trust_scores)
        if len(convertedRow) == num_expected_columns:
            data_points_with_labels_list.append(convertedRow)
        
    data_points_with_labels = np.array(data_points_with_labels_list)
    return data_points_with_labels

In [10]:
#Convert single datapoint into a form that can be used to train the SVM
def convertRowToDataPoint(row, embedding_model, model_vocab, author_trust_scores):
    #Take in row without label and convert to data point
    #Stores author points, text points then title points then label
    curr_title = row[1]
    curr_text = row[3]
    curr_author = row[4]
    curr_author_trust_score = 0.0005
    if curr_author in author_trust_scores.keys():
        curr_author_trust_score = author_trust_scores[curr_author]
    
    data_point = np.array([row[4]])
    data_point = np.append(np.array([curr_author_trust_score]), data_point)
    data_point = np.append(getAverageEmbedding(curr_title, embedding_model, model_vocab), data_point)
    data_point = np.append(getAverageEmbedding(curr_text, embedding_model, model_vocab), data_point)
    return data_point

In [11]:
#Get average embedding values for words used in title/article
def getAverageEmbedding(curr_string, embedding_model, model_vocab):
    #Take in string that has already been cleaned of punctuation and newlines
    total_words = 0.0
    stop_words = ['not', 'you', 'at', 'from', 'of', 'us', 'in', 'have', 'yes', 'no', 'are', '', 'for', 'but', 'that', 'it', 'this','he','she', 'they','that','a','an', 'who', 'where','there', 'his','her','their', 'i','my','we','our','were', 'the','if','as', 'and','in','on','we','to', 'also','so','is','its']
    remove = string.punctuation
    curr_embedding = np.array([])
    for k in word_tokenize(curr_string):
        curr_word = k + ''
        #remove all punctuation from word
        curr_word = curr_word.translate(str.maketrans('', '', remove))
        #convert to lowercase
        curr_word = curr_word.lower()
        #check if it is a stop word or empty
        if curr_word in stop_words:
            continue
        #check if word is in model_vocab
        if curr_word not in model_vocab:
            continue
        
        if len(curr_embedding) == 0:
            curr_embedding = embedding_model[curr_word]
        else:
            curr_embedding = curr_embedding+embedding_model[curr_word]
        
        total_words = total_words + 1.0
    
    curr_embedding = curr_embedding/total_words
    return curr_embedding

In [12]:
#Print results of model on data
def printAccuracy(trained_model, testing_data, testing_Y):
    total_predicted = 0.0
    total_correct = 0.0
    confusion_matrix = np.array([[0, 0],[0, 0]])
    
    
    for i in range(len(testing_data)):
        curr_data_point = testing_data[i]
        curr_prediction = trained_model.predict([curr_data_point])
        curr_answer = testing_Y[i]
        
        confusion_matrix[int(curr_prediction)][int(curr_answer)]+=1
        if curr_answer == curr_prediction:
            total_correct = total_correct+1
            
        total_predicted = total_predicted+1
        
    percent_correct = total_correct*100/total_predicted
    trained_model.printHyperparameters()
    print("Accuracy: ", percent_correct)
    print("Confusion Matrix:\n",confusion_matrix, "\n")

In [13]:
#Support vector machine class
class Support_Vector_Classifier:
    
    def __init__(self, regularization_strength, learning_rate, convergence_const, num_iterations_training):
        self.regularization_strength = regularization_strength
        self.learning_rate = learning_rate
        self.convergence_const = convergence_const
        self.num_iterations_training = num_iterations_training
        self.weights = None
        self.X = None
        self.Y = None
        
        
    def train(self,X_inp,Y_inp):
        #Initialise datamembers
        self.X = np.insert(X_inp, X_inp.shape[1], np.full(X_inp.shape[0],1), axis=1)
        self.weights = np.zeros(self.X.shape[1])
        self.Y = Y_inp
        prev_cost_value = -1000
        curr_iteration = 0
        has_converged = False
        
        while (curr_iteration<self.num_iterations_training) and (not has_converged):
            #Shuffle data set
            self.X, self.Y = shuffle(self.X, self.Y, random_state=0)
            #For each row get convergence and update
            for i in range(self.X.shape[0]):
                curr_Xi = self.X[i]
                curr_Yi = self.getYi(self.Y[i])
                curr_conv_grad = self.getConvGrad(curr_Xi,curr_Yi)
                self.weights = self.weights - (self.learning_rate*curr_conv_grad)
                
            curr_cost_value = self.getCostFunction()
            curr_cost_diff = abs(prev_cost_value-curr_cost_value)
            if curr_cost_diff<self.convergence_const:
                has_converged = True
                
            curr_iteration = curr_iteration+1
            
        
    def predict(self,predict_X):
        #Add 1 to end col
        predict_X = np.append(predict_X,np.array(1))
        dot_prod = np.dot(self.weights, predict_X)
        if dot_prod < 0:
            return 0
        else:
            return 1
    
    def getConvGrad(self,Xi,Yi):
        curr_slack = self.getCurrSlack(Xi,Yi)
        if curr_slack == 0:
            return self.weights
        else:
            return self.weights-(self.regularization_strength*Yi*Xi)
        
    def getCostFunction(self):
        curr_cost = 0.0
        total_rows = self.X.shape[0]
        for i in range(self.X.shape[0]):
            xi = self.X[i]
            yi = self.getYi(self.Y[i])
            curr_cost = curr_cost + self.getHalfWLengthSqaured() + self.regularization_strength*self.getCurrSlack(xi,yi)
            
        curr_cost = curr_cost/total_rows
        return curr_cost
        
    def getYi(self, curr_y_value):
        if curr_y_value == 0.0:
            return -1
        else:
            return 1
    
    def getCurrSlack(self, Xi, Yi):
        return max(0,1-Yi*(np.dot(self.weights,Xi)))
    
    def getHalfWLengthSqaured(self):
        return (np.dot(self.weights, self.weights)**2)/2.0
        
    def printHyperparameters(self):
        print("Current model hyperparameters:\nRegularization strength: "+
              str(self.regularization_strength)+"\nLearning rate: "+
              str(self.learning_rate)+"\nTermination constant: "+
              str(self.convergence_const)+"\nMax number of iterations for training: "+
              str(self.num_iterations_training))

In [14]:
%%time
#Load data
name = 'Actual Data.csv'
data = loadData(name)

#Convert data to numpy array
data = data.to_numpy()
#Clean data 
test_feature_names, data = getCleanedData(data) 
#Train and get embedding model
testEmbeddingModel = getEmbeddingModel(data, 100)
#Get list of words used inside the embedding model
test_model_vocab = testEmbeddingModel.wv.vocab

Wall time: 10min 15s


In [15]:
%%time
#Split data into training/validation/testing data and set data = None to save memory
train_data, train_data_Y, valid_data, valid_data_Y, test_data, test_data_Y = getAllData(data)
data = None



Wall time: 7min 52s


In [20]:
#Train multiple SVC's on validation data to find suitable hyper-parameters
my_SVC0 = Support_Vector_Classifier(10000, 0.0000001, 500, 1000)
my_SVC0.train(train_data,train_data_Y)
my_SVC1 = Support_Vector_Classifier(1000, 0.0000001, 500, 1000)
my_SVC1.train(train_data,train_data_Y)
my_SVC2 = Support_Vector_Classifier(100000, 0.0000001, 500, 1000)
my_SVC2.train(train_data,train_data_Y)
my_SVC3 = Support_Vector_Classifier(10000, 0.000001, 500, 1000)
my_SVC3.train(train_data,train_data_Y)
my_SVC4 = Support_Vector_Classifier(10000, 0.00000001, 500, 1000)
my_SVC4.train(train_data,train_data_Y)
my_SVC5 = Support_Vector_Classifier(100000, 0.00000001, 500, 1000)
my_SVC5.train(train_data,train_data_Y)
my_SVC6 = Support_Vector_Classifier(1000, 0.000001, 500, 1000)
my_SVC6.train(train_data,train_data_Y)
my_SVC7 = Support_Vector_Classifier(100000, 0.000001, 500, 1000)
my_SVC7.train(train_data,train_data_Y)
my_SVC8 = Support_Vector_Classifier(1000, 0.00000001, 500, 1000)
my_SVC8.train(train_data,train_data_Y)
my_SVC9 = Support_Vector_Classifier(50000, 0.00000001, 500, 1000)
my_SVC9.train(train_data,train_data_Y)
my_SVC10 = Support_Vector_Classifier(5000, 0.000001, 500, 1000)
my_SVC10.train(train_data,train_data_Y)
my_SVC11 = Support_Vector_Classifier(12500, 0.0000001, 500, 1000)
my_SVC11.train(train_data,train_data_Y)
my_SVC12 = Support_Vector_Classifier(7500, 0.0000001, 500, 1000)
my_SVC12.train(train_data,train_data_Y)
my_SVC13 = Support_Vector_Classifier(10000, 0.0000003, 500, 1000)
my_SVC13.train(train_data,train_data_Y)
my_SVC14 = Support_Vector_Classifier(10000, 0.00000007, 500, 1000)
my_SVC14.train(train_data,train_data_Y)

In [21]:
#Print accuracy of all models
printAccuracy(my_SVC0, valid_data, valid_data_Y)
printAccuracy(my_SVC1, valid_data, valid_data_Y)
printAccuracy(my_SVC2, valid_data, valid_data_Y)
printAccuracy(my_SVC3, valid_data, valid_data_Y)
printAccuracy(my_SVC4, valid_data, valid_data_Y)
printAccuracy(my_SVC5, valid_data, valid_data_Y)
printAccuracy(my_SVC6, valid_data, valid_data_Y)
printAccuracy(my_SVC7, valid_data, valid_data_Y)
printAccuracy(my_SVC8, valid_data, valid_data_Y)
printAccuracy(my_SVC9, valid_data, valid_data_Y)
printAccuracy(my_SVC10, valid_data, valid_data_Y)
printAccuracy(my_SVC11, valid_data, valid_data_Y)
printAccuracy(my_SVC12, valid_data, valid_data_Y)
printAccuracy(my_SVC13, valid_data, valid_data_Y)
printAccuracy(my_SVC14, valid_data, valid_data_Y)

Current model hyperparameters:
Regularization strength: 10000
Learning rate: 1e-07
Termination constant: 500
Max number of iterations for training: 1000
Accuracy:  94.110337972167
Confusion Matrix:
 [[1966   96]
 [ 141 1821]] 

Current model hyperparameters:
Regularization strength: 1000
Learning rate: 1e-07
Termination constant: 500
Max number of iterations for training: 1000
Accuracy:  93.61332007952286
Confusion Matrix:
 [[1937   87]
 [ 170 1830]] 

Current model hyperparameters:
Regularization strength: 100000
Learning rate: 1e-07
Termination constant: 500
Max number of iterations for training: 1000
Accuracy:  94.06063618290258
Confusion Matrix:
 [[1966   98]
 [ 141 1819]] 

Current model hyperparameters:
Regularization strength: 10000
Learning rate: 1e-06
Termination constant: 500
Max number of iterations for training: 1000
Accuracy:  93.98608349900596
Confusion Matrix:
 [[1960   95]
 [ 147 1822]] 

Current model hyperparameters:
Regularization strength: 10000
Learning rate: 1e-08

It can be seen the hyper-parameters with the following values has performed the best on the validation data:

Regularization strength: 10000

Learning rate: 1e-07

Termination constant: 500

Max number of iterations for training: 1000


The model with these settings is then used with the testing data.

In [22]:
#Print accuracy of model on testing data with best performing hyper-parameters
printAccuracy(my_SVC0, test_data, test_data_Y)

Current model hyperparameters:
Regularization strength: 10000
Learning rate: 1e-07
Termination constant: 500
Max number of iterations for training: 1000
Accuracy:  92.94234592445328
Confusion Matrix:
 [[928  61]
 [ 81 942]] 

