In [1]:
!pip install nltk



In [2]:
import csv                               # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import re


def load_data(path, raw_data):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t') ## whole of dataset
        for line in reader:  ## every line[i] is one feature (or label)
            if line[0] == "Id":  # skip header row 
                continue
            ## We read all the features here
            (sentence, subject, speaker, speaker_job_title, state_info, party_affiliation, total_barely_true_counts, 
           total_false_counts, total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context ,label) = parse_data_line(line)
            raw_data.append((sentence, subject, speaker, speaker_job_title, state_info, party_affiliation, total_barely_true_counts, 
           total_false_counts, total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context ,label))
    f.close()

def split_and_preprocess_data(raw_data, train_data, test_data, train_num_data, test_num_data, percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    global_feature_dict = Counter()
    num_training_samples = int((percentage * num_samples))
    tr_idx = 0
    for (sentence, subject, speaker, speaker_job_title, state_info, party_affiliation, total_barely_true_counts, 
           total_false_counts, total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context ,label) in raw_data[:num_training_samples]:
        ## split subject by ',' delimeter
        subject = re.split(',', subject)
        
        ## convert state_info to lowercase
        state_info = state_info.lower()
        
        ## consider the combination of sentence, sibject, and context as one single document (sentece) and pre-process it
        sentence = pre_process(sentence + " " + ' '.join(subject) + " " + context, stop_words)
        
        ## This is counter for other features --> speaker, speaker_job_title, state_info, party_affiliation
        other_feature = Counter()
        
        ## by this lopp, we avoid of inserting 'none':1 or '':1 to our featureset dictionary
        for feat in [speaker, speaker_job_title, state_info, party_affiliation]:
            if feat == 'none' or feat == '':
                continue
            else:
                other_feature.update({feat:1})
            
        ## Add textual features to our training textual dataset (train_data)
        train_data.append((to_feature_vector(sentence) + other_feature,label))
        
        ## Add numerical features to our training numerical dataset (train_num_data)
        train_num_data.append([total_barely_true_counts, total_false_counts, total_half_true_counts,
                               total_mostly_true_counts, total_pants_on_fire_counts, label])
        

    for (sentence, subject, speaker, speaker_job_title, state_info, party_affiliation, total_barely_true_counts, 
           total_false_counts, total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context ,label) in raw_data[num_training_samples:]:
        ## split subject by ',' delimeter
        subject = re.split(',', subject)
        
        ## convert state_info to lowercase
        state_info = state_info.lower()
        
        ## consider the combination of sentence, sibject, and context as one single document (sentece) and pre-process it
        sentence = pre_process(sentence + " " + ' '.join(subject) + " " + context, stop_words)
        
        ## This is counter for other features --> speaker, speaker_job_title, state_info, party_affiliation
        other_feature = Counter()
        
        ## by this lopp, we avoid of inserting 'none':1 or '':1 to our featureset dictionary
        for feat in [speaker, speaker_job_title, state_info, party_affiliation]:
            if feat == 'none' or feat == '':
                continue
            else:
                other_feature.update({feat:1})
                
        ## Add textual features to our testing textual dataset (test_data)
        test_data.append((to_feature_vector(sentence) + other_feature ,label))
        
        ## Add numerical features to our testing numerical dataset (test_num_data)
        test_num_data.append([total_barely_true_counts, total_false_counts, total_half_true_counts,
                               total_mostly_true_counts, total_pants_on_fire_counts, label])
        # tst_idx += 1

In [4]:
def convert_label(label):
    """Converts the multiple classes into two,
    making it a binary distinction between fake news and real."""
    #return label
    # Converting the multiclass labels to binary label
    labels_map = {
        'true': 'REAL',
        'mostly-true': 'REAL',
        'half-true': 'REAL',
        'false': 'FAKE',
        'barely-true': 'FAKE',
        'pants-fire': 'FAKE'
    }
    return labels_map[label]

def parse_data_line(data_line): ## input: 1 line of dataset
    # Should return a tuple of the label as just FAKE or REAL and the statement
    # e.g. (label, statement) 
    
    ## I make the features separated here, then pass them as a big tuple
    label = convert_label(data_line[1])
    sentence = data_line[2]
    # _id = data_line[0]
    subject = data_line[3]
    speaker = data_line[4]
    speaker_job_title = data_line[5]
    state_info = data_line[6]
    party_affiliation = data_line[7]
    total_barely_true_counts = data_line[8]
    total_false_counts = data_line[9]
    total_half_true_counts = data_line[10]
    total_mostly_true_counts = data_line[11]
    total_pants_on_fire_counts = data_line[12]
    context = data_line[13]
    
    return (sentence, subject, speaker, speaker_job_title, state_info, party_affiliation, total_barely_true_counts, 
           total_false_counts, total_half_true_counts, total_mostly_true_counts, total_pants_on_fire_counts, context ,label)

In [5]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
nltk.download('punkt')
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

## set of stop words from nltk corpus
stop_words = set(stopwords.words('english'))


# Input: a string of one statement 
# Output: list of tokens in that sentence

## return --> list of tokens
def pre_process(text, stop_words):
    ## Lowercase
    lower_text = text.lower()
    ## replace '-' by ' '
    lower_text = lower_text.replace('-', ' ')
    ## Punctuation
    no_p_text = "".join([char for char in lower_text if char not in string.punctuation])
    ## Tokenizing
    words = word_tokenize(no_p_text)
    no_stop_word = [word for word in words if word not in stop_words]
    ## Lemmization
    wordnet_lem = WordNetLemmatizer()
    ## Stemming
    # ps = PorterStemmer() 
    
    return [wordnet_lem.lemmatize(token) for token in no_stop_word]
    # return [ps.stem(token) for token in no_stop_word]
    # return no_stop_word


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
from collections import Counter


## input: one statement's token
## global_fd should be a counter
## global_sd should be a dict of set
## Global feature dictionary
## Global sentence dictionary --> {feature : {sent1, sent2, sent5, ...}}
def to_feature_vector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    ## Update global feature dictionary
    
    ## We insert unigram tokens by initializing token as a counter
    word_dict = Counter(tokens)
    
    ## Here we add bigram and trigram tokens to our dictionary
        
    # Add bigram
    bigram_keys = []
    for i in range(len(tokens)-1):
        new_key = tokens[i] + ' ' + tokens[i+1]
        bigram_keys.append(new_key)
    
    ## Update the counter
    word_dict.update(bigram_keys)

        
    ## Add trigram
    trigram_keys = []
    for i in range(len(tokens)-2):
        new_key = tokens[i] + ' ' + tokens[i+1] + ' ' + tokens[i+2]
        trigram_keys.append(new_key)

    ## Update the counter
    word_dict.update(trigram_keys)
        
    # DESCRIBE YOUR METHOD IN WORDS
    return word_dict


In [7]:
## This function will be applied to train set in cross validation, to compute both global dicts, for tf, idf
## This below function update global statement dictionary, which shows the index of sentences that contains each word
## {word1: {Sentence_i, Sentence_j, ....}, word2: {Sentece_k, ....}}
## It is basically a dict of set
def build_global_dicts(train_data, global_fd, global_sd):
    tr_idx = 0
    for tokens, label in train_data:
        global_fd.update(tokens)
        for token in tokens:
            global_sd[token].add(tr_idx)
        tr_idx += 1
        
        
from math import log10
from collections import defaultdict

## This function does the tf_idf (assign thw weights) for both train and test in cross validation
## Pay attention that the 'global_sentence_dict' is only filled by train data above, and there will be no data leakage.
def assign_weights(train_data, global_sentence_dict):
    for tokens, label in train_data:
        ## tokens is Counter
        f_td = tokens
        denom = [0,0,0]
        for token in tokens.keys():
            # For tf-idf in n-grams case
            ## Unigram
            if len(token.split()) == 1:
                denom[0] += 1
            ## Bigram
            elif len(token.split()) == 2:
                denom[1] += 1
            ## Trigram
            elif len(token.split()) == 3:
                denom[2] += 1
        for token, weight in tokens.items():
            ## Unigram
            if len(token.split()) == 1:
                tf = f_td[token] / denom[0]
            ## Bigram
            elif len(token.split()) == 2:
                tf = f_td[token] / denom[1]
            ## Trigram
            elif len(token.split()) == 3:
                tf = f_td[token] / denom[2]
        
            # tf = 2 * f_td[token] / sum(f_td.values())
                
            ## idf term
            idf = log10(len(train_data) / ( 1+len(global_sentence_dict[token]) ) ) + 1
            
            ## Final weight
            tokens[token] = tf * idf
        

In [8]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def train_classifier(data):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(C=2))])
    return SklearnClassifier(pipeline).train(data)

In [9]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn import svm
import copy

def cross_validate(dataset, num_dataset, folds):
    fold_size = int(len(dataset)/folds)
    last_iteration = False
    accuracy, recall, precision, f1 = [], [], [], []
    text_accuracy, text_recall, text_precision, text_f1 = [], [], [], []
    for test_start_idx in range(0, len(dataset), int(fold_size)):
        if test_start_idx == (folds-1)*fold_size:
            ## This is the last iteration
            print("test fold form {} to {}:".format(test_start_idx, len(dataset)))
            test_set = dataset[test_start_idx:]
            num_test_set = num_dataset[test_start_idx:]
            
            train_set = dataset[:test_start_idx]
            ## Textual dataset for trainin --> first half of train data for training of textual classifier
            train_set_text = train_set[0:3686]
            
            ## Textual dataset for testing --> second half of train data for testing of textual classifier
            test_set_text = train_set[3686:]
            
            num_train_set = num_dataset[:test_start_idx]
            
            ## Numerical dataset for training
            num_train_set = num_train_set[3686:]
            
            ## TF-IDF
            global_feature_dict = Counter()
            global_sentence_dict = defaultdict(set)
            build_global_dicts(train_set_text, global_feature_dict, global_sentence_dict)
            tmp_train = copy.deepcopy(train_set_text)
            tmp_test = copy.deepcopy(test_set_text)
            ## Assign TF-IDF weights here
            assign_weights(tmp_train, global_sentence_dict)
            assign_weights(tmp_test, global_sentence_dict)
             
            ## Train text classifier
            LinearSVC_classifier = train_classifier(tmp_train)
            ## Text feature is prediction which will be concated to numerical dataset
            text_featureset, a = separate_test_set(tmp_test)
            train_text_predicition = np.array(predict_labels(text_featureset, LinearSVC_classifier))
            
            ## Convert the predicition of text classifier (Real, Fake) to (1, 0) 
            le = preprocessing.LabelEncoder()
            train_text_predicition = le.fit_transform(train_text_predicition)
            
            ## Now we prepare data for our final model --> Numeric data from dataset + Prediction of text classifier on each sample in numerical train set
            ## Standardization of numeric data
            
            _num_train_set = copy.deepcopy(np.array(num_train_set))
            x_train_num = _num_train_set[:, :5]
            y_train_num = _num_train_set[:, -1]
            ## Add text feature to num_data
            x_train_num = np.concatenate((x_train_num, train_text_predicition.reshape(x_train_num.shape[0],1)),  axis=1)
            ## Now we normalize our new data consisting of 6 columns
            scaler = preprocessing.StandardScaler()
            x_train_num_standard = scaler.fit_transform(x_train_num)
            ## Now we train data on our final numeric classifier
            clf = svm.SVC(C=100, gamma=1)
            clf.fit(x_train_num_standard, y_train_num)
            
            
            ## So far we train our final classifier, now we test our text_classifier on test data
            test_featureset, test_labels = separate_test_set(test_set)
            test_text_prediction = predict_labels(test_featureset, LinearSVC_classifier)
            
            ## Now we print the result of text classifier on test data
            text_accuracy.append(accuracy_score(test_labels, test_text_prediction))
            text_recall.append(recall_score(test_labels, test_text_prediction, average='weighted'))
            text_precision.append(precision_score(test_labels, test_text_prediction, average='weighted'))
            text_f1.append(f1_score(test_labels, test_text_prediction, average='weighted'))
            print('This is the classification report with using text features only:')
            print('\n')
            print(classification_report(test_labels, test_text_prediction))
            
            ## Now we consider the text classifier prediction as a new feature for our numeric dataset
            ## Convert the predicition of text classifier (Real, Fake) to (1, 0) 
            _le = preprocessing.LabelEncoder()
            test_text_prediction = _le.fit_transform(test_text_prediction)
            
            
            ## Now we build our test dataset
            _num_test_set = copy.deepcopy(np.array(num_test_set))
            ## These are numerical features
            x_test_num = _num_test_set[:, :5]
            y_test_num = _num_test_set[:, -1]
            ## Add text feature to numerical features
            x_test_num = np.concatenate((x_test_num, test_text_prediction.reshape(x_test_num.shape[0],1)),  axis=1)
            ## Now we normalize our test_dataset with the standardizer that we already have
            x_test_num_standard = scaler.transform(x_test_num)
            
            ## Now we have test data, it's time for testing our final classifier --> clf
            final_prediction = clf.predict(x_test_num_standard)
            
            ## This is the final metric for our combination of both textual and numerical classifiers
            accuracy.append(accuracy_score(test_labels, final_prediction))
            recall.append(recall_score(test_labels, final_prediction, average='weighted'))
            precision.append(precision_score(test_labels, final_prediction, average='weighted'))
            f1.append(f1_score(test_labels, final_prediction, average='weighted'))
            print('This is the classification report using the combination of numerical and text features:')
            print('\n')
            print(classification_report(test_labels, final_prediction))
            print('---------------------------------------------------------')
            print('---------------------------------------------------------')
            break
            
        else:
            if test_start_idx == 0:
                test_set = dataset[test_start_idx:(test_start_idx+fold_size)]
                num_test_set = num_dataset[test_start_idx:(test_start_idx+fold_size)]
                
                train_set = dataset[test_start_idx+fold_size:]
                ## Textual dataset for trainin --> first half of train data for training of textual classifier
                train_set_text = train_set[0:3686]
                
                ## Textual dataset for testing --> second half of train data for testing of textual classifier
                test_set_text = train_set[3686:]
                
                # print('This is train_set size: {}'.format(len(train_set)))
                num_train_set = num_dataset[test_start_idx+fold_size:]
                num_train_set = num_train_set[3686:]
                print("test fold form {} to {}:".format(test_start_idx, test_start_idx+fold_size-1))
            else:
                test_set = dataset[test_start_idx:(test_start_idx+fold_size)]
                num_test_set = num_dataset[test_start_idx:(test_start_idx+fold_size)]
                
                train_set = dataset[0:test_start_idx] + dataset[(test_start_idx+fold_size):]
                ## Textual dataset for trainin --> first half of train data for training of textual classifier
                train_set_text = train_set[0:3686]
                ## Textual dataset for testing --> second half of train data for testing of textual classifier
                test_set_text = train_set[3686:]
                
                # print('This is train_set size: {}'.format(len(train_set)))
                num_train_set = num_dataset[0:test_start_idx] + num_dataset[(test_start_idx+fold_size):]
                
                ## Numerical dataset for training
                num_train_set = num_train_set[3686:]
                print("test fold form {} to {}:".format(test_start_idx, test_start_idx+fold_size-1))
            
            ## TF-IDF
            global_feature_dict = Counter()
            global_sentence_dict = defaultdict(set)
            build_global_dicts(train_set_text, global_feature_dict, global_sentence_dict)
            tmp_train = copy.deepcopy(train_set_text)
            tmp_test = copy.deepcopy(test_set_text)
            ## Assign TF-IDF weights here
            assign_weights(tmp_train, global_sentence_dict)
            assign_weights(tmp_test, global_sentence_dict)
             
            ## Train text classifier
            LinearSVC_classifier = train_classifier(tmp_train)
            ## Text feature is prediction which will be concated to numerical dataset
            text_featureset, a = separate_test_set(tmp_test)
            train_text_predicition = np.array(predict_labels(text_featureset, LinearSVC_classifier))
            
            ## Convert the predicition of text classifier (Real, Fake) to (1, 0) 
            le = preprocessing.LabelEncoder()
            train_text_predicition = le.fit_transform(train_text_predicition)
            
            ## Now we prepare data for our final model --> Numeric data from dataset + Prediction of text classifier on each sample in numerical train set
            ## Standardization of numeric data
            
            _num_train_set = copy.deepcopy(np.array(num_train_set))
            x_train_num = _num_train_set[:, :5]
            y_train_num = _num_train_set[:, -1]
            ## Add text feature to num_data
            x_train_num = np.concatenate((x_train_num, train_text_predicition.reshape(x_train_num.shape[0],1)),  axis=1)
            ## Now we normalize our new data consisting of 6 columns
            scaler = preprocessing.StandardScaler()
            x_train_num_standard = scaler.fit_transform(x_train_num)
            ## Now we train data on our final numeric classifier
            clf = svm.SVC(C=100, gamma=1)
            clf.fit(x_train_num_standard, y_train_num)
            
            
            ## So far we train our final classifier, now we test our text_classifier on test data
            test_featureset, test_labels = separate_test_set(test_set)
            test_text_prediction = predict_labels(test_featureset, LinearSVC_classifier)
            
            ## Now we print the result of text classifier on test data
            text_accuracy.append(accuracy_score(test_labels, test_text_prediction))
            text_recall.append(recall_score(test_labels, test_text_prediction, average='weighted'))
            text_precision.append(precision_score(test_labels, test_text_prediction, average='weighted'))
            text_f1.append(f1_score(test_labels, test_text_prediction, average='weighted'))
            print('This is the classification report with using text features only:')
            print('\n')
            print(classification_report(test_labels, test_text_prediction))
            
            ## Now we consider the text classifier prediction as a new feature for our numeric dataset
            ## Convert the predicition of text classifier (Real, Fake) to (1, 0) 
            _le = preprocessing.LabelEncoder()
            test_text_prediction = _le.fit_transform(test_text_prediction)
            
            
            ## Now we build our test dataset
            _num_test_set = copy.deepcopy(np.array(num_test_set))
            ## These are numerical features
            x_test_num = _num_test_set[:, :5]
            y_test_num = _num_test_set[:, -1]
            ## Add text feature to numerical features
            x_test_num = np.concatenate((x_test_num, test_text_prediction.reshape(x_test_num.shape[0],1)),  axis=1)
            ## Now we normalize our test_dataset with the standardizer that we already have
            x_test_num_standard = scaler.transform(x_test_num)
            
            ## Now we have test data, it's time for testing our final classifier --> clf
            final_prediction = clf.predict(x_test_num_standard)
            
            ## This is the final metric for our combination of both textual and numerical classifiers
            accuracy.append(accuracy_score(test_labels, final_prediction))
            recall.append(recall_score(test_labels, final_prediction, average='weighted'))
            precision.append(precision_score(test_labels, final_prediction, average='weighted'))
            f1.append(f1_score(test_labels, final_prediction, average='weighted'))
            print('This is the classification report using the combination of numerical and text features:')
            print('\n')
            print(classification_report(test_labels, final_prediction))
            print('---------------------------------------------------------')
            print('---------------------------------------------------------')
        
        # FILL IN THE METHOD HERE
    return [np.mean(accuracy), np.mean(recall), np.mean(precision), np.mean(f1)], [np.mean(text_accuracy), np.mean(text_recall), np.mean(text_precision), np.mean(text_f1)]

In [10]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

## This function just seperate a dataset into features and labels.
def separate_test_set(test_set):
    feature_set = []
    labels = []
    for pair in test_set:
        feature_set.append(pair[0])
        labels.append(pair[1])
    return feature_set, labels

In [11]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path, raw_data) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

train_num_data = []
test_num_data = []
split_and_preprocess_data(raw_data=raw_data, train_data=train_data, test_data=test_data, train_num_data=train_num_data, 
                          test_num_data=test_num_data, percentage=0.8)

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), sep='\n')

# print(train_data)
# print('---------------------------------')
# print(test_data)
# print(train_data[1023:1028])

# cross_validate(dataset=train_data, folds=10)


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10241 rawData, 8192 trainData, 2049 testData
Training Samples: 
8192


In [12]:
print(len(train_num_data))

8192


In [13]:
## A sample of dataset before applying tf-idf (I applied it in cross validation loop for each iteration separately.)
print(train_data[1980])

(Counter({'welfare': 2, 'congress': 1, 'approves': 1, 'bill': 1, 'offering': 1, 'free': 1, 'car': 1, 'recipient': 1, 'web': 1, 'post': 1, 'congress approves': 1, 'approves bill': 1, 'bill offering': 1, 'offering free': 1, 'free car': 1, 'car welfare': 1, 'welfare recipient': 1, 'recipient welfare': 1, 'welfare web': 1, 'web post': 1, 'congress approves bill': 1, 'approves bill offering': 1, 'bill offering free': 1, 'offering free car': 1, 'free car welfare': 1, 'car welfare recipient': 1, 'welfare recipient welfare': 1, 'recipient welfare web': 1, 'welfare web post': 1, 'americannewscom': 1}), 'FAKE')


In [14]:
## Corresponding raw data to compare with above, to see the effect of pre-processing
print(raw_data[1980])

('Congress Approves Bill Offering Free Cars To Welfare Recipients.', 'welfare', 'americannewscom', '', '', 'none', '0', '0', '0', '0', '2', 'a web post', 'FAKE')


# Hyper-parameter Tuning

In [15]:
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV


def hyperparameter_tuning(train_data):

    global_feature_dict = Counter()
    global_sentence_dict = defaultdict(set)
    split_idx = int(len(train_data)/2)
    
    text_train = copy.deepcopy(train_data[:split_idx])
    text_test = copy.deepcopy(train_data[split_idx:])
    
    num_train = copy.deepcopy(train_num_data[split_idx:])
    # num_test = test_num_data[split_idx:]
    
    ## TF-IDF
    build_global_dicts(text_train, global_feature_dict, global_sentence_dict)
    assign_weights(text_train, global_sentence_dict)
    assign_weights(text_test, global_sentence_dict)
    LinearSVC_classifier = train_classifier(text_train)
    test_featureset, test_labels = separate_test_set(text_test)
    train_text_prediction = np.array(predict_labels(test_featureset, LinearSVC_classifier))
    
    ## Now we add 'text_prediction' as a feature to 'num_train'
    ## Normalizing
    le = preprocessing.LabelEncoder()
    train_text_prediction = le.fit_transform(train_text_prediction)
    
    num_train = copy.deepcopy(np.array(num_train))
    x_train_num = num_train[:, :5]
    y_train_num = num_train[:, -1]
    
    x_train_num = np.concatenate((x_train_num, train_text_prediction.reshape(x_train_num.shape[0],1)),  axis=1)
    ## Now we normalize our new data consisting of 6 columns
    scaler = preprocessing.StandardScaler()
    x_train_num_standard = scaler.fit_transform(x_train_num)
    
    ## Now we do hyperparameter optimization
    
    # defining parameter range
    param_grid = {'C': [1, 10, 100, 1000], 
              'gamma': [10, 1, 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'sigmoid']} 
  
    
    grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
  
    # fitting the model for grid search
    grid.fit(x_train_num_standard, y_train_num)
    print('The best possible parameters are:')
    print(grid.best_params_)

hyperparameter_tuning(train_data)

Training Classifier...
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .........C=1, gamma=10, kernel=rbf;, score=0.727 total time=   0.3s
[CV 2/5] END .........C=1, gamma=10, kernel=rbf;, score=0.713 total time=   0.3s
[CV 3/5] END .........C=1, gamma=10, kernel=rbf;, score=0.706 total time=   0.3s
[CV 4/5] END .........C=1, gamma=10, kernel=rbf;, score=0.745 total time=   0.3s
[CV 5/5] END .........C=1, gamma=10, kernel=rbf;, score=0.734 total time=   0.3s
[CV 1/5] END .....C=1, gamma=10, kernel=sigmoid;, score=0.496 total time=   0.4s
[CV 2/5] END .....C=1, gamma=10, kernel=sigmoid;, score=0.526 total time=   0.4s
[CV 3/5] END .....C=1, gamma=10, kernel=sigmoid;, score=0.487 total time=   0.4s
[CV 4/5] END .....C=1, gamma=10, kernel=sigmoid;, score=0.502 total time=   0.4s
[CV 5/5] END .....C=1, gamma=10, kernel=sigmoid;, score=0.505 total time=   0.4s
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.638 total time=   0.3s
[CV 2/5] END ..........C

# Cross Validation

In [16]:
[accuracy, recall, precision, _f1_score], [text_accuracy, text_recall, text_precision, text_f1_score] = cross_validate(train_data, train_num_data, 10)  # will work and output overall performance of p, r, f-score when cv implemented
print('This is the final result of prediction using text features only:')
print('Accuracy is: {}'.format(text_accuracy))
print('Weighted Recall is: {}'.format(text_recall))
print('Weighted Precision is: {}'.format(text_precision))
print('Weighted F1_score is: {}'.format(text_f1_score))
print('-------------------------------------------------------')
print('This is the final result of prediction using combination of numerical and text features:')
print('Accuracy is: {}'.format(accuracy))
print('Weighted Recall is: {}'.format(recall))
print('Weighted Precision is: {}'.format(precision))
print('Weighted F1_score is: {}'.format(_f1_score))

test fold form 0 to 818:
Training Classifier...
This is the classification report with using text features only:


              precision    recall  f1-score   support

        FAKE       0.53      0.64      0.58       338
        REAL       0.70      0.60      0.64       481

    accuracy                           0.61       819
   macro avg       0.61      0.62      0.61       819
weighted avg       0.63      0.61      0.62       819

This is the classification report using the combination of numerical and text features:


              precision    recall  f1-score   support

        FAKE       0.70      0.66      0.68       338
        REAL       0.77      0.80      0.78       481

    accuracy                           0.74       819
   macro avg       0.73      0.73      0.73       819
weighted avg       0.74      0.74      0.74       819

---------------------------------------------------------
---------------------------------------------------------
test fold form 819 to 163

# Final Test

In [17]:
# Train on 20% Test
def final_training(train_data, train_num_data, test_data, test_num_data):
    global_feature_dict = Counter()
    global_sentence_dict = defaultdict(set)
    ## This index is for halving the train dataset for my method in training which is fully explained in report and alswo inside the cross validation function
    split_idx = int(len(train_data)/2)
    
    ## These two dataset are for training and testing the textual classifier (train of half of train data, test on the other half)
    text_train = copy.deepcopy(train_data[:split_idx])
    text_test = copy.deepcopy(train_data[split_idx:])
    
    ## training dataset with numerical features (second half of dataset, because we train our textual classifiers on the first half and we know the true label)
    num_train = copy.deepcopy(train_num_data[split_idx:])
    
    ## TF-IDF
    build_global_dicts(text_train, global_feature_dict, global_sentence_dict)
    assign_weights(text_train, global_sentence_dict)
    assign_weights(text_test, global_sentence_dict)
    LinearSVC_classifier = train_classifier(text_train)
    test_featureset, test_labels = separate_test_set(text_test)
    train_text_prediction = np.array(predict_labels(test_featureset, LinearSVC_classifier))
    
    ## Now we add 'text_prediction' as a feature to 'num_train'
    ## Normalizing
    le = preprocessing.LabelEncoder()
    train_text_prediction = le.fit_transform(train_text_prediction)
    
    num_train = copy.deepcopy(np.array(num_train))
    x_train_num = num_train[:, :5]
    y_train_num = num_train[:, -1]
    
    x_train_num = np.concatenate((x_train_num, train_text_prediction.reshape(x_train_num.shape[0],1)),  axis=1)
    ## Now we normalize our new data consisting of 6 columns (5 numerical features + 1 features which is prediction by textual classifier)
    scaler = preprocessing.StandardScaler()
    x_train_num_standard = scaler.fit_transform(x_train_num)
    ## Now we train data on our final numeric classifier
    clf = svm.SVC(C=100, gamma=1)
    clf.fit(x_train_num_standard, y_train_num)
    
    ## So far we train our combination of classifiers, now we test our text_classifier on test data, to add it as a new column to the numeric dataset for training numerical classifier
    test_featureset, test_labels = separate_test_set(test_data)
    test_text_prediction = predict_labels(test_featureset, LinearSVC_classifier)
    
    ## Prediction with only text features
    print('This is the result using text features only:')
    print('Accuracy is: {}'.format(accuracy_score(test_labels, test_text_prediction)))
    print('Weighted Recall is: {}'.format(recall_score(test_labels, test_text_prediction, average='weighted')))
    print('Weighted Precision is: {}'.format(precision_score(test_labels, test_text_prediction, average='weighted')))
    print('Weighted F1_score is: {}'.format(f1_score(test_labels, test_text_prediction, average='weighted')))
    print('\n')
    print(classification_report(test_labels, test_text_prediction))
    print('------------------------------------------------------------')
    
    ## Now we consider the text classifier prediction as a new feature for our numeric dataset
    ## Convert the predicition of text classifier (Real, Fake) to (1, 0) 
    _le = preprocessing.LabelEncoder()
    test_text_prediction = _le.fit_transform(test_text_prediction)
    
    ## Now we build our test dataset
    test_num_data = copy.deepcopy(np.array(test_num_data))
    x_test_num = test_num_data[:, :5]
    y_test_num = test_num_data[:, -1]
    ## Add text feature to num_data
    x_test_num = np.concatenate((x_test_num, test_text_prediction.reshape(x_test_num.shape[0],1)),  axis=1)
    ## Now we normalize our test_dataset with the standardizer that we already have
    x_test_num_standard = scaler.transform(x_test_num)

    ## Now we have test data, it's time for testing our final classifier --> clf
    final_prediction = clf.predict(x_test_num_standard)
    
    print('This is the result using the combination of numerical and text features:')
    print('Accuracy is: {}'.format(accuracy_score(test_labels, final_prediction)))
    print('Weighted Recall is: {}'.format(recall_score(test_labels, final_prediction, average='weighted')))
    print('Weighted Precision is: {}'.format(precision_score(test_labels, final_prediction, average='weighted')))
    print('Weighted F1_score is: {}'.format(f1_score(test_labels, final_prediction, average='weighted')))
    print('\n')
    print(classification_report(test_labels, final_prediction))
    
final_training(train_data, train_num_data, test_data, test_num_data)

Training Classifier...
This is the result using text features only:
Accuracy is: 0.6051732552464617
Weighted Recall is: 0.6051732552464617
Weighted Precision is: 0.6073281993614473
Weighted F1_score is: 0.6058770666224415


              precision    recall  f1-score   support

        FAKE       0.56      0.59      0.57       926
        REAL       0.65      0.62      0.63      1123

    accuracy                           0.61      2049
   macro avg       0.60      0.60      0.60      2049
weighted avg       0.61      0.61      0.61      2049

------------------------------------------------------------
This is the result using the combination of numerical and text features:
Accuracy is: 0.7242557345046364
Weighted Recall is: 0.7242557345046364
Weighted Precision is: 0.7250399949024342
Weighted F1_score is: 0.7209054051657503


              precision    recall  f1-score   support

        FAKE       0.73      0.62      0.67       926
        REAL       0.72      0.81      0.76      1