In [249]:
import re
import numpy as np
import pandas as pd
import nltk # natural language toolkit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [250]:
# Select the dataset to run on: exactly one of the three assignments below should be
# uncommented. These are the only names that load_dataset() handles.
# NOTE(review): the outputs saved in this notebook (0/1 labels, 71420 training rows)
# look like they came from one of the binary datasets rather than the four-class
# 'FNC_multiclass' setting — confirm which setting produced them before comparing numbers.

dataset_name = 'FNC_multiclass'
# dataset_name = 'FNC_binary'
# dataset_name = 'NELA_binary'

In [251]:
def _load_fnc_multiclass_split(bodies_csv, stances_csv, label_ids):
    """Read one FNC-1 split and return it with an integer ``Label`` column.

    The stances file is left-joined to the bodies file on ``Body ID``; ``Stance``
    is translated through ``label_ids`` (an unknown stance raises ``KeyError``),
    and the ``Stance`` / ``Body ID`` columns are dropped from the result.
    """
    bodies = pd.read_csv(bodies_csv)
    stances = pd.read_csv(stances_csv)
    merged = pd.merge(stances, bodies, how='left', on='Body ID')
    merged['Label'] = merged['Stance'].map(lambda stance: label_ids[stance])
    return merged.drop(columns=['Stance', 'Body ID'])


def _load_binary_split(csv_path, id_column, renames):
    """Read one ready-made binary CSV, drop ``id_column`` and apply ``renames``."""
    frame = pd.read_csv(csv_path)
    return frame.drop(columns=[id_column]).rename(columns=renames)


def load_dataset(name):
    """Load the train and test frames for the dataset called ``name``.

    Parameters
    ----------
    name : str
        One of ``'FNC_multiclass'``, ``'FNC_binary'`` or ``'NELA_binary'``.

    Returns
    -------
    (train_dataset, test_dataset) : tuple of pandas.DataFrame
        Later cells read the ``Headline``, ``articleBody`` and ``Label`` columns
        from both frames.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names. (Previously the
        function fell through and returned ``None``, which only surfaced later
        as an unrelated unpacking error in the caller.)
    """
    if name == 'FNC_multiclass':
        labels = ['agree', 'disagree', 'discuss', 'unrelated']
        # Stance string -> class index, in the order listed above.
        label_ids = {label: index for index, label in enumerate(labels)}
        train_dataset = _load_fnc_multiclass_split(
            './Data-Set/FNC-1/train_bodies.csv',
            './Data-Set/FNC-1/train_stances.csv',
            label_ids,
        )
        test_dataset = _load_fnc_multiclass_split(
            './Data-Set/FNC-1/competition_test_bodies.csv',
            './Data-Set/FNC-1/competition_test_stances.csv',
            label_ids,
        )
        return train_dataset, test_dataset

    if name == 'FNC_binary':
        # The binary FNC files already hold the class in 'Stance'; only rename it.
        renames = {'Stance': 'Label'}
        train_dataset = _load_binary_split('./Data-Set/FNC_Bin_Train.csv', 'Body ID', renames)
        test_dataset = _load_binary_split('./Data-Set/FNC_Bin_Test.csv', 'Body ID', renames)
        return train_dataset, test_dataset

    if name == 'NELA_binary':
        # Align the NELA body column name with the one the FNC data uses.
        renames = {'Body': 'articleBody'}
        train_dataset = _load_binary_split('./Data-Set/NELA_Train.csv', 'ID', renames)
        test_dataset = _load_binary_split('./Data-Set/NELA_Test.csv', 'ID', renames)
        return train_dataset, test_dataset

    raise ValueError(
        "Unknown dataset name %r; expected 'FNC_multiclass', 'FNC_binary' or 'NELA_binary'" % (name,)
    )

In [252]:
train_dataset, test_dataset = load_dataset(dataset_name)
train_dataset.head()

Unnamed: 0,Headline,articleBody,Label
0,"This week , explained : spies , special counse...",The Russia investigation got real this week . ...,0
1,Jolyon Palmer to leave Renault after Japanese ...,British driver Jolyon Palmer is to leave the R...,1
2,Former FBi Director James Comey Admits That He...,"Liberals have been frothing at the mouth , wai...",0
3,Trump : ' I have n't changed my stance ' on China,President Trump said Tuesday that he has n't s...,1
4,Trump Announces New Cabinet Member With Choice...,Newsmax reported Trump ’ s next move in the st...,0


In [253]:
train_dataset.shape, test_dataset.shape

((71420, 3), (6302, 3))

In [254]:
""" 
    Lemmatization is the process of grouping together the different inflected 
    forms of a word so they can be analyzed as a single item. Lemmatization is 
    similar to stemming but it brings context to the words. So it links words 
    with similar meanings to one word. 
    References - 
    https://www.geeksforgeeks.org/introduction-to-stemming/
    https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
    https://www.geeksforgeeks.org/nlp-how-tokenizing-text-sentence-words-works/
"""

from sklearn import feature_extraction
# Single shared WordNet lemmatizer instance; lemmatize_lowercase() below calls it per token.
lemmatizer = nltk.WordNetLemmatizer()

"""
    Preprocesses a string: Lowercasing, trimming, 
    removing non-alphanumeric
"""
def preprocess(s):
    """Return ``s`` reduced to its word-character runs, space-joined and lowercased.

    Anything that is not matched by the regex ``\\w+`` (punctuation, whitespace)
    acts as a separator and is discarded.
    """
    word_runs = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(word_runs)
    return joined.lower()

def lemmatize_lowercase(w):
    """Lemmatize the single word ``w`` with the shared ``lemmatizer`` and lowercase the result."""
    lemma = lemmatizer.lemmatize(w)
    return lemma.lower()

"""
    split the sentence into words using word tokenizer and 
    then produce a lemmatized lowercase version of each word
"""
def word_tokenize_lemmatize_lowercase(s):
    """Tokenize ``s`` with ``nltk.word_tokenize`` and return the lemmatized, lowercased tokens as a list."""
    tokens = nltk.word_tokenize(s)
    return list(map(lemmatize_lowercase, tokens))


"""
    The stopwords are a list of words that are very very common but don’t 
    provide useful information for most text analysis procedures. That is, 
    these words are ignored during most natural language processing tasks, 
    such as part-of-speech tagging, tokenization and parsing.
    
    While they are helpful for understanding the structure of sentences, they do 
    not help you understand the semantics of the sentences themselves.
"""
def remove_stopwords(l):
    """
        Return the words of l, in order, that are not in
        scikit-learn's ENGLISH_STOP_WORDS set.
    """
    kept = []
    for word in l:
        if word in feature_extraction.text.ENGLISH_STOP_WORDS:
            continue
        kept.append(word)
    return kept


def unigrams(line):
    """
        Turn a raw string into its list of unigrams:
        preprocess -> tokenize + lemmatize + lowercase -> drop stopwords.
    """
    cleaned = preprocess(line)
    tokens = word_tokenize_lemmatize_lowercase(cleaned)
    return remove_stopwords(tokens)

def bigrams(line):
    """
        Turn a raw string into its list of bigrams.
        The string goes through the same cleaning / lemmatizing / stopword
        removal as the unigrams; every pair of adjacent surviving words is
        returned as a 2-element list.
    """
    tokens = remove_stopwords(word_tokenize_lemmatize_lowercase(preprocess(line)))
    # zip of the sequence with itself shifted by one yields each adjacent pair once
    return [list(pair) for pair in zip(tokens, tokens[1:])]


def trigrams(line):
    """
        Turn a raw string into its list of trigrams.
        The string goes through the same cleaning / lemmatizing / stopword
        removal as the unigrams; every run of three adjacent surviving words
        is returned as a 3-element list.
    """
    tokens = remove_stopwords(word_tokenize_lemmatize_lowercase(preprocess(line)))
    # zip of the sequence with itself shifted by one and by two yields each adjacent triple once
    return [list(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]

In [255]:
# Work on a fixed-size random subsample (5000 train rows, 1000 test rows); random_state=1
# makes the draw repeatable. The names are rebound, so the full frames are no longer
# reachable after this cell without calling load_dataset() again.
train_dataset = train_dataset.sample(n=5000, random_state=1)
test_dataset = test_dataset.sample(n=1000, random_state=1)

In [256]:
train_dataset.shape, test_dataset.shape

((5000, 3), (1000, 3))

In [257]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pradnesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pradnesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [258]:
# Build the unigram token lists for headline and body, train split first, then test.
# unigrams is passed to .map directly; no lambda wrapper is needed.
train_dataset["Headline_unigrams"] = train_dataset["Headline"].map(unigrams)
print("headline unigrams train generated")
train_dataset["articleBody_unigrams"] = train_dataset["articleBody"].map(unigrams)
print("body unigrams train generated")
test_dataset["Headline_unigrams"] = test_dataset["Headline"].map(unigrams)
print("headline unigrams test generated")
test_dataset["articleBody_unigrams"] = test_dataset["articleBody"].map(unigrams)
print("body unigrams test generated")

headline unigrams train generated
body unigrams train generated
headline unigrams test generated
body unigrams test generated


In [259]:
# # generate bigrams
# train_dataset["Headline_bigrams"] = train_dataset["Headline"].map(lambda x: bigrams(x))
# print("headline bigrams train generated")
# train_dataset["articleBody_bigrams"] = train_dataset["articleBody"].map(lambda x: bigrams(x))
# print("body bigrams train generated")
# test_dataset["Headline_bigrams"] = test_dataset["Headline"].map(lambda x: bigrams(x))
# print("headline bigrams test generated")
# test_dataset["articleBody_bigrams"] = test_dataset["articleBody"].map(lambda x: bigrams(x))
# print("body bigrams test generated")

In [260]:
# # generate trigrams
# train_dataset["Headline_trigrams"] = train_dataset["Headline"].map(lambda x: trigrams(x))
# print("headline trigrams train generated")
# train_dataset["articleBody_trigrams"] = train_dataset["articleBody"].map(lambda x: trigrams(x))
# print("body trigrams train generated")
# test_dataset["Headline_trigrams"] = test_dataset["Headline"].map(lambda x: trigrams(x))
# print("headline trigrams test generated")
# test_dataset["articleBody_trigrams"] = test_dataset["articleBody"].map(lambda x: trigrams(x))
# print("body trigrams test generated")

In [261]:
train_dataset.head()

Unnamed: 0,Headline,articleBody,Label,Headline_unigrams,articleBody_unigrams
49255,21K Illegal Aliens Arrested Under Trump So Far,According to newly released statistics by Immi...,1,"[21k, illegal, alien, arrested, trump, far]","[according, newly, released, statistic, immigr..."
55535,University of Wisconsin Passes Dangerous New P...,It ’ s good to know that the First Amendment i...,1,"[university, wisconsin, pass, dangerous, new, ...","[s, good, know, amendment, protected, college,..."
6633,No Diversity Here : Officially Recognized 'Fac...,Posts over the next several days will show tha...,1,"[diversity, officially, recognized, fact, chec...","[post, day, certain, left, leaning, website, e..."
68666,CONFIRMED : Syrian Arab Army takes control of ...,Fierce fighting with ISIS and US backed jihadi...,0,"[confirmed, syrian, arab, army, control, jorda...","[fierce, fighting, isi, u, backed, jihadist, s..."
9359,Jets are seeing a whole new side of Muhammad W...,According to David Z. Morris at Fortune magazi...,1,"[jet, seeing, new, muhammad, wilkerson]","[according, david, z, morris, fortune, magazin..."


In [262]:
"""
    Join the unigrams of headline and body as a single string
"""
def combine_headline_body_unigrams(x):
    """Return the headline unigrams followed by the body unigrams of row ``x`` as one space-separated string."""
    headline_text = ' '.join(x['Headline_unigrams'])
    body_text = ' '.join(x['articleBody_unigrams'])
    return '%s %s' % (headline_text, body_text)

In [263]:
train_dataset["headline_plus_body"] = train_dataset.apply(combine_headline_body_unigrams, axis=1)
test_dataset["headline_plus_body"] = test_dataset.apply(combine_headline_body_unigrams, axis=1)

In [264]:
train_dataset.head()

Unnamed: 0,Headline,articleBody,Label,Headline_unigrams,articleBody_unigrams,headline_plus_body
49255,21K Illegal Aliens Arrested Under Trump So Far,According to newly released statistics by Immi...,1,"[21k, illegal, alien, arrested, trump, far]","[according, newly, released, statistic, immigr...",21k illegal alien arrested trump far according...
55535,University of Wisconsin Passes Dangerous New P...,It ’ s good to know that the First Amendment i...,1,"[university, wisconsin, pass, dangerous, new, ...","[s, good, know, amendment, protected, college,...",university wisconsin pass dangerous new policy...
6633,No Diversity Here : Officially Recognized 'Fac...,Posts over the next several days will show tha...,1,"[diversity, officially, recognized, fact, chec...","[post, day, certain, left, leaning, website, e...",diversity officially recognized fact checker l...
68666,CONFIRMED : Syrian Arab Army takes control of ...,Fierce fighting with ISIS and US backed jihadi...,0,"[confirmed, syrian, arab, army, control, jorda...","[fierce, fighting, isi, u, backed, jihadist, s...",confirmed syrian arab army control jordanian b...
9359,Jets are seeing a whole new side of Muhammad W...,According to David Z. Morris at Fortune magazi...,1,"[jet, seeing, new, muhammad, wilkerson]","[according, david, z, morris, fortune, magazin...",jet seeing new muhammad wilkerson according da...


In [265]:
# Fit a TF-IDF vectorizer on the combined headline+body strings of the TRAINING data only.
# It considers 1- to 3-grams and keeps at most 2500 terms (max_features). The following
# cells take its vocabulary_ and reuse it for the separate headline and body vectorizers.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), max_features=2500)
tfidf_vec.fit(train_dataset['headline_plus_body'])

TfidfVectorizer(max_features=2500, ngram_range=(1, 3))

In [266]:
tfidf_vocab = tfidf_vec.vocabulary_
print(len(list(tfidf_vocab.items())))
list(tfidf_vocab.items())[:5]

2500


[('illegal', 1153),
 ('alien', 126),
 ('arrested', 189),
 ('trump', 2299),
 ('far', 917)]

In [267]:
# Headline features: TF-IDF restricted to the shared vocabulary (tfidf_vocab), so the
# columns are the same terms as in the body features. The vectorizer is fitted on the
# training headlines (fit_transform); the test headlines are only transformed.
headline_vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 3), vocabulary=tfidf_vocab)
Features_Train_Headline_Tfidf = headline_vectorizer_tfidf.fit_transform(train_dataset['Headline_unigrams'].map(lambda x: ' '.join(x)))
Features_Test_Headline_Tfidf = headline_vectorizer_tfidf.transform(test_dataset['Headline_unigrams'].map(lambda x: ' '.join(x)))
Features_Train_Headline_Tfidf.shape, Features_Test_Headline_Tfidf.shape

((5000, 2500), (1000, 2500))

In [268]:
# Body features: a second TF-IDF vectorizer over the same shared vocabulary (tfidf_vocab),
# fitted on the training bodies (fit_transform); the test bodies are only transformed.
articleBody_vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 3), vocabulary=tfidf_vocab)
Features_Train_Body_Tfidf = articleBody_vectorizer_tfidf.fit_transform(train_dataset['articleBody_unigrams'].map(lambda x: ' '.join(x)))
Features_Test_Body_Tfidf = articleBody_vectorizer_tfidf.transform(test_dataset['articleBody_unigrams'].map(lambda x: ' '.join(x)))
Features_Train_Body_Tfidf.shape, Features_Test_Body_Tfidf.shape

((5000, 2500), (1000, 2500))

In [269]:
Features_Train_Headline_Tfidf.shape, Features_Train_Body_Tfidf.shape, Features_Test_Headline_Tfidf.shape, Features_Test_Body_Tfidf.shape

((5000, 2500), (5000, 2500), (1000, 2500), (1000, 2500))

In [270]:
type(Features_Train_Headline_Tfidf)

scipy.sparse.csr.csr_matrix

In [271]:
train_Features_Tfidf = [Features_Train_Headline_Tfidf.toarray(), Features_Train_Body_Tfidf.toarray()]
test_Features_Tfidf = [Features_Test_Headline_Tfidf.toarray(), Features_Test_Body_Tfidf.toarray()]

In [272]:
type(train_Features_Tfidf[0])

numpy.ndarray

In [273]:
train_x = np.hstack(train_Features_Tfidf)
test_x = np.hstack(test_Features_Tfidf)
type(train_x)

numpy.ndarray

In [274]:
train_x.shape

(5000, 5000)

In [275]:
train_y = train_dataset["Label"].values
test_y = test_dataset["Label"].values

In [276]:
# saving RAM
import gc #garbage collector
del train_dataset, test_dataset
del Features_Test_Headline_Tfidf, Features_Test_Body_Tfidf, Features_Train_Body_Tfidf, Features_Train_Headline_Tfidf
del train_Features_Tfidf, test_Features_Tfidf

gc.collect()

0

In [277]:
from sklearn import svm

In [278]:
# use svms linear svc - one vs all multiclass classification
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x, train_y)

LinearSVC()

In [279]:
svm_predictions = svm_classifier.predict(test_x)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm

0.546

In [280]:
from sklearn.naive_bayes import GaussianNB
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x, train_y)

GaussianNB()

In [281]:
gnb_predictions = gnb_classifier.predict(test_x)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.495

In [282]:
from sklearn.tree import DecisionTreeClassifier
dectree_classifier = DecisionTreeClassifier(max_depth=2)
dectree_classifier.fit(train_x, train_y)

DecisionTreeClassifier(max_depth=2)

In [283]:
dectree_predictions = dectree_classifier.predict(test_x)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.607

In [284]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x, train_y)

RandomForestClassifier(max_depth=2, random_state=0)

In [285]:
rndforest_predictions = random_forest_classifier.predict(test_x)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.599

In [286]:
#################################################################
######### Experiment 2 begins - Feature Selection ###############
#################################################################

In [287]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

In [288]:
train_x.shape, train_y.shape

((5000, 5000), (5000,))

In [289]:
# Feature selection with the chi-square score: keep the 500 best-scoring columns.
# Note: the selector is fitted on the training features/labels only (no test or dev data);
# the same selected columns are then taken from both train_x and test_x.
chi_square = SelectKBest(chi2, k=500)
chi_square.fit(train_x, train_y)
train_x_chi_square = chi_square.transform(train_x)
test_x_chi_square = chi_square.transform(test_x)
train_x_chi_square.shape, test_x_chi_square.shape

((5000, 500), (1000, 500))

In [290]:
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x_chi_square, train_y)

LinearSVC()

In [291]:
# accuracy of the svm classifier increased slightly due to feature selection and it took less time to train
svm_predictions = svm_classifier.predict(test_x_chi_square)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm

0.569

In [292]:
# GaussianNB on the 500 chi-square-selected features. In the recorded run the accuracy
# dropped slightly (0.487 vs 0.495 with all 5000 features). The author reports faster
# training; no timing is recorded in this notebook.
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x_chi_square, train_y)
gnb_predictions = gnb_classifier.predict(test_x_chi_square)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.487

In [293]:
# Decision tree (max_depth=2) on the 500 chi-square-selected features. In the recorded run
# the accuracy is unchanged (0.607, same as with all 5000 features). The author reports
# faster training; no timing is recorded in this notebook.
dectree_classifier = DecisionTreeClassifier(max_depth=2).fit(train_x_chi_square, train_y)
dectree_predictions = dectree_classifier.predict(test_x_chi_square)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.607

In [294]:
# Random forest (max_depth=2) on the 500 chi-square-selected features. In the recorded run
# the accuracy dropped slightly (0.592 vs 0.599 with all 5000 features). The author reports
# faster training; no timing is recorded in this notebook.
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x_chi_square, train_y)
rndforest_predictions = random_forest_classifier.predict(test_x_chi_square)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.592

In [295]:
train_x.shape, train_y.shape

((5000, 5000), (5000,))

In [296]:
# Feature selection with mutual information: keep the 500 best-scoring columns.
# The selector is fitted on the training data only and then applied to train_x and test_x.
# NOTE(review): mutual_info_classif is used without a random_state, so the scores (and the
# selected columns) may differ between runs — confirm, and pin a seed if exact
# reproducibility of the numbers below matters.
mutual_information = SelectKBest(mutual_info_classif, k = 500)
mutual_information.fit(train_x, train_y)
train_x_mutual_info = mutual_information.transform(train_x)
test_x_mutual_info = mutual_information.transform(test_x)
train_x_mutual_info.shape, test_x_mutual_info.shape

((5000, 500), (1000, 500))

In [297]:
# accuracy of the svm classifier increased due to feature selection and it took less time to train
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x_mutual_info, train_y)
svm_predictions = svm_classifier.predict(test_x_mutual_info)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm

0.573

In [298]:
# GaussianNB on the 500 mutual-information-selected features. In the recorded run the
# accuracy dropped slightly (0.486 vs 0.495 with all 5000 features). The author reports
# faster training; no timing is recorded in this notebook.
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x_mutual_info, train_y)
gnb_predictions = gnb_classifier.predict(test_x_mutual_info)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.486

In [299]:
# Decision tree (max_depth=2) on the 500 mutual-information-selected features. In the
# recorded run the accuracy is unchanged (0.607, same as with all 5000 features). The
# author reports faster training; no timing is recorded in this notebook.
dectree_classifier = DecisionTreeClassifier(max_depth=2).fit(train_x_mutual_info, train_y)
dectree_predictions = dectree_classifier.predict(test_x_mutual_info)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.607

In [300]:
# Random forest (max_depth=2) on the 500 mutual-information-selected features. In the
# recorded run the accuracy dropped (0.584 vs 0.599 with all 5000 features). The author
# reports faster training; no timing is recorded in this notebook.
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x_mutual_info, train_y)
rndforest_predictions = random_forest_classifier.predict(test_x_mutual_info)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.584

In [301]:
######################################################################
############Experiment3 - PCA, SVD, Word embedding ###################
######################################################################

In [302]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 500)
pca.fit(train_x) 
# for dimension reduction, we fit only on the features and not labels unlike feature selection

PCA(n_components=500)

In [303]:
train_x_pca = pca.transform(train_x)
test_x_pca = pca.transform(test_x)
train_x_pca.shape, test_x_pca.shape

((5000, 500), (1000, 500))

In [304]:
# accuracy of the svm classifier increased slightly due to pca and it took less time to train
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x_pca, train_y)
svm_predictions = svm_classifier.predict(test_x_pca)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm

0.57

In [305]:
# accuracy of the gnb classifier increased slightly due to pca and it took less time to train
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x_pca, train_y)
gnb_predictions = gnb_classifier.predict(test_x_pca)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.573

In [306]:
# accuracy of the dectree classifier decreased slightly due to pca and it took less time to train
dectree_classifier = DecisionTreeClassifier(max_depth=2)
dectree_classifier.fit(train_x_pca, train_y)
dectree_predictions = dectree_classifier.predict(test_x_pca)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.581

In [307]:
# Random forest (max_depth=2) on the 500 PCA components. In the recorded run the accuracy
# dropped (0.575 vs 0.599 on the original 5000 features). The author reports faster
# training; no timing is recorded in this notebook.
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x_pca, train_y)
rndforest_predictions = random_forest_classifier.predict(test_x_pca)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.575

In [308]:
# lsa/lsi - perform dimensionality reduction using svd
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, n_iter=10, random_state=42)
svd.fit(train_x)

TruncatedSVD(n_components=500, n_iter=10, random_state=42)

In [309]:
train_x_svd = svd.transform(train_x)
test_x_svd = svd.transform(test_x)
train_x_svd.shape, test_x_svd.shape

((5000, 500), (1000, 500))

In [310]:
# accuracy of the svm classifier increased slightly due to svd and it took less time to train
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x_svd, train_y)
svm_predictions = svm_classifier.predict(test_x_svd)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm


0.582

In [311]:
# GaussianNB (not SVM) on the 500 truncated-SVD components. In the recorded run the
# accuracy increased (0.565 vs 0.495 on the original 5000 features). The author reports
# faster training; no timing is recorded in this notebook.
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x_svd, train_y)
gnb_predictions = gnb_classifier.predict(test_x_svd)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.565

In [312]:
# Decision tree (max_depth=2) on the 500 truncated-SVD components. In the recorded run the
# accuracy dropped slightly (0.602 vs 0.607 on the original 5000 features). The author
# reports faster training; no timing is recorded in this notebook.
dectree_classifier = DecisionTreeClassifier(max_depth=2).fit(train_x_svd, train_y)
dectree_predictions = dectree_classifier.predict(test_x_svd)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.602

In [313]:
# Random forest (max_depth=2) on the 500 truncated-SVD components. In the recorded run the
# accuracy dropped (0.585 vs 0.599 on the original 5000 features). The author reports
# faster training; no timing is recorded in this notebook.
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x_svd, train_y)
rndforest_predictions = random_forest_classifier.predict(test_x_svd)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.585

In [314]:
####################################################
#############  Word Embedding ######################
####################################################

In [315]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction
from sklearn.metrics import accuracy_score
import re
import nltk

In [316]:
train_dataset, test_dataset = load_dataset(dataset_name)
train_dataset.head()

Unnamed: 0,Headline,articleBody,Label
0,"This week , explained : spies , special counse...",The Russia investigation got real this week . ...,0
1,Jolyon Palmer to leave Renault after Japanese ...,British driver Jolyon Palmer is to leave the R...,1
2,Former FBi Director James Comey Admits That He...,"Liberals have been frothing at the mouth , wai...",0
3,Trump : ' I have n't changed my stance ' on China,President Trump said Tuesday that he has n't s...,1
4,Trump Announces New Cabinet Member With Choice...,Newsmax reported Trump ’ s next move in the st...,0


In [317]:
train_dataset.shape, test_dataset.shape

((71420, 3), (6302, 3))

In [318]:
train_dataset = train_dataset.sample(n=5000, random_state=1)
test_dataset = test_dataset.sample(n=1000, random_state=1)

In [319]:
"""
    Preprocesses a string: Trimming, removing non-alphanumeric, lowercasing
"""
def preprocess(s):
    """Collect every word-character run of ``s``, join the runs with single spaces and lowercase the result."""
    pieces = []
    for match in re.finditer(r'\w+', s, flags=re.UNICODE):
        pieces.append(match.group())
    return " ".join(pieces).lower()

"""
    split the sentence into words using word tokenizer
"""
def word_tokenize(s):
    """Return the tokens that ``nltk.word_tokenize`` produces for ``s`` as a new list."""
    return list(nltk.word_tokenize(s))


"""
    The stopwords are a list of words that are very very common but don’t 
    provide useful information for most text analysis procedures. That is, 
    these words are ignored during most natural language processing tasks, 
    such as part-of-speech tagging, tokenization and parsing.
    
    While they are helpful for understanding the structure of sentences, they do 
    not help you understand the semantics of the sentences themselves.
"""
def remove_stopwords(l):
    """
        Return the tokens of l, in order, that are not in scikit-learn's
        ENGLISH_STOP_WORDS set (the tokens are not lemmatized in this section).
    """
    return list(filter(lambda token: token not in feature_extraction.text.ENGLISH_STOP_WORDS, l))


def unigrams(line):
    """
        Turn a raw string into its list of unigrams for the embedding experiment:
        preprocess -> tokenize -> drop stopwords (no lemmatization here).
    """
    return remove_stopwords(word_tokenize(preprocess(line)))

In [320]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pradnesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pradnesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [321]:
# Build the (non-lemmatized) unigram token lists for headline and body, train split first,
# then test. unigrams is passed to .map directly; no lambda wrapper is needed.
train_dataset["Headline_unigrams"] = train_dataset["Headline"].map(unigrams)
print("headline unigrams train generated")
train_dataset["articleBody_unigrams"] = train_dataset["articleBody"].map(unigrams)
print("body unigrams train generated")
test_dataset["Headline_unigrams"] = test_dataset["Headline"].map(unigrams)
print("headline unigrams test generated")
test_dataset["articleBody_unigrams"] = test_dataset["articleBody"].map(unigrams)
print("body unigrams test generated")

headline unigrams train generated
body unigrams train generated
headline unigrams test generated
body unigrams test generated


In [322]:
"""
    Join the unigrams of headline and body as a single string
"""
def combine_headline_body_unigrams(x):
    """Build one string for row ``x``: its headline unigrams, a space, then its body unigrams (each space-joined)."""
    headline_tokens, body_tokens = x['Headline_unigrams'], x['articleBody_unigrams']
    return '%s %s' % (' '.join(headline_tokens), ' '.join(body_tokens))

In [323]:
train_dataset["headline_plus_body"] = train_dataset.apply(combine_headline_body_unigrams, axis=1)
test_dataset["headline_plus_body"] = test_dataset.apply(combine_headline_body_unigrams, axis=1)

In [324]:
# saving RAM
import gc #garbage collector

# Drop everything except Label and headline_plus_body, in place. The *_bigrams columns are
# not created anywhere in this section; errors='ignore' makes the drop skip missing columns
# instead of raising.
train_dataset.drop(['Headline_bigrams', 'articleBody_bigrams', 'Headline_unigrams', 'articleBody_unigrams', 'Headline', 'articleBody'], axis=1, errors='ignore', inplace=True)
test_dataset.drop(['Headline_bigrams', 'articleBody_bigrams', 'Headline_unigrams', 'articleBody_unigrams', 'Headline', 'articleBody'], axis=1, errors='ignore', inplace=True)

# Force a collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

672

In [325]:
train_dataset.head()

Unnamed: 0,Label,headline_plus_body
49255,1,21k illegal aliens arrested trump far accordin...
55535,1,university wisconsin passes dangerous new poli...
6633,1,diversity officially recognized fact checkers ...
68666,0,confirmed syrian arab army takes control jorda...
9359,1,jets seeing new muhammad wilkerson according d...


In [326]:
def read_glove_vecs(glove_file):
    """Load a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first field is the word and
    the remaining fields are parsed as a ``float64`` NumPy vector.

    Parameters
    ----------
    glove_file : str
        Path to the embeddings text file.

    Returns
    -------
    dict
        Maps each word to its ``numpy.ndarray`` embedding. If a word appears on
        several lines, the last occurrence wins.
    """
    word_to_vec_map = {}
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding,
    # which can fail or mis-decode non-ASCII tokens on some systems.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    return word_to_vec_map

In [327]:
word_to_vec_map = read_glove_vecs('./Data-Set/glove.6B.300d.txt')
type(word_to_vec_map)

dict

In [328]:
def sentence_to_avg(sentence, embeddings=None, dim=300):
    """Represent a sentence by the average of its words' embeddings.

    Parameters
    ----------
    sentence : str
        Already-preprocessed text; it is split on whitespace into words.
    embeddings : mapping of str -> numpy.ndarray, optional
        Word-vector lookup table. When omitted, the notebook-level
        ``word_to_vec_map`` is used, which matches the original behaviour.
    dim : int, optional
        Length of the embedding vectors (default 300). It must match the
        vectors stored in ``embeddings``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``dim``: the mean of the embeddings of the words found
        in the table. Words missing from the table are ignored; if no word is
        found (including an empty sentence) the zero vector is returned.
    """
    if embeddings is None:
        # Resolved at call time, so the table only has to exist when the default is used.
        embeddings = word_to_vec_map

    total = np.zeros(dim)
    found = 0

    for word in sentence.split():
        if word in embeddings:
            found += 1
            total += embeddings[word]

    # Guard against division by zero when none of the words has an embedding.
    if found > 0:
        total = total / found

    return total

In [329]:
train_dataset["sentence embedding"] = train_dataset["headline_plus_body"].map(lambda x: sentence_to_avg(x))
test_dataset["sentence embedding"] = test_dataset["headline_plus_body"].map(lambda x: sentence_to_avg(x))

train_dataset.head()

Unnamed: 0,Label,headline_plus_body,sentence embedding
49255,1,21k illegal aliens arrested trump far accordin...,"[-0.017819887739463573, 0.009724260919540227, ..."
55535,1,university wisconsin passes dangerous new poli...,"[-0.001058549019607837, -0.06027169187675072, ..."
6633,1,diversity officially recognized fact checkers ...,"[-0.07587987692307689, 0.09218232347140036, 0...."
68666,0,confirmed syrian arab army takes control jorda...,"[0.04090239049999999, -0.0015157960000000124, ..."
9359,1,jets seeing new muhammad wilkerson according d...,"[-0.030955657575757575, 0.07464621212121214, 0..."


In [330]:
train_x = train_dataset["sentence embedding"].tolist()
train_x = np.asarray(train_x)

test_x = test_dataset["sentence embedding"].tolist()
test_x = np.asarray(test_x)

train_x.shape, test_x.shape

((5000, 300), (1000, 300))

In [331]:
train_y = train_dataset["Label"].values
test_y = test_dataset["Label"].values

train_y.shape, test_y.shape

((5000,), (1000,))

In [332]:
# we no longer require train_dataset and test_dataset, thus we can free up some RAM
import gc

del train_dataset, test_dataset
gc.collect()

0

In [341]:
from sklearn import svm
svm_classifier = svm.LinearSVC()
svm_classifier.fit(train_x, train_y)

LinearSVC()

In [342]:
svm_predictions = svm_classifier.predict(test_x)
accuracy_svm = accuracy_score(test_y, svm_predictions)
accuracy_svm

0.482

In [335]:
from sklearn.naive_bayes import GaussianNB
gnb_classifier = GaussianNB()
gnb_classifier.fit(train_x, train_y)

GaussianNB()

In [336]:
gnb_predictions = gnb_classifier.predict(test_x)
accuracy_gnb = accuracy_score(test_y, gnb_predictions)
accuracy_gnb

0.62

In [337]:
# The accuracy of DecisionTree
from sklearn.tree import DecisionTreeClassifier
dectree_classifier = DecisionTreeClassifier(max_depth=2)
dectree_classifier.fit(train_x, train_y)

DecisionTreeClassifier(max_depth=2)

In [338]:
dectree_predictions = dectree_classifier.predict(test_x)
accuracy_dtree = accuracy_score(test_y, dectree_predictions)
accuracy_dtree

0.549

In [339]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(max_depth=2, random_state=0)
random_forest_classifier.fit(train_x, train_y)

RandomForestClassifier(max_depth=2, random_state=0)

In [340]:
rndforest_predictions = random_forest_classifier.predict(test_x)
accuracy_rndforest = accuracy_score(test_y, rndforest_predictions)
accuracy_rndforest

0.604