In [1]:
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


# Lift every pandas display limit so frames are rendered in full
# (all rows, all columns, unlimited width, untruncated cell text).
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

def pre_process():
    """Clean the raw WELFake dataset and save it as ``data/data.csv``.

    Reads ``WELFake_Dataset.csv`` (first column used as the index), blanks out
    every ``text`` / ``title`` value whose string form is 10 characters or
    shorter, drops every row that has a missing value in any column, renumbers
    the index from 0 and writes the result to ``data/data.csv``.

    The frame's shape is printed before and after cleaning, and the first 300
    cleaned rows are displayed.  Returns ``None``.
    """
    import os  # local import: only needed to create the output directory

    data = pd.read_csv('WELFake_Dataset.csv', index_col=0)
    print(data.shape)

    # One mask per column instead of a Python loop with per-cell .loc writes.
    # Missing cells stringify to "nan" (3 characters), so they fall under the
    # threshold too; they are removed by dropna() below either way.
    for column in ("text", "title"):
        too_short = data[column].map(lambda value: len(str(value)) <= 10).astype(bool)
        data.loc[too_short, column] = np.nan

    data = data.dropna().reset_index(drop=True)
    print(data.shape)

    # to_csv does not create missing parent directories.
    os.makedirs("data", exist_ok=True)
    data.to_csv("data/data.csv")
    display(data[:300])

def tokenize():
    """Tokenise, stop-word filter and stem every title and text.

    Reads ``data/data.csv`` (first column used as the index).  For the
    ``title`` and ``text`` of each row it:

    1. replaces every punctuation character with a space,
    2. splits the result with ``nltk.tokenize.word_tokenize``,
    3. lower-cases the tokens and drops English stop words,
    4. stems the remaining tokens with the English Snowball stemmer.

    Both columns are replaced by the resulting token lists and the frame is
    written to ``data/data_token.csv``; all other columns are kept as read.
    Returns ``None``.
    """
    # A set makes the per-token stop-word lookup O(1) instead of a list scan.
    stop = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # Curly quotes, dot leaders, ellipsis and hyphenation point: typographic
    # characters that the ASCII-only string.punctuation does not contain.
    punc = [u'\u201c', u'\u201d', u'\u2018', u'\u2019', u'\u2024', u'\u2025', u'\u2026', u'\u2027']
    # Punctuation becomes a space (not deleted) so "a,b" still splits in two.
    to_space = str.maketrans({char: " " for char in list(string.punctuation) + punc})

    def clean(raw):
        """Return the stemmed, lower-cased, stop-word-free tokens of ``raw``."""
        tokens = nltk.tokenize.word_tokenize(str(raw).translate(to_space))
        lowered = (token.lower() for token in tokens)
        return [stemmer.stem(word) for word in lowered if word not in stop]

    data = pd.read_csv('data/data.csv', index_col=0)
    data_cleaned = data.copy()
    data_cleaned["title"] = [clean(value) for value in data["title"]]
    data_cleaned["text"] = [clean(value) for value in data["text"]]
    # Each cell is a Python list; to_csv stores it in its string form,
    # e.g. "['word', 'other']".
    data_cleaned.to_csv("data/data_token.csv")


## Bag of Words

In [2]:
def make_bow(data_path):
    """Build per-article bag-of-words counts from a tokenised CSV.

    Parameters
    ----------
    data_path : str
        CSV file (first column used as the index) whose ``title`` and ``text``
        cells contain the string form of a token list, e.g. ``"['a', 'b']"``.

    Returns
    -------
    list
        ``[bow_title, bow_text, bow_both]``.  Each element is a list with one
        ``{token: count}`` dict per row of the CSV, counting the tokens of the
        title, of the text, and of title and text together respectively.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Deletes the quote, space and bracket characters of the list's string form.
    strip_table = str.maketrans("", "", "' []")

    def parse_tokens(cell):
        """Turn "['a', 'b']" back into ['a', 'b']."""
        pieces = (piece.translate(strip_table) for piece in cell.split(","))
        # An empty list "[]" would otherwise yield one phantom "" token.
        return [token for token in pieces if token]

    data = pd.read_csv(data_path, index_col=0)
    bow_title, bow_text, bow_both = [], [], []
    for title_cell, text_cell in zip(data["title"], data["text"]):
        title_counts = Counter(parse_tokens(title_cell))
        text_counts = Counter(parse_tokens(text_cell))
        # Plain dicts are returned so the result type matches earlier versions.
        bow_title.append(dict(title_counts))
        bow_text.append(dict(text_counts))
        bow_both.append(dict(title_counts + text_counts))
    return [bow_title, bow_text, bow_both]

In [3]:
# `bow` holds three parallel lists with one {token: count} dict per article:
#   bow[0] = title counts, bow[1] = text counts, bow[2] = title + text counts.
# NOTE(review): tokenize() writes 'data/data_token.csv', but this reads from
# 'data_tokenized/' -- confirm the file is copied/moved there before running.
bow = make_bow('data_tokenized/data_token.csv')

## TF-IDF with Cosine Similarity

In [4]:
import math
def tf(bow_):
    """Return max-normalised term frequencies for each bag of words.

    ``bow_`` is a list of ``{word: count}`` dicts.  The result is a list of
    the same length in which every count is divided by the largest count of
    its own dict; an empty dict maps to an empty dict.
    """
    normalised = []
    for counts in bow_:
        # The search for the largest count starts from a floor of 0.
        peak = max([0, *counts.values()])
        normalised.append({term: freq / peak for term, freq in counts.items()})
    return normalised

def idf(bow_):
    """Return the inverse document frequency of every word in ``bow_``.

    ``bow_`` is a list of ``{word: count}`` dicts, one per document.  For each
    word the result holds ``log10(N / df)``, where ``N`` is the number of
    documents in ``bow_`` and ``df`` the number of documents containing the
    word.  A word present in every document therefore gets weight 0.
    """
    df_ = {}
    for dic in bow_:
        for word in dic:
            df_[word] = df_.get(word, 0) + 1
    # N must be the size of the corpus passed in, not of the notebook-level
    # `bow` variable (which always has exactly three elements).
    n_docs = len(bow_)
    return {word: math.log10(n_docs / count) for word, count in df_.items()}

def tf_idf(bow_, idf_=None):
    """Return one ``{word: tf * idf}`` dict per document of ``bow_``.

    ``idf_`` may carry precomputed IDF weights (as returned by ``idf``) so a
    caller that needs them anyway does not pay for them twice; when omitted
    they are computed from ``bow_``.  Words missing from ``idf_`` get weight 0.
    """
    tf_ = tf(bow_)
    if idf_ is None:
        idf_ = idf(bow_)
    return [
        {word: weight * idf_.get(word, 0.0) for word, weight in dic.items()}
        for dic in tf_
    ]

def cosineSim(dic_a, dic_b):
    """Return the cosine similarity of two sparse ``{word: weight}`` vectors.

    Words absent from a dict count as weight 0.  Neither argument is modified.
    Returns ``0.0`` when either vector has zero length (for example an empty
    dict), since the cosine is undefined there.
    """
    norm_a = math.sqrt(sum(value * value for value in dic_a.values()))
    norm_b = math.sqrt(sum(value * value for value in dic_b.values()))
    denominator = norm_a * norm_b
    if denominator == 0:
        return 0.0
    # Only words present in both vectors contribute to the dot product, so
    # walking the smaller dict is enough.
    small, large = (dic_a, dic_b) if len(dic_a) <= len(dic_b) else (dic_b, dic_a)
    dot = sum(value * large[word] for word, value in small.items() if word in large)
    return dot / denominator

def tfidf_cosine_ranking(word_, bow_):
    """Rank the documents of ``bow_`` by TF-IDF cosine similarity to a word.

    Parameters
    ----------
    word_ : str
        Single query word; it is compared verbatim with the keys of ``bow_``.
    bow_ : list of dict
        One ``{word: count}`` dict per document.

    Returns
    -------
    pandas.DataFrame
        Columns ``article`` (position of the document in ``bow_``) and
        ``value`` (cosine similarity), sorted by ``value`` descending.  A word
        that occurs in no document scores 0 everywhere.
    """
    idf_ = idf(bow_)
    tfidf_all = tf_idf(bow_, idf_)
    # The query is weighted with the corpus IDF; as its own one-document
    # corpus its IDF would be log10(1/1) = 0 and the query vector would vanish.
    tfidf_query = {word_: idf_.get(word_, 0.0)}
    cosSim = [cosineSim(doc, tfidf_query) for doc in tfidf_all]
    article_index = list(range(len(cosSim)))
    return pd.DataFrame({'article': article_index, 'value': cosSim}).sort_values(by=['value'], ascending=False)



In [5]:
#cos_rank = tfidf_cosine_ranking('obama',bow[2])
#print(cos_rank.head(5))

## BM25

In [6]:
from rank_bm25 import BM25Okapi
def bm25_ranking(query_, index_, data_path='data_tokenized/data_token.csv'):
    """Rank the articles of a tokenised CSV against a query with BM25.

    Parameters
    ----------
    query_ : str
        Query text; it is split on whitespace and the pieces are matched
        verbatim against the stored tokens.
        NOTE(review): if the CSV was produced by ``tokenize`` its tokens are
        lower-cased and stemmed, so the query words should be too -- confirm.
    index_ : int
        Which tokens represent an article: ``0`` = title only, ``1`` = text
        only, any other value = title and text together.
    data_path : str, optional
        CSV file (first column used as the index) whose ``title`` and ``text``
        cells contain the string form of a token list, e.g. ``"['a', 'b']"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``article`` (row position in the CSV) and ``value`` (BM25
        score), sorted by ``value`` descending.
    """
    # Deletes the quote, space and bracket characters of the list's string form.
    strip_table = str.maketrans("", "", "' []")

    def parse_tokens(cell):
        """Turn "['a', 'b']" back into ['a', 'b'] ("[]" gives [])."""
        pieces = (piece.translate(strip_table) for piece in cell.split(","))
        return [token for token in pieces if token]

    data = pd.read_csv(data_path, index_col=0)
    # BM25Okapi needs one token list PER ARTICLE; a single flat word list
    # would make it treat every word string as a document of characters.
    titles = [parse_tokens(cell) for cell in data["title"]]
    texts = [parse_tokens(cell) for cell in data["text"]]

    if index_ == 0:
        corpus = titles
    elif index_ == 1:
        corpus = texts
    else:
        corpus = [title + text for title, text in zip(titles, texts)]

    print("Starting bm25")
    bm25 = BM25Okapi(corpus)
    bm25_scores = bm25.get_scores(query_.split())

    article_index = list(range(len(bm25_scores)))
    return pd.DataFrame({'article': article_index, 'value': bm25_scores}).sort_values(by=['value'], ascending=False)


In [None]:
# Rank the articles for the query using title and text tokens together.
# The second argument is an integer selector (0 = title, 1 = text, anything
# else = both), not a bag of words, so pass 2 rather than bow[2].
bm25_rank = bm25_ranking('sunday', 2)
print(bm25_rank.head(5))