In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from StopWords import stop_words
from collections import defaultdict
import math

from IPython.display import display

# Load the raw test set; the 'content' column holds one HTML document per row.
test_data = pd.read_csv('./src/test.csv')
# Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0; .values
# is its documented replacement. The explicit copy yields a writable array that
# is independent of the DataFrame, which matters because clean_html()
# overwrites the entries in place.
test_data_content_np = test_data['content'].values.copy()

def clean_html(data):
    """Strip HTML markup from every entry of ``data``, in place.

    Each entry is parsed with BeautifulSoup's ``html.parser``, reduced to its
    text content, and has newline characters replaced by single spaces.
    A progress line is printed for every 100th entry.

    Args:
        data: Indexable, mutable sequence of HTML strings. Its entries are
            overwritten with the extracted text.

    Returns:
        The same ``data`` object that was passed in.
    """
    print('> cleaning html tags....')
    total = len(data)
    for idx in range(total):
        soup = BeautifulSoup(data[idx], 'html.parser')
        data[idx] = soup.get_text().replace('\n', ' ')
        if idx % 100 == 0:
            print('> clean_html proceeding: ' + str(idx) + '/' + str(total))

    return data

def delete_stopwords(data):
    """Return the words of ``data`` that are not in ``stop_words``.

    Args:
        data: Iterable of words.

    Returns:
        A new list with the surviving words in their original order.
    """
    kept = []
    for token in data:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def setup_text(data):
    """Tokenize each document into lowercase, purely alphabetic words.

    Every document is lowercased and split on single spaces; tokens that are
    not entirely alphabetic (including empty strings and words with attached
    punctuation or digits) are discarded, and the rest are passed through
    ``delete_stopwords``. A progress line is printed for every 100th document.

    Args:
        data: Sized iterable of document strings.

    Returns:
        A list with one list of words per document.
    """
    print('> setting up text....')
    tokenized = []
    for idx, document in enumerate(data):
        alphabetic = [
            token for token in document.lower().split(' ') if token.isalpha()
        ]
        tokenized.append(delete_stopwords(alphabetic))
        if idx % 100 == 0:
            print('> setup_text proceeding: ' + str(idx) + '/' + str(len(data)))

    return tokenized

def build_tf_table(data):
    """Compute per-document term frequencies.

    For each document, the term frequency of a word is the number of times it
    occurs divided by the total number of words in that document. A progress
    line is printed for every 100th document.

    Args:
        data: Sized iterable of documents, each an iterable of words.

    Returns:
        A list with one ``defaultdict(int)`` per document, mapping each word
        of that document to its term frequency. An empty document yields an
        empty mapping.
    """
    print('> building tf_table....')
    tf_table = []
    for i, row in enumerate(data):
        word_count = 0
        counts = defaultdict(int)
        for word in row:
            counts[word] += 1
            word_count += 1

        # No per-document display() here: rendering every intermediate dict
        # floods the notebook on large corpora, and the caller already shows
        # the finished table. An empty row has no keys, so the division below
        # never runs with word_count == 0.
        tf_freq = defaultdict(int)
        for word, count in counts.items():
            tf_freq[word] = count / word_count

        tf_table.append(tf_freq)
        if not i % 100:
            print('> tf_table proceeding: ' + str(i) + '/' + str(len(data)))

    return tf_table

def build_idf_table(word_list, data):
    """Compute the inverse document frequency of every word.

    The IDF of a word is ``log((N + 1) / df)``, where ``N`` is the number of
    documents and ``df`` the number of documents containing the word. A
    progress line is printed for every 100th document.

    Args:
        word_list: Iterable of vocabulary words that must appear in the result.
        data: Sized iterable of documents, each an iterable of words.

    Returns:
        A ``defaultdict(int)`` mapping each word from ``word_list`` and from
        ``data`` to its IDF. Words from ``word_list`` that occur in no
        document are treated as having ``df == 1`` (the largest IDF,
        ``log(N + 1)``) instead of raising ``ZeroDivisionError``.
    """
    print('> building idf_table....')
    doc_freq = defaultdict(int)
    # Seed the vocabulary so every requested word is present in the result.
    for word in word_list:
        doc_freq[word] = 0

    for i, row in enumerate(data):
        # set() ensures a word counts once per document, however often it
        # repeats inside that document.
        for word in set(row):
            doc_freq[word] += 1
        if not i % 100:
            print('> idf_table proceeding: ' + str(i) + '/' + str(len(data)))

    numerator = len(data) + 1
    idf_table = defaultdict(int)
    for word, count in doc_freq.items():
        # Clamp to 1 so vocabulary words absent from every document do not
        # divide by zero.
        idf_table[word] = math.log(numerator / max(count, 1))

    return idf_table

def create_word_list(data):
    """Collect the set of distinct words across all documents.

    A progress line is printed for every 100th document, matching the other
    table builders in this file.

    Args:
        data: Sized iterable of documents, each an iterable of words.

    Returns:
        A ``set`` containing every word that occurs in any document.
    """
    print('> creating word list....')
    vocabulary = set()
    for i, row in enumerate(data):
        # Updating a set in place avoids re-copying an ever-growing list on
        # each iteration, which made the original quadratic in corpus size.
        vocabulary.update(row)
        if not i % 100:
            print('> creating word_list: ' + str(i) + '/' + str(len(data)))

    return vocabulary

def build_tfidf_table(data):
    """Compute TF-IDF weights for every word of every document.

    Builds the term-frequency table and the inverse-document-frequency table
    from ``data``, displays both, and multiplies them word by word. A progress
    line is printed for every 100th document.

    Args:
        data: Sized iterable of documents, each a list of words.

    Returns:
        A list with one ``defaultdict(int)`` per document, mapping each word
        of that document to ``tf * idf``.
    """
    print('> starting tfidf_table....')
    tf_table = build_tf_table(data)
    display(tf_table)
    vocabulary = create_word_list(data)
    idf_table = build_idf_table(vocabulary, data)
    display(idf_table)

    print('> building tfidf_table....')
    tfidf_table = []
    for idx, tf_row in enumerate(tf_table):
        weights = defaultdict(
            int,
            ((term, freq * idf_table[term]) for term, freq in tf_row.items()),
        )
        tfidf_table.append(weights)
        if idx % 100 == 0:
            print('> tfidf_table proceeding: ' + str(idx) + '/' + str(len(tf_table)))

    return tfidf_table



# Pipeline: strip HTML -> tokenize and drop stop words -> TF-IDF -> display.
cleaned_content = clean_html(test_data_content_np)
content_data = setup_text(cleaned_content)
tfidf_table = build_tfidf_table(content_data)
display(tfidf_table)


> cleaning html tags....
> clean_html proceeding: 0/81926
> clean_html proceeding: 100/81926
> clean_html proceeding: 200/81926
> clean_html proceeding: 300/81926
> clean_html proceeding: 400/81926
> clean_html proceeding: 500/81926
> clean_html proceeding: 600/81926
> clean_html proceeding: 700/81926
> clean_html proceeding: 800/81926
> clean_html proceeding: 900/81926
> clean_html proceeding: 1000/81926
> clean_html proceeding: 1100/81926
> clean_html proceeding: 1200/81926
> clean_html proceeding: 1300/81926
> clean_html proceeding: 1400/81926
> clean_html proceeding: 1500/81926
> clean_html proceeding: 1600/81926
> clean_html proceeding: 1700/81926
> clean_html proceeding: 1800/81926
> clean_html proceeding: 1900/81926
> clean_html proceeding: 2000/81926
> clean_html proceeding: 2100/81926
> clean_html proceeding: 2200/81926
> clean_html proceeding: 2300/81926
> clean_html proceeding: 2400/81926
> clean_html proceeding: 2500/81926
> clean_html proceeding: 2600/81926
> clean_html pr

KeyboardInterrupt: 