# In [200]:
import csv
from nltk import *
import numpy as np
from collections import defaultdict
import pandas as pd
from math import log

# Path of the CSV corpus that the MAIN section passes to read_data_file().
test_data = "roccio_benchmark.csv"

def read_data_file(data_file):
    """
        Read a CSV corpus and return its documents tokenized and stemmed.

        Input:  path to a csv file; in every row the first column holds the
                document text and the second column holds its category label
        Output: list of dicts, one per non-blank row, shaped as
                {'category': <label>, 'text': [<stemmed token>, ...]}
    """
    # Build the stemmer once instead of once per token.
    stemmer = SnowballStemmer("english")
    data = []
    # The old 'rU' mode was removed in Python 3.11; newline='' is the mode
    # the csv module documentation asks for when handing a file to a reader.
    with open(data_file, newline='') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            tokens = word_tokenize(row[0])
            data.append({
                'category': row[1],
                'text': [stemmer.stem(token) for token in tokens],
            })
    return data


def tf_idf(term, doc, N, word_counts):
    """
        Compute the TF-IDF weight of `term` inside one document.

        TF  = 0.5 + 0.5 * (count of term in doc / highest count in doc)
        IDF = log10(N / number of documents whose 'text' contains term)

        Input:  term        - the term to weigh; must be a key of `doc`
                doc         - mapping term -> count for the current document
                N           - number of documents in the corpus
                word_counts - list of dicts whose 'text' entry is a
                              term -> count mapping (one dict per document)
        Output: TF * IDF as a float
    """
    term_freq = doc[term]
    max_term_freq = max(doc.values())
    # float() keeps the ratio fractional even where `/` on two ints truncates.
    tf = 0.5 + 0.5 * (float(term_freq) / max_term_freq)

    # Document frequency: how many documents mention the term at all.
    doc_freq = sum(1 for other in word_counts if term in other['text'])
    if doc_freq == 0:
        # Term absent from the whole corpus: avoid dividing by zero.
        doc_freq = 1

    idf = log(float(N) / doc_freq, 10)   # log in base of 10
    return tf * idf

"""
def get_vocabulary(DOCUMENTS):
    return sorted(list(reduce(lambda x,y: x.union(y), [set(doc) for doc in DOCUMENTS])))


"""
def get_word_counts(data):
    """
        Count term occurrences separately for every document.

        Input:  list of dicts with a 'text' entry (iterable of terms) and a
                'category' entry
        Output: list of dicts in the same order, where 'text' is a
                defaultdict(int) mapping term -> number of occurrences and
                'category' is carried over unchanged
    """
    def tally_terms(terms):
        # One pass over the terms, bumping the counter of each one seen.
        tally = defaultdict(int)
        for token in terms:
            tally[token] += 1
        return tally

    return [
        {'text': tally_terms(entry['text']), 'category': entry['category']}
        for entry in data
    ]

def get_vector_space_model(word_counts):
    """
        Replace every per-document term count with its TF-IDF weight.

        Input:  list of dicts with 'text' (term -> count mapping) and
                'category', one dict per document
        Output: list of dicts in the same order, where 'text' maps each term
                of the document to tf_idf(term, ...) and 'category' is
                carried over unchanged
    """
    corpus_size = len(word_counts)
    return [
        {
            'text': {
                token: tf_idf(token, entry['text'], corpus_size, word_counts)
                for token in entry['text']
            },
            'category': entry['category'],
        }
        for entry in word_counts
    ]

def get_bag_of_words(data):
    """
        Build a documents-by-terms table from per-document term weights.

        Input:  list of dicts with 'text' (term -> weight mapping) and
                'category', one dict per document
        Output: pandas DataFrame with one row per document, one column per
                term plus a 'CATEGORY' column; terms that a document does
                not contain are filled with 0

        The input dicts are left untouched.
    """
    rows = []
    for doc in data:
        # Copy first: adding 'CATEGORY' to doc['text'] itself would leak the
        # label into the caller's term weights.
        row = dict(doc['text'])
        row['CATEGORY'] = doc['category']
        rows.append(row)

    return pd.DataFrame(rows).fillna(0)


def get_term_weight_in_prototype(term, docs_in_category, docs_out_category, b=16, c=4):
    """
        Compute the weight of one term in a category prototype.

        weight = b * (mean weight of term inside the category)
               - c * (mean weight of term outside the category)
        and negative results are clipped to 0.

        Input:  term              - column name of the term
                docs_in_category  - DataFrame rows belonging to the category
                docs_out_category - DataFrame rows of all other categories
                b, c              - coefficients of the in-category and
                                    out-of-category means (default 16 and 4)
        Output: the non-negative prototype weight of the term
    """
    def mean_weight(docs):
        size = len(docs)
        if size == 0:
            # An empty group (e.g. a corpus with a single category) contributes
            # nothing; dividing by zero here would turn the weight into NaN.
            return 0.0
        return docs[term].sum() / float(size)

    prototype_term_weight = (
        b * mean_weight(docs_in_category) - c * mean_weight(docs_out_category)
    )
    if prototype_term_weight < 0:
        prototype_term_weight = 0
    return prototype_term_weight


def get_prototype_vector(bag_of_words):
    """
        Build one prototype weight vector per category.

        Input:  DataFrame with one row per document, one column per term and
                a 'CATEGORY' column holding each document's label
        Output: dict mapping category -> {term: prototype weight}, where each
                weight comes from get_term_weight_in_prototype(); an empty
                dict when the table has no 'CATEGORY' column
    """
    if 'CATEGORY' not in bag_of_words:
        # Nothing to group by (e.g. a table built from zero documents).
        return {}

    # Take the categories from the table itself rather than from a global,
    # so the result depends only on the argument.
    labels = bag_of_words['CATEGORY']
    terms = [column for column in bag_of_words.columns if column != 'CATEGORY']

    prototypes = {}
    for category in labels.unique():
        docs_in_category = bag_of_words[labels == category]
        docs_out_category = bag_of_words[labels != category]
        prototypes[category] = {
            term: get_term_weight_in_prototype(term, docs_in_category, docs_out_category)
            for term in terms
        }

    return prototypes

"""
def group_data_by_categories(data):
    categories = set([doc['category'] for doc in data])
    data_by_categories = {}
    for c in categories:
        doc_set = filter(lambda doc: doc['category']==c, data)
        data_by_categories[c] = [doc['text'] for doc in doc_set]
    return data_by_categories    
        
vector_space_by_categories = group_data_by_categories(vector_space)
"""

# In [201]:
""" MAIN """
# Pipeline: each step consumes the previous step's result, so order matters.
# 1. Load the CSV corpus named by test_data.
data = read_data_file(test_data)
# 2. Count term occurrences per document.
word_counts = get_word_counts(data)
# 3. Convert the counts into TF-IDF weights.
vector_space = get_vector_space_model(word_counts)
# 4. Lay the weights out as a DataFrame with a 'CATEGORY' column.
bag_of_words = get_bag_of_words(vector_space)
# 5. Compute one prototype weight vector per category.
PROTOTYPES = get_prototype_vector(bag_of_words)
