In [126]:
import csv
from nltk import *
import numpy as np
from collections import defaultdict
import pandas as pd
from math import log

test_data = "roccio_benchmark.csv"

def read_data_file(data_file):
    """
        Input:  csv file with sets of items in each row (item appears only once per row),
        Output: list of lists
    """
    with open(data_file, 'rU') as csv_file:
        reader = csv.reader(csv_file)
        # data = [{'text': row[0], 'class':row[1]} for row in reader]
        data = []
        for row in reader:
            doc = dict()
            doc['category'] = row[1]
            text = row[0]                       # extract text
            text = word_tokenize(text)     # tokenize text
            text = [SnowballStemmer("english").stem(t) for t in text]  # get stems
            doc['text'] = text
            data.append(doc)
    return data


def tf_idf(term, doc, N, word_counts):
    term_freq = doc[term]
    max_term_freq = max(doc.values())
    TF = 0.5 + 0.5*(term_freq/max_term_freq)
    
    def term_freq_in_corpus(term, word_counts):
        freq_in_corpus = 0
        for doc in word_counts:
            if term in doc['text']:
                freq_in_corpus += 1
        return freq_in_corpus

    n = term_freq_in_corpus(term, word_counts)
    if n == 0:
        n = 1
    IDF = log(N/n, 10)   # log in base of 10
    return TF*IDF

"""
def get_vocabulary(DOCUMENTS):
    return sorted(list(reduce(lambda x,y: x.union(y), [set(doc) for doc in DOCUMENTS])))

def get_bag_of_words(word_counts):
    bag_of_words = pd.DataFrame(word_counts).fillna(0)
    return bag_of_words
"""
def get_word_counts(data):
    word_counts = []
    for doc in data:
        counts = defaultdict(int)
        for term in doc['text']:
            counts[term] += 1
        doc_counts = {'text': counts, 'category': doc['category']}
        word_counts.append(doc_counts)

    return word_counts

def get_vector_space_model(word_counts):
    N = len(word_counts)
    vector_space = []
    for doc in word_counts:
        vector_weights = dict()
        for term in doc['text']:
            vector_weights[term] = tf_idf(term, doc['text'], N, word_counts)
        doc_weights = {'text': vector_weights, 'category': doc['category']}
        vector_space.append(doc_weights)
        
    return vector_space

In [89]:
#vocabulary = get_vocabulary(DOCUMENTS)
# bag_of_words = get_bag_of_words(word_counts)

In [100]:
data = read_data_file(test_data)
DOCUMENTS = [doc['text'] for doc in data]
word_counts = get_word_counts(DOCUMENTS)
vector_space = get_vector_space_model(word_counts)

In [129]:
data = read_data_file(test_data)
word_counts = get_word_counts(data)
vector_space = get_vector_space_model(word_counts)

for index, row in bag_of_words.iterrows():
    row_as_list = list(row)
    break

In [151]:
def group_data_by_categories(data):
    categories = set([doc['category'] for doc in data])
    data_by_categories = {}
    for c in categories:
        doc_set = filter(lambda doc: doc['category']==c, data)
        data_by_categories[c] = [doc['text'] for doc in doc_set]
    return data_by_categories    
        
vector_space_by_categories = group_data_by_categories(vector_space)

In [None]:
def get_prototype_vector(category, doc_set):
    

In [150]:
prototypes = {}
for category in word_counts_by_categories:
    prototypes[category] = get_prototype_vector(category)

positive
negative


In [152]:
vector_space_by_categories['positive']

[{'i': 0.47712125471966244,
  u'love': 0.6989700043360187,
  u'song': 0.6989700043360187,
  u'this': 0.6989700043360187},
 {u'extremali': 1.1760912590556811,
  u'love': 0.6989700043360187,
  u'peopl': 0.8450980400142567},
 {u'good': 0.8450980400142567,
  'is': 0.47712125471966244,
  u'the': 1.1760912590556811,
  u'weather': 1.1760912590556811},
 {u'good': 0.8450980400142567,
  u'job': 0.6989700043360187,
  u'man': 1.1760912590556811},
 {'a': 0.47712125471966244,
  u'found': 0.8450980400142567,
  'i': 0.47712125471966244,
  u'job': 0.6989700043360187,
  u'new': 0.8450980400142567},
 {u'are': 1.1760912590556811,
  u'friend': 0.8450980400142567,
  u'her': 1.1760912590556811,
  u'interest': 0.8450980400142567,
  u'peopl': 0.8450980400142567,
  u'veri': 0.8450980400142567},
 {'a': 0.47712125471966244,
  u'found': 0.8450980400142567,
  'i': 0.47712125471966244,
  u'new': 0.8450980400142567,
  u'song': 0.6989700043360187},
 {u'book': 1.1760912590556811,
  u'interest': 0.8450980400142567,
  'i