In [328]:
import csv
from nltk import *
import numpy as np
from collections import defaultdict
import pandas as pd
from math import *

train_file = "rocchio_benchmark_train.csv"
test_file = "rocchio_benchmark_test.csv"

def read_data_file(data_file, data_type):
    """Load one of the benchmark CSV files.

    Parameters
    ----------
    data_file : str
        Path of the CSV file to read.
    data_type : str
        'train' -> every row becomes ``{'text': process_text(row[0]),
        'category': row[1]}``.
        'test'  -> every row contributes its raw first column ``row[0]``.
        Any other value yields an empty list.

    Returns
    -------
    list
        List of dicts ('train'), list of strings ('test') or ``[]``.

    Blank lines, which ``csv.reader`` reports as empty rows, are skipped
    instead of failing with an IndexError on ``row[0]``.
    """
    import sys  # local import: only needed to choose the open() mode

    if sys.version_info[0] >= 3:
        # The 'U' flag is rejected by Python 3.11+; on Python 3 the csv
        # module expects the file to be opened with newline=''.
        csv_file = open(data_file, 'r', newline='')
    else:
        # Python 2: keep universal-newline mode.
        csv_file = open(data_file, 'rU')

    with csv_file:
        rows = [row for row in csv.reader(csv_file) if row]

    if data_type == 'train':
        return [{'text': process_text(row[0]), 'category': row[1]}
                for row in rows]
    if data_type == 'test':
        return [row[0] for row in rows]
    return []

def process_text(text):
    """Tokenize `text` and return the list of English Snowball stems.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        One stem per token produced by ``word_tokenize``, in token order.
    """
    # Build the stemmer once instead of once per token.
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in word_tokenize(text)]

def tf_idf(term, doc, N, word_counts):
    """Return the TF-IDF weight of `term` inside `doc`.

    TF is the augmented term frequency ``0.5 + 0.5 * f / max_f`` where `f`
    is the count of `term` in `doc` and `max_f` the largest count in `doc`.
    IDF is ``log10(N / n)`` where `n` is the number of entries of
    `word_counts` whose 'text' mapping contains `term`; a term found in no
    entry is treated as if it occurred in one (n = 1).

    Parameters
    ----------
    term : hashable
        Term to weight; looked up with ``doc[term]``.
    doc : dict
        Mapping term -> count for the document being weighted.
    N : int
        Corpus size used as the IDF numerator.
    word_counts : list
        Corpus entries, each a dict with a 'text' mapping of term -> count.

    Returns
    -------
    float
        ``TF * IDF``.
    """
    term_freq = doc[term]
    max_term_freq = max(doc.values())
    # float() forces true division: with integer counts Python 2 would
    # truncate the ratio to 0 or 1 and lose the relative frequency.
    TF = 0.5 + 0.5 * (float(term_freq) / max_term_freq)

    # Document frequency: number of corpus entries that contain the term.
    n = sum(1 for entry in word_counts if term in entry['text'])
    if n == 0:
        n = 1

    # Same reason for float(): N / n must not be floored before the log.
    IDF = log(float(N) / n, 10)   # log in base of 10
    return TF * IDF


def get_vocabulary(bag_of_words):
    """Return the term columns of `bag_of_words` as a list.

    Parameters
    ----------
    bag_of_words : pandas.DataFrame
        Table whose columns are terms plus the 'CATEGORY' label column.

    Returns
    -------
    list
        Every column name except 'CATEGORY', in column order.

    A real list is returned (not a lazy ``filter`` object) so the result can
    be iterated, tested for membership and used as a column selector as many
    times as needed on any Python version.
    """
    return [term for term in bag_of_words.columns.values if term != 'CATEGORY']


def get_word_counts(text):
    """Count how many times each term occurs in `text`.

    `text` is an iterable of terms. The result is a ``defaultdict(int)``
    mapping term -> occurrences, so looking up an unseen term gives 0.
    """
    tally = defaultdict(int)
    for token in text:
        tally[token] = tally[token] + 1
    return tally

def get_word_counts_in_corpus(data):
    """Replace each document's token list with its term counts.

    `data` is a list of dicts holding 'text' and 'category'. A new list is
    returned in the same order, where every 'text' is the mapping produced
    by get_word_counts() and 'category' is carried over unchanged.
    """
    return [
        {'text': get_word_counts(entry['text']), 'category': entry['category']}
        for entry in data
    ]


def get_weights_vector(doc, N, word_counts):
    """Map every term of `doc` to its TF-IDF weight.

    `doc` is a term -> count mapping, `N` the corpus size and `word_counts`
    the corpus entries; all three are forwarded to tf_idf() for each term.
    Returns a dict term -> weight.
    """
    return dict((token, tf_idf(token, doc, N, word_counts)) for token in doc)


def get_vector_space_model(word_counts):
    """Turn per-document term counts into per-document TF-IDF weights.

    `word_counts` is a list of dicts with 'text' (term -> count) and
    'category'. The corpus size is the length of that list. Returns a new
    list, in the same order, where 'text' holds term -> weight and
    'category' is carried over.
    """
    corpus_size = len(word_counts)
    return [
        {
            'text': get_weights_vector(entry['text'], corpus_size, word_counts),
            'category': entry['category'],
        }
        for entry in word_counts
    ]

def get_bag_of_words(data):
    """Build the document/term table used to compute centroids.

    Parameters
    ----------
    data : list
        Dicts with 'text' (mapping term -> weight) and 'category'.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per term plus a 'CATEGORY' column
        holding the document's category. Terms absent from a document are
        filled with 0.
    """
    rows = []
    for doc in data:
        # Copy the mapping so the 'CATEGORY' label is not written into the
        # caller's 'text' dict.
        row = dict(doc['text'])
        row['CATEGORY'] = doc['category']
        rows.append(row)

    return pd.DataFrame(rows).fillna(0)

"""
def get_term_weight_in_prototype(term, docs_in_category, docs_out_category):
    b = 16
    c = 4
    category_size = float(len(docs_in_category))
    other_categories_size = float(len(docs_out_category))

    avg_term_weight_in_category = docs_in_category[term].sum()/category_size
    avg_term_weight_out_category = docs_out_category[term].sum()/other_categories_size

    prototype_term_weight = b*avg_term_weight_in_category - c*avg_term_weight_out_category
    if prototype_term_weight < 0:
        prototype_term_weight = 0
    return prototype_term_weight


def get_prototype_vector(bag_of_words):
    PROTOTYPES = dict()
    categories = bag_of_words['CATEGORY'].unique()
    
    for category in categories:
        prototype = dict()
        docs_in_category = bag_of_words[bag_of_words['CATEGORY'] == category]
        docs_out_category = bag_of_words[bag_of_words['CATEGORY'] != category]
        
        for term in docs_in_category:
            if term != 'CATEGORY':
                prototype[term] = get_term_weight_in_prototype(term, docs_in_category, docs_out_category)
        PROTOTYPES[category] = prototype
    
    return PROTOTYPES

"""
def get_centroids(bag_of_words):
    """Compute one centroid per category.

    `bag_of_words` is the document/term table with a 'CATEGORY' column.
    For every distinct category the rows of that category are selected,
    restricted to the vocabulary columns, and each column is averaged.
    Returns a dict category -> {term: mean weight}.
    """
    terms = get_vocabulary(bag_of_words)
    labels = bag_of_words['CATEGORY'].unique()

    result = {}
    for label in labels:
        members = bag_of_words[bag_of_words['CATEGORY'] == label][terms]
        result[label] = dict(
            (column, members[column].mean()) for column in members
        )
    return result

__SIMPLE CENTROID CLASSIFIER__

In [326]:
def train(train_data):
    """Fit the centroid classifier and return its centroids.

    Steps: per-document term counts -> per-document TF-IDF weights ->
    document/term table -> per-category column means.

    Parameters
    ----------
    train_data : list
        Dicts with 'text' (list of terms) and 'category'.

    Returns
    -------
    dict
        category -> {term: mean TF-IDF weight}.

    Only the centroids are returned; the intermediate word counts stay
    local to this function.
    """
    word_counts = get_word_counts_in_corpus(train_data)
    vector_space = get_vector_space_model(word_counts)
    bag_of_words = get_bag_of_words(vector_space)
    # get_centroids() derives the vocabulary itself, so it is not built here.
    return get_centroids(bag_of_words)

In [324]:
def classify(test_data):
    """Assign each document to the category with the nearest centroid.

    Parameters
    ----------
    test_data : iterable
        Raw document strings.

    Returns
    -------
    list
        One ``{'text': <raw document>, 'category': <prediction>}`` dict per
        input document, in input order. The prediction is None (and an error
        line is printed) when none of the document's terms is in the
        vocabulary.

    Notes
    -----
    This function reads three module-level names that must exist before it
    is called: `vocabulary` (iterable of known terms), `word_counts` (corpus
    term counts used for the IDF part) and `centroids` (category ->
    {term: weight}).
    """
    known_terms = set(vocabulary)   # set gives constant-time membership tests
    corpus_size = len(word_counts)
    predictions = []

    for doc in test_data:
        text = process_text(doc)

        if not any(term in known_terms for term in text):
            # Single-argument call form prints identically on Python 2 and 3.
            print("ERROR: This text can not be classified. Any of terms is not present in vocabulary of Classifier.")
            prediction = None
        else:
            counts = get_word_counts(text)
            weights_vector = get_weights_vector(counts, corpus_size, word_counts)
            # Project the document onto the vocabulary: unknown terms are
            # dropped, vocabulary terms missing from the document weigh 0.
            weights_vector = {term: weights_vector.get(term, 0.0) for term in known_terms}

            # Euclidean distance from the document to every centroid.
            distances = []
            for category in centroids:
                centroid = centroids[category]
                distance = sqrt(sum((centroid[term] - weights_vector[term]) ** 2
                                    for term in centroid))
                distances.append((distance, category))

            # Nearest centroid wins; on a tie the first category seen is kept.
            prediction = min(distances, key=lambda pair: pair[0])[1]
        predictions.append({'text': doc, 'category': prediction})

    return predictions

In [327]:
""" MAIN """
train_data = read_data_file(train_file, 'train')
centroids = train(train_data)

# classify() looks up `word_counts` and `vocabulary` as module-level names,
# but train() only returns the centroids. Define both here so the notebook
# runs from a fresh kernel.
word_counts = get_word_counts_in_corpus(train_data)
# Every centroid is keyed by the vocabulary terms, so their keys give the
# vocabulary back.
vocabulary = sorted(set(term for centroid in centroids.values() for term in centroid))

test_data = read_data_file(test_file, 'test')
predictions = classify(test_data)
predictions

[{'category': 'negative', 'text': 'Such a good day'},
 {'category': 'negative', 'text': 'I feel so bad'},
 {'category': 'negative', 'text': 'My best friend'},
 {'category': 'positive', 'text': 'I love this song'}]