# Start

In [23]:
import csv
import pandas as pd
import os
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

target_file = 'reviews.csv'

test_data = []
train_data = []


def convert_file():
    if os.path.isfile(target_file):
        return

    with open('reviews.tsv','rb') as fin:
        cr = csv.reader(fin, delimiter='\t')
        filecontents = [line for line in cr]

    with open(target_file,'wb') as fou:
        cw = csv.writer(fou, quoting=csv.QUOTE_ALL,escapechar='\\')
        cw.writerows(filecontents)

def divide_by_rankings():
    colnames = ['rating', 'text']
    contents = pd.read_csv(target_file, names=colnames, header=None)

    contents['rating'] = contents['rating'].astype(int)

    train_data, test_data = train_test_split(contents, test_size=0.2)
    negative_rankings = train_data[train_data['rating'] < 3]
    neutral_documents = train_data[train_data['rating'] == 3]
    positive_rankings = train_data[train_data['rating'] > 3]

    return negative_rankings, neutral_documents, positive_rankings

def tokenize(docs, file_name):
    
    if os.path.isfile(file_name):
        os.remove(file_name)

    tokens = docs['text'].values
    tokens = ' '.join(tokens).split()
#     tokens = set(tokens)
    tokens = [str(x.lower()) for x in tokens if x not in ENGLISH_STOP_WORDS]

    with open(file_name, 'w') as the_file:
        the_file.write('\n'.join(tokens))
        
def main():
    convert_file()
    

if __name__ == "__main__":
    main()

# Breaking by Rankings

In [24]:
negative_docs, neutral_docs, positive_docs = divide_by_rankings()

# Tokenize

In [25]:
tokenize(negative_docs, 'negative.txt')
tokenize(neutral_docs, 'neutral.txt')
tokenize(positive_docs, 'positive.txt')

# Creating Vocab List

In [27]:
mega_doc_pos = open(os.getcwd() + '/positive.txt', 'r').read()
mega_doc_neg = open(os.getcwd() + '/negative.txt', 'r').read()
mega_doc_neu = open(os.getcwd() + '/neutral.txt', 'r').read()

total_words_in_pos = mega_doc_pos.split('\n')
total_words_in_neg = mega_doc_neg.split('\n')

vocab_pos = list(set(total_words_in_pos))
vocab_neg = list(set(total_words_in_neg))

positive_class_probability = 0.5
negaitive_class_probability = 0.5


# Classification

In [45]:
def classify(sentence):
    
    sentence_probab_pos = {}
    
    for x in sentence.split(' '):
        count_of_word_in_vocab = total_words_in_pos.count(x)
        sentence_probab_pos[x] = float(((count_of_word_in_vocab) + 1)) / (len(total_words_in_pos) + len(vocab_pos))

    sentence_probab_neg = {}

    for x in sentence.split(' '):
        count_of_word_in_vocab = total_words_in_neg.count(x)
        sentence_probab_neg[x] = float(((count_of_word_in_vocab) + 1)) / (len(total_words_in_neg) + len(vocab_neg))

    positive = positive_class_probability * reduce(lambda x, y: x*y, sentence_probab_pos.values())
    negative = negaitive_class_probability * reduce(lambda x, y: x*y, sentence_probab_neg.values())
    
    return int(positive > negative)

In [48]:
sentence = "All about horrible schools and friends"
review = {0: "Positive", 1: "Negative"}
label = classify(sentence)
print review.get(label)

Positive
