### Sentiment Analysis of Movie Reviews
Mayank Grover (mg5229)  
Shishir Singapura Lakshminarayan (ssl495)

In [0]:
import numpy as np
import os
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [0]:
def read_data(file_dir):
    '''
    Load labelled reviews from a text file.

    Every non-blank line is parsed as ``<label><sep><review text>``: the
    first character is converted to an integer label, the second character
    is skipped, and the rest of the line is the review.

    Parameters
    ----------
    file_dir : str
        Path of the file to read.

    Returns
    -------
    data : list of set of str
        One set per review containing its distinct space-separated tokens.
    labels : list of int
        Label parsed from the first character of each review line.

    Raises
    ------
    ValueError
        If the first character of a non-blank line is not a digit.
    '''
    data = []
    labels = []

    with open(file_dir) as file:
        for line in file:
            # A blank line (typically a trailing newline at the end of the
            # file) carries no label; skip it instead of failing in int().
            if not line.strip():
                continue

            labels.append(int(line[0]))
            # Splitting on a single space is kept on purpose so the token
            # sets stay identical for existing data files.
            data.append(set(line[2:].strip('\n\t').split(' ')))

    return data, labels


def jaccard_distance_matrix(train_data, test_data):
    '''
    Pairwise distance between test and train reviews based on Jaccard
    similarity.

    The Jaccard similarity of two word sets is ``|A & B| / |A | B|``. The
    value returned here is the reciprocal of that similarity (not the more
    common ``1 - similarity``), so it lies in ``[1, inf]`` and a smaller
    value means the reviews are more alike.

    Parameters
    ----------
    train_data : sequence of set
        Word sets of the training reviews.
    test_data : sequence of set
        Word sets of the reviews to classify.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_data), len(train_data))``. Entry
        ``[i, j]`` is the distance between test review ``i`` and train
        review ``j``; it is ``inf`` when the two reviews share no word.
    '''
    distance = np.zeros((len(test_data), len(train_data)))

    for test_index, test_words in enumerate(test_data):
        for train_index, train_words in enumerate(train_data):
            shared = len(train_words.intersection(test_words))

            if shared == 0:
                # No overlap (this includes two empty sets, whose union is
                # empty as well): assign infinity directly rather than
                # dividing by zero.
                distance[test_index, train_index] = np.inf
                continue

            total = len(train_words.union(test_words))
            distance[test_index, train_index] = 1.0 / (shared / total)

    return distance
    
    
def intersection_distance_matrix(train_data, test_data):
    '''
    Pairwise distance between test and train reviews based on the number
    of shared words.

    The distance is the reciprocal of the size of the intersection of the
    two word sets, so more shared words means a smaller distance.

    Parameters
    ----------
    train_data : sequence of set
        Word sets of the training reviews.
    test_data : sequence of set
        Word sets of the reviews to classify.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test_data), len(train_data))``. Entry
        ``[i, j]`` is ``1 / (number of shared words)`` for test review
        ``i`` and train review ``j``, or ``inf`` when they share no word.
    '''
    distance = np.zeros((len(test_data), len(train_data)))

    for test_index, test_words in enumerate(test_data):
        for train_index, train_words in enumerate(train_data):
            shared = len(train_words.intersection(test_words))

            # Pairs without any common word are infinitely far apart; set
            # that explicitly instead of relying on a division by zero.
            distance[test_index, train_index] = \
                1.0 / shared if shared else np.inf

    return distance


def distinct_indices(dist_series, k):
    '''
    Pick the index labels of the k nearest entries, keeping trailing ties.

    The series is walked in its current order (``predict`` passes it
    already sorted by ascending distance). The first ``k`` labels are
    always taken; after that, labels keep being added for as long as their
    value equals the value of the k-th entry, so neighbours tied with the
    k-th one are not dropped arbitrarily.

    Parameters
    ----------
    dist_series : pandas.Series
        Distances indexed by training-example position, ordered from
        nearest to farthest.
    k : int
        Number of neighbours requested. At least one label is returned
        for a non-empty series even if ``k`` is smaller than 1.

    Returns
    -------
    list
        Selected index labels, in series order. Empty if the series is
        empty.
    '''
    indices = []
    prev_value = None
    # The first entry is always kept, matching the historical behaviour
    # for k < 1.
    wanted = max(k, 1)

    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0.
    for index, value in dist_series.items():
        if len(indices) < wanted:
            indices.append(index)
            prev_value = value
        elif prev_value == value:
            # Still tied with the k-th neighbour: keep it.
            indices.append(index)
        else:
            break

    return indices


def predict(train_labels, distance, k):
    '''
    Classify every test example by majority vote among its nearest
    training examples.

    Parameters
    ----------
    train_labels : sequence
        Labels of the training examples, addressed by the column index of
        ``distance``. A label equal to 0 votes for class 0; any other
        label votes for class 1.
    distance : numpy.ndarray
        Matrix with one row per test example and one column per training
        example.
    k : int
        Number of neighbours passed to ``distinct_indices``.

    Returns
    -------
    list of int
        One prediction (0 or 1) per row of ``distance``. Class 1 is
        predicted when the vote is tied.
    '''
    predictions = []

    for row_number in range(distance.shape[0]):
        # Rank the training examples from nearest to farthest.
        ranked = pd.Series(distance[row_number]).sort_values()
        neighbours = distinct_indices(ranked, k)

        zero_votes = sum(
            1 for neighbour in neighbours if train_labels[neighbour] == 0)
        one_votes = len(neighbours) - zero_votes

        # Class 0 needs a strict majority; otherwise predict 1.
        predictions.append(1 if one_votes >= zero_votes else 0)

    return predictions

In [0]:
def kNN(train_data, test_data, train_labels, k, distance_measure):
    '''
    Predict labels for test_data with a k-nearest-neighbour vote.

    Parameters
    ----------
    train_data : sequence of set
        Word sets of the training reviews.
    test_data : sequence of set
        Word sets of the reviews to classify.
    train_labels : sequence
        Labels of the training reviews, aligned with train_data.
    k : int
        Number of neighbours used in the vote.
    distance_measure : str
        valid inputs: 'jaccard' / 'intersection'

    Returns
    -------
    list of int
        One predicted label per element of test_data.

    Raises
    ------
    ValueError
        If distance_measure is not one of the valid inputs.
    '''
    if distance_measure == 'jaccard':
        distance = jaccard_distance_matrix(train_data, test_data)
    elif distance_measure == 'intersection':
        distance = intersection_distance_matrix(train_data, test_data)
    else:
        # Fail immediately: carrying on without a distance matrix would
        # only surface later as an unrelated UnboundLocalError.
        raise ValueError(
            "Invalid distance metric {!r}; expected 'jaccard' or "
            "'intersection'".format(distance_measure))

    return predict(train_labels, distance, k)

In [0]:
# Locations of the labelled review files, relative to the working directory.
train_dir, test_dir = './reviewstrain.txt', './reviewstest.txt'

In [0]:
# Read the train and test files.
# For each file, read_data returns a list holding one set of space-separated
# tokens per review and a parallel list of integer labels.
all_train_data, all_train_labels = read_data(train_dir)
all_test_data, all_test_labels = read_data(test_dir)

In [6]:
# Out-of-sample evaluation (no cross-validation) - intersection distance

# kNN predictions on the test set for k = 1, 3 and 5
predictions_1, predictions_3, predictions_5 = [
    kNN(all_train_data, all_test_data, all_train_labels, k=k,
        distance_measure='intersection')
    for k in (1, 3, 5)
]

# Accuracy of each run
accuracy_1, accuracy_3, accuracy_5 = [
    accuracy_score(all_test_labels, predicted)
    for predicted in (predictions_1, predictions_3, predictions_5)
]

# Confusion matrix of each run
confusion_1, confusion_3, confusion_5 = [
    confusion_matrix(all_test_labels, predicted)
    for predicted in (predictions_1, predictions_3, predictions_5)
]

print(accuracy_1, accuracy_3, accuracy_5)
print(confusion_1, confusion_3, confusion_5, sep='\n')



0.604 0.59 0.606
[[ 93 134]
 [ 64 209]]
[[ 83 144]
 [ 61 212]]
[[ 91 136]
 [ 61 212]]


In [12]:
# Python lists are 0-indexed, so "example 18" is element 17 of the predictions.
# predictions_1 is read from the notebook's global state: it holds whatever
# the most recently executed cell assigned to that name.
print('Predicted label for example 18 (k=1) is :', predictions_1[17])

Predicted label for example 18 (k=1) is : 1


In [8]:
# Zero R baseline: always predict the majority class.
#
# The majority class is determined from the TRAINING labels. Choosing it from
# the test labels would let information about the test set leak into the
# baseline. Ties are resolved in favour of class 1.
predictions_zeroR = [1 if all_train_labels.count(1) >= all_train_labels.count(0)
                     else 0] * len(all_test_labels)
accuracy_zeroR = accuracy_score(all_test_labels, predictions_zeroR)
confusion_zeroR = confusion_matrix(all_test_labels, predictions_zeroR)

print(accuracy_zeroR)
print(confusion_zeroR)

0.546
[[  0 227]
 [  0 273]]


In [0]:
def cross_validation_kNN(train_data, train_labels, k, n_splits=5,
                         distance_measure='intersection'):
    '''
    Predict a label for every training example using n-fold cross-validation.

    The data is cut into n_splits consecutive folds (no shuffling). Each
    fold in turn is held out and classified by kNN using the remaining
    folds as training data.

    Parameters
    ----------
    train_data : sequence of set
        Word sets of the labelled reviews.
    train_labels : sequence of int
        Labels aligned with train_data.
    k : int
        Number of neighbours used by kNN.
    n_splits : int, optional
        Number of folds (default 5). Must be at least 2.
    distance_measure : str, optional
        Distance passed through to kNN: 'jaccard' or 'intersection'
        (default 'intersection').

    Returns
    -------
    numpy.ndarray
        Float array with one predicted label per training example, in the
        same order as train_data.

    Raises
    ------
    ValueError
        If n_splits is smaller than 2, because no training data would be
        left once the single fold is held out.
    '''
    if n_splits < 2:
        raise ValueError(
            'n_splits must be at least 2, got {}'.format(n_splits))

    train_data_splits = np.array_split(train_data, n_splits)
    train_label_splits = np.array_split(train_labels, n_splits)
    predicted_labels = []

    for i in range(n_splits):

        test_set = train_data_splits[i]
        # Join every fold except the held-out one. Concatenating the folds
        # also works when they differ in length, i.e. when len(train_data)
        # is not a multiple of n_splits.
        train_set = np.concatenate(
            train_data_splits[:i] + train_data_splits[i + 1:])
        train_label_set = np.concatenate(
            train_label_splits[:i] + train_label_splits[i + 1:])

        predictions = kNN(train_set, test_set, train_label_set, k,
                          distance_measure=distance_measure)

        predicted_labels.extend(predictions)

    # Float dtype is kept so the result matches what callers received before.
    return np.array(predicted_labels, dtype=float)

In [10]:
# Cross-validation on the training set for k = 3, 7 and 99

# Predictions
predicted_cv_labels_3, predicted_cv_labels_7, predicted_cv_labels_99 = [
    cross_validation_kNN(all_train_data, all_train_labels, k)
    for k in (3, 7, 99)
]

# Accuracy
accuracy_cv_3, accuracy_cv_7, accuracy_cv_99 = [
    accuracy_score(all_train_labels, predicted)
    for predicted in (predicted_cv_labels_3, predicted_cv_labels_7,
                      predicted_cv_labels_99)
]

# Confusion matrices
confusion_cv_3, confusion_cv_7, confusion_cv_99 = [
    confusion_matrix(all_train_labels, predicted)
    for predicted in (predicted_cv_labels_3, predicted_cv_labels_7,
                      predicted_cv_labels_99)
]

print(accuracy_cv_3, accuracy_cv_7, accuracy_cv_99)
print(confusion_cv_3, confusion_cv_7, confusion_cv_99, sep='\n')



0.66 0.658 0.612
[[330 359]
 [151 660]]
[[308 381]
 [132 679]]
[[186 503]
 [ 79 732]]


In [11]:
# Out of Sample - Without cross-validation - JACCARD DISTANCE
#
# Results are stored under *_jaccard_* names so that the intersection-distance
# results computed earlier (predictions_1, accuracy_1, ...) are not
# overwritten. Cells that read those names therefore give the same answer
# regardless of the order in which the cells are executed.

# Running kNN on the test set for k = 1, 3 and 5
predictions_jaccard_1 = kNN(all_train_data, all_test_data, all_train_labels,
                            k=1, distance_measure='jaccard')
predictions_jaccard_3 = kNN(all_train_data, all_test_data, all_train_labels,
                            k=3, distance_measure='jaccard')
predictions_jaccard_5 = kNN(all_train_data, all_test_data, all_train_labels,
                            k=5, distance_measure='jaccard')


# Calculating accuracy
accuracy_jaccard_1 = accuracy_score(all_test_labels, predictions_jaccard_1)
accuracy_jaccard_3 = accuracy_score(all_test_labels, predictions_jaccard_3)
accuracy_jaccard_5 = accuracy_score(all_test_labels, predictions_jaccard_5)


# Calculating Confusion Matrix
confusion_jaccard_1 = confusion_matrix(all_test_labels, predictions_jaccard_1)
confusion_jaccard_3 = confusion_matrix(all_test_labels, predictions_jaccard_3)
confusion_jaccard_5 = confusion_matrix(all_test_labels, predictions_jaccard_5)

print(accuracy_jaccard_1, accuracy_jaccard_3, accuracy_jaccard_5)
print(confusion_jaccard_1)
print(confusion_jaccard_3)
print(confusion_jaccard_5)



0.642 0.63 0.668
[[115 112]
 [ 67 206]]
[[109 118]
 [ 67 206]]
[[114 113]
 [ 53 220]]
