# Sentiment analysis (Exercise 4)

In [1]:
__author__ = "Xin Guan"
__version__ = "DSGA 1012, NYU, Spring 2018 term"

## Setup

load the Stanford Sentiment Treebank. download it from here: [the train/dev/test Stanford Sentiment Treebank distribution](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)

In [2]:
import re
import random
import os
import numpy as np
import collections

In [3]:
sst_home = 'trees'

def load_sst_data(path):
    # do 2-way positive/negative classification instead of 5-way
    EASY_LABEL_MAP = {0:0, 1:0, 2:None, 3:1, 4:1}
    
    data = []
    with open(path) as f:
        for i, line in enumerate(f): 
            example = {}
            example['label'] = EASY_LABEL_MAP[int(line[1])]
            if example['label'] is None:
                continue
            
            text = re.sub(r'\s*(\(\d)|(\))\s*', '', line)
            example['text'] = text[1:]
            data.append(example)

    return data
     
training_set = load_sst_data(sst_home + '/train.txt')
dev_set = load_sst_data(sst_home + '/dev.txt')
#test_set = load_sst_data(sst_home + '/test.txt')

using IMDb movie reviews as a test set later on. Download the data from "http://ai.stanford.edu/~amaas/data/sentiment/"

reformats the data in the same form as  SST data.

In [4]:
imdb_home = 'aclImdb/test/'

def load_imdb_data(path):
    
    pos_data, neg_data = [], []
    all_files = []
    _limit = 250
    
    for dirpath, dirnames, files in os.walk(path):
        for name in files:
            all_files.append(os.path.join(dirpath, name))
            
            
    for file_path in all_files:
        if '/neg' in file_path and len(neg_data) <= _limit:
            example = {}
            with open(file_path, 'r') as myfile:
                example['text'] = myfile.read().replace('\n', '')
            example['label'] = 0
            neg_data.append(example)
            
        if '/pos' in file_path and len(pos_data) <= _limit:
            example = {}
            with open(file_path, 'r') as myfile:
                example['text'] = myfile.read().replace('\n', '')
            example['label'] = 1
            pos_data.append(example)
    data = neg_data + pos_data

    return data

            
imdb_test = load_imdb_data(imdb_home)

In [5]:
imdb_test[0]

{'label': 0,
 'text': "Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook."}

Next, we build a function `feature_function()` that annotates datasets with feature vectors.

In [6]:
def feature_function(datasets):
    '''Annotates datasets with feature vectors.'''
                         
    # Extract vocabulary
    def tokenize(string):
        return string.split()
    
    word_counter = collections.Counter()
    for example in datasets[0]:
        word_counter.update(tokenize(example['text']))
    
    vocabulary = set([word for word in word_counter])

    feature_names = set()
    for i, dataset in enumerate(datasets):
        for example in dataset:
            example['features'] = collections.defaultdict(float)
        
            
            #Extract features (by name) for one example:
            word_counter = collections.Counter(tokenize(example['text']))
            for x in word_counter.items():
                if x[0] in vocabulary:
                    example["features"]["word_count_for_" + x[0]] = min(x[1], 1)
                    
#            '''
#                Adding Negation feature
#            '''        
#             if "n't" in example['text'] or "not" in example['text']:
#                 example["features"]["negation"] = 1
#             else:
#                 example["features"]["negation"] = 0
            
            feature_names.update(example['features'].keys())
                            
    # assign indices to them.
    feature_indices = dict(zip(feature_names, range(len(feature_names))))
    indices_to_features = {v: k for k, v in feature_indices.items()}
    dim = len(feature_indices)
                
    #  create actual vectors from those indices.
    for dataset in datasets:
        for example in dataset:
            example['vector'] = np.zeros((dim))
            for feature in example['features']:
                example['vector'][feature_indices[feature]] = example['features'][feature]
    return indices_to_features
    
indices_to_features = feature_function([training_set, dev_set, imdb_test])

In [7]:
indices_to_features

{0: 'word_count_for_fall',
 1: 'word_count_for_proceeds',
 2: 'word_count_for_fussing',
 3: 'word_count_for_sneering',
 4: 'word_count_for_knickknacks',
 5: 'word_count_for_overstays',
 6: 'word_count_for_ultra-provincial',
 7: 'word_count_for_Literary',
 8: 'word_count_for_unsatisfying',
 9: 'word_count_for_Judaism',
 10: 'word_count_for_downfall',
 11: 'word_count_for_completely',
 12: 'word_count_for_learning',
 13: 'word_count_for_cracker',
 14: 'word_count_for_annoying',
 15: 'word_count_for_flair',
 16: 'word_count_for_add',
 17: 'word_count_for_Trains',
 18: 'word_count_for_visible',
 19: 'word_count_for_able',
 20: 'word_count_for_shots',
 21: 'word_count_for_downs',
 22: 'word_count_for_warmest',
 23: 'word_count_for_abuse',
 24: 'word_count_for_Jewish',
 25: 'word_count_for_cheapen',
 26: 'word_count_for_rediscover',
 27: 'word_count_for_disapproval',
 28: 'word_count_for_gamut',
 29: 'word_count_for_shriveled',
 30: 'word_count_for_punishable',
 31: 'word_count_for_Your',
 3

## A linear classifier: Logistic Regression

use the LogisticRegression class from Scikit-learn to build the classifier for this dataset


In [8]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()

In [9]:
X_train = [x['vector'] for x in training_set]
y_train = [x['label'] for x in training_set]
log_model = log_model.fit(X=X_train, y=y_train)

In [10]:
X_dev = [x['vector'] for x in dev_set]
y_dev = [x['label'] for x in dev_set]

y_dev_pred = log_model.predict(X_dev)

## Accuracy

In [11]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_dev_pred, y_dev))

0.7798165137614679


In [12]:
x_test = [x['vector'] for x in imdb_test]
y_test = [x['label'] for x in imdb_test]

y_test_pred = log_model.predict(x_test)

print(accuracy_score(y_test_pred, y_test))

0.7729083665338645
