In [17]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("stopwords")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oliviali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/oliviali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oliviali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from collections import Counter
from scipy.sparse import dok_matrix
from sklearn.linear_model import LogisticRegression

class SentimentClassifier:
    def __init__(self, feature_method, min_feature_ct=1, L2_reg=1.0):
        """
        :param feature_method: featurize function
        :param min_feature_count: int, ignore the features that appear less than this number to avoid overfitting
        """
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_ct = min_feature_ct
        self.L2_reg = L2_reg

    def featurize(self, X):
        """
        # Featurize input text

        :param X: list of texts
        :return: list of featurized vectors
        """
        featurized_data = []
        for text in X:
            feats = self.feature_method(text)
            featurized_data.append(feats)
        return featurized_data

    def pipeline(self, X, training=False):
        """
        Data processing pipeline to translate raw data input into sparse vectors
        :param X: featurized input
        :return: 2d sparse vectors
        
        Implement the pipeline method that translate the dictionary like feature vectors into homogeneous numerical
        vectors, for example:
        [{"fea1": 1, "fea2": 2}, 
         {"fea2": 2, "fea3": 3}] 
         --> 
         [[1, 2, 0], 
          [0, 2, 3]]
          
        Hints:
        1. How can you know the length of the feature vector?
        2. When should you use sparse matrix?
        3. Have you treated non-seen features properly?
        4. Should you treat training and testing data differently?
        """
        # Have to build feature_vocab during training
        if training:
            for dic in X:
                for key,value in dic.items():
                    if key in self.feature_vocab:
                        self.feature_vocab[key] += value
                    else:
                        self.feature_vocab[key] = value
            
            pos = ["non_seen"] + [key for key,value in self.feature_vocab.items() if value >= self.min_feature_ct]
            
            self.feature_vocab = {"non_seen":0}
            for idx,value in enumerate(pos):
                self.feature_vocab[value] = idx
            
            
            
        # Translate raw texts into vectors
        n_feature_vector = len(self.feature_vocab)
        n_sample = len(X)
        sp_matrix = dok_matrix((n_sample,n_feature_vector),dtype = "int64")

        for i in range(n_sample):
            for key in X[i].keys():
                if key in self.feature_vocab:
                    sp_matrix[i,self.feature_vocab[key]] = X[i][key]
                else:
                    sp_matrix[i,self.feature_vocab["non_seen"]] = X[i][key]

        return sp_matrix
        
    def fit(self, X, y):
        X = self.pipeline(self.featurize(X), training=True)

        D, F = X.shape
        self.model = LogisticRegression(C=self.L2_reg)
        self.model.fit(X, y)

        return self

    def predict(self, X):
        X = self.pipeline(self.featurize(X))
        return self.model.predict(X)

    def score(self, X, y):
        X = self.pipeline(self.featurize(X))
        return self.model.score(X, y)

    # Write learned parameters to file
    def save_weights(self, filename='weights.csv'):
        weights = [["__intercept__", self.model.intercept_[0]]]
        
        for feat, idx in self.feature_vocab.items():
            weights.append([feat, self.model.coef_[0][idx]])
        
        weights = pd.DataFrame(weights)
        weights.to_csv(filename, header=False, index=False)
        
        return weights

In [19]:
# Read data from file
from sklearn.utils import Bunch
import numpy as np

def load_data(filename):
    """
    Load input data and return sklearn.utils.Bunch 
    """
    target, text = [], []
    with open(filename, encoding="utf8", errors="ignore") as file:
        for line in file:
            cols = line.split("@")
            text.append(cols[0].rstrip()) 
            
            if cols[1] == 'positive\n':
                y_label = 1
            elif (cols[1] == 'negative\n') | (cols[1] == 'negative'):
                y_label = -1
            elif cols[1] == 'neutral\n':
                y_label = 0
            else:
                raise('Error')
            target.append(y_label)

    return Bunch(text=text, target=np.array(target))
def save_prediction(arr, filename="prediction.csv"):
    """
    Save the prediction into file
    """
    out = open(filename, "w", encoding="utf8")
    for idx, val in enumerate(arr):
        if val == 1:
            pred = 'positive'
        elif val == -1:
            pred = 'negative'
        
        out.write("%s,%s\n" % (idx, pred))
    out.close()

In [20]:
data = load_data("/Users/oliviali/Downloads/Sentences_66Agree.txt")
X, y = data.text, data.target
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3)

In [21]:
def bag_of_words_featurize(text):
    """
    !! Do not work on this yet, work on the model and come back later !!
    
    Write your own code below
    """
    from nltk.tokenize import word_tokenize
       
    
    porter = nltk.PorterStemmer()

    
    from nltk.corpus import stopwords 
    stop_words = set(stopwords.words('english')) 
  
    ## Bag of words
    feats = {}
    words = word_tokenize(text)
    
    
    words = [porter.stem(x) for x in words]
    
    words = [x for x in words if not x in stop_words] 
    
        
    for k in enumerate(set(words)):
        
        if k not in feats:
            feats[k] = 1
        else:
            feats[k] += 1
            
            
    return feats 

In [22]:
#test bag of words
cls = SentimentClassifier(feature_method=bag_of_words_featurize, min_feature_ct = 10)
cls = cls.fit(X_train, y_train)

print("Training set accuracy: ", cls.score(X_train, y_train))
print("Dev set accuracy: ", cls.score(X_dev, y_dev))

Training set accuracy:  0.7516096238563199
Dev set accuracy:  0.6595576619273301


In [23]:
def POS_featurize(text):
    """
    !! Do not work on this yet, work on the model and come back later !!
    
    Write your own code below
    
    """

    
    from nltk.tokenize import word_tokenize

    wnl = nltk.WordNetLemmatizer()
    
    from nltk.corpus import stopwords 
    stop_words = set(stopwords.words('english')) 
  

    feats = {}
    words = word_tokenize(text)
    
    words = [wnl.lemmatize(x) for x in words]
    
    words = [x for x in words if not x in stop_words] 
    
    default_tagger = nltk.DefaultTagger('NN')
    
    words = default_tagger.tag(words)

    feats = {}   
    for k in enumerate(set(words)):
        
        if k not in feats:
            feats[k] = 1
        else:
            feats[k] += 1
           
            
    return feats

In [24]:
#test POS
cls = SentimentClassifier(feature_method=POS_featurize, min_feature_ct = 10, L2_reg = 0.1)
cls = cls.fit(X_train, y_train)
print("Training set accuracy: ", cls.score(X_train, y_train))
print("Dev set accuracy: ", cls.score(X_dev, y_dev))

Training set accuracy:  0.6699423924093527
Dev set accuracy:  0.665086887835703
