In [1]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
from scipy.stats import norm

from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /Users/t.k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def load_data(filename):
    X = []
    Y = []
    with open(filename, encoding="utf-8") as file:
        for line in file:
            cols = line.split("\t")
            idd = cols[0]
            label = cols[2].lstrip().rstrip()
            text = cols[3]

            X.append(text)
            Y.append(label)

    return X, Y

In [14]:
class Classifier:

    def __init__(self, feature_method, trainX, trainY, devX, devY, testX, testY):
        self.feature_vocab = {}
        self.feature_method = feature_method
        self.min_feature_count=2
        self.log_reg = None

        self.trainY=trainY
        self.devY=devY
        self.testY=testY
        
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    # Featurize entire dataset
    def featurize(self, data):
        featurized_data = []
        for text in data:
            feats = self.feature_method(text)
            featurized_data.append(feats)
        return featurized_data

    # Read dataset and returned featurized representation as sparse matrix + label array
    def process(self, X_data, training = False):
        
        data = self.featurize(X_data)

        if training:
            fid = 0
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat]+= 1

            for feat in feature_doc_count:
                if feature_doc_count[feat] >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat in feats:
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = feats[feat]

        return X


    # Train model and evaluate on held-out data
    def train(self):
        (D,F) = self.trainX.shape
        best_dev_accuracy=0
        best_model=None
        for C in [0.1, 1, 10, 100]:
            self.log_reg = linear_model.LogisticRegression(C = C, class_weight='balanced', max_iter=1000)
            self.log_reg.fit(self.trainX, self.trainY)
            training_accuracy = self.log_reg.score(self.trainX, self.trainY)
            development_accuracy = self.log_reg.score(self.devX, self.devY)
            if development_accuracy > best_dev_accuracy:
                best_dev_accuracy=development_accuracy
                best_model=self.log_reg

            print("C: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (C, training_accuracy, development_accuracy))
            
        self.log_reg=best_model
        
        # save prediction
        self.train_pred = self.log_reg.predict(self.trainX)
        self.dev_pred = self.log_reg.predict(self.devX)
        

    def test(self):
        
        # save prediction
        self.test_pred = self.log_reg.predict(self.testX)
        
        return self.log_reg.score(self.testX, self.testY)
        

    def printWeights(self, n=10):

        reverse_vocab=[None]*len(self.log_reg.coef_[0])
        for k in self.feature_vocab:
            reverse_vocab[self.feature_vocab[k]]=k

        # binary
        if len(self.log_reg.classes_) == 2:
              weights=self.log_reg.coef_[0]

              cat=self.log_reg.classes_[1]
              for feature, weight in list(reversed(sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))))[:n]:
                  print("%s\t%.3f\t%s" % (cat, weight, feature))
              print()

              cat=self.log_reg.classes_[0]
              for feature, weight in list(sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1)))[:n]:
                  print("%s\t%.3f\t%s" % (cat, weight, feature))
              print()

        # multiclass
        else:
          for i, cat in enumerate(self.log_reg.classes_):

              weights=self.log_reg.coef_[i]

              for feature, weight in list(reversed(sorted(zip(reverse_vocab, weights), key = operator.itemgetter(1))))[:n]:
                  print("%s\t%.3f\t%s" % (cat, weight, feature))
              print()

    # Plot confusion matrix on test data
    def plot_confusion_matrix(self):
        cm = confusion_matrix(self.testY, self.test_pred, normalize="all")
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=self.log_reg.classes_)
        disp.plot()
        
    # Print random samples of TN, FP, FN, TP
    def print_confusion_matrix_examples(self, n=5):
        print("Implementing")

In [7]:
def confidence_intervals(accuracy, n, significance_level):
    critical_value=(1-significance_level)/2
    z_alpha=-1*norm.ppf(critical_value)
    se=math.sqrt((accuracy*(1-accuracy))/n)
    return accuracy-(se*z_alpha), accuracy+(se*z_alpha)

In [11]:
def run(trainingFile, devFile, testFile, feature_method):
    trainX, trainY=load_data(trainingFile)
    devX, devY=load_data(devFile)
    testX, testY=load_data(testFile)
    
    simple_classifier = Classifier(feature_method, trainX, trainY, devX, devY, testX, testY)
    simple_classifier.train()
    accuracy=simple_classifier.test()
    
    lower, upper=confidence_intervals(accuracy, len(testY), .95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

    simple_classifier.printWeights()
    
    return simple_classifier

In [12]:
def binary_bow_featurize(text):
    feats = {}
    words = nltk.word_tokenize(text)

    for word in words:
        word=word.lower()
        feats[word]=1
            
    return feats

In [None]:
trainingFile = "train.txt"
devFile = "dev.txt"
testFile = "test.txt"
    
model = run(trainingFile, devFile, testFile, binary_bow_featurize)

C: 0.1, Train accuracy: 0.960, Dev accuracy: 0.790
C: 1, Train accuracy: 1.000, Dev accuracy: 0.720
C: 10, Train accuracy: 1.000, Dev accuracy: 0.730
C: 100, Train accuracy: 1.000, Dev accuracy: 0.720
Test accuracy for best dev model: 0.750, 95% CIs: [0.665 0.835]

pos	0.454	3
pos	0.313	30
pos	0.301	4
pos	0.270	it
pos	0.266	glass
pos	0.257	cut
pos	0.251	preheat
pos	0.250	around
pos	0.249	if
pos	0.248	with

neg	-0.285	season
neg	-0.262	warm
neg	-0.261	powder
neg	-0.242	vinegar
neg	-0.221	soda
neg	-0.216	more
neg	-0.206	garlic
neg	-0.201	hour
neg	-0.191	parmesan
neg	-0.187	bell



In [16]:
# preprocessing 
# 1. lower
# 2. remove stop words
# 3. convert number to same value

def custom_featurize(text):
    stop_words = set(stopwords.words('english'))
    feats = {}
    
    # word_length
    feats['num_tokens'] = len(text.split())
    
    # tokenize
    words = nltk.word_tokenize(text)
            
    for word in words:
        # remove stopwords
        if word in stop_words:
            continue
        
        # convert to lowercase
        word = word.lower()

        # convert numbers to 'num_symbol'
        word = 'num_symbol' if word.isnumeric() else word
            
        feats[word] = 1
    
    return feats

In [18]:
model_preprocessed = run(trainingFile, devFile, testFile, custom_featurize)

C: 0.1, Train accuracy: 0.933, Dev accuracy: 0.760
C: 1, Train accuracy: 1.000, Dev accuracy: 0.780
C: 10, Train accuracy: 1.000, Dev accuracy: 0.770
C: 100, Train accuracy: 1.000, Dev accuracy: 0.770
Test accuracy for best dev model: 0.790, 95% CIs: [0.710 0.870]

pos	0.603	around
pos	0.591	tablespoons
pos	0.587	endive
pos	0.579	toast
pos	0.575	3/4
pos	0.556	make
pos	0.550	broth
pos	0.541	mustard
pos	0.523	still
pos	0.518	glass

neg	-0.808	season
neg	-0.708	warm
neg	-0.679	soda
neg	-0.671	powder
neg	-0.634	hour
neg	-0.596	parmesan
neg	-0.557	garlic
neg	-0.553	450°f
neg	-0.549	bell
neg	-0.540	often

