In [1]:
%matplotlib inline
import string
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Set the default tick-label size to 14 for both the x and the y axis.
for tick_group in ('xtick', 'ytick'):
    matplotlib.rc(tick_group, labelsize=14)

In [13]:
# Load the labelled sentences.  Each non-blank line is read as
# "<sentence>\t<label>" (tab-separated); the first field is the sentence and
# the second field is the label.
with open("./sentiment-logistic-regression/sentiment_labelled_sentences/full_set.txt",
          encoding='utf-8') as f:
    content = [line.strip() for line in f]

# Drop blank lines (e.g. a trailing empty line at the end of the file); they
# have no tab-separated label and would otherwise raise an IndexError below.
content = [line for line in content if line]

# Split every line once and reuse the pieces for both columns.
fields = [line.split("\t") for line in content]
sentences = [parts[0] for parts in fields]
labels = [parts[1] for parts in fields]

# Parse the label strings as small integers, then apply y -> 2*y - 1, which
# maps a 0/1 label onto -1/+1 (the later cells select on y == -1 and y == 1).
y = np.array(labels, dtype='int8')
y = 2*y - 1

In [14]:
## Preprocessing the text data

In [15]:
def full_remove(x, removal_list):
    """Return ``x`` with every occurrence of each listed substring blanked out.

    The entries of ``removal_list`` are processed in order; each one is
    replaced everywhere in the running result by a single space character,
    so later entries see the output of earlier replacements.

    Parameters
    ----------
    x : str
        The text to clean.
    removal_list : iterable of str
        Substrings to replace with ``' '``.

    Returns
    -------
    str
        The cleaned text (``x`` itself is not modified).
    """
    cleaned = x
    for unwanted in removal_list:
        cleaned = cleaned.replace(unwanted, ' ')
    return cleaned

In [16]:
# The ten decimal digit characters '0' .. '9'.
digits = list(map(str, range(10)))
# Every character of string.punctuation, as individual one-character strings.
punctuation = list(string.punctuation)

# Step 1: replace each digit with a space.
digit_less = [full_remove(sentence, digits) for sentence in sentences]
# Step 2: replace each punctuation character with a space.
punc_less = [full_remove(sentence, punctuation) for sentence in digit_less]
# Step 3: lowercase everything.
sents_lower = list(map(str.lower, punc_less))

In [17]:
# Show the first two cleaned sentences as a quick check of the preprocessing.
sents_lower[:2]

['so there is no way for me to plug it in here in the us unless i go by a converter ',
 'good case  excellent value ']

In [24]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
# The encoding is given explicitly (matching the dataset load) so decoding does
# not depend on the platform's default locale.
with open("./stopwords", encoding='utf-8') as f:
    stop_word = [line.strip() for line in f]
len(stop_word)

186

In [28]:
# Split each cleaned sentence into whitespace-separated tokens.
sents_split = [x.split() for x in sents_lower]

# Membership tests against a set are constant-time, whereas scanning the
# stop-word list for every token is linear in the list length.
stop_word_set = set(stop_word)

# Drop stop words and glue the surviving tokens back together with single spaces.
sents_processed = [" ".join(token for token in tokens if token not in stop_word_set)
                   for tokens in sents_split]
sents_processed[0:10]

['way plug us unless go converter',
 'good case excellent value',
 'great jawbone',
 'tied charger conversations lasting minutes major problems',
 'mic great',
 'jiggle plug line right decent volume',
 'several dozen several hundred contacts imagine fun sending one one',
 'razr owner must',
 'needless say wasted money',
 'waste money time']

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation with the vocabulary capped at 4500 terms.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
                             max_features=4500)
data_features = vectorizer.fit_transform(sents_processed)
p_mat = data_features.toarray()

# Append one extra column of ones on the right of the count matrix
# (a constant feature): start from an all-ones array that is one column wider
# and overwrite everything except the last column with the counts.
data_mat = np.ones((p_mat.shape[0], p_mat.shape[1] + 1))
data_mat[:, :-1] = p_mat
print(data_mat.shape)

# Show the last two columns (final vocabulary column + the constant column)
# of the first three rows.  Indexing from the end keeps this correct even if
# the fitted vocabulary ends up smaller than max_features.
print(data_mat[0:3, -2:])

(3000, 4501)
[[0. 1.]
 [0. 1.]
 [0. 1.]]


In [51]:
# Training/test split
np.random.seed(0)
test_size = 500
# Half of the test set is drawn from each class.
each_test_size = test_size // 2

# Sample the test rows without replacement: negatives (y == -1) first, then
# positives (y == 1).  The order of the two draws is kept so that the seeded
# RNG produces the same split as before.
neg_inds = np.where(y == -1)[0]
pos_inds = np.where(y == 1)[0]
test_inds = np.append(np.random.choice(neg_inds, each_test_size, replace=False),
                      np.random.choice(pos_inds, each_test_size, replace=False))

# Everything not picked for testing is used for training.  Sorting makes the
# row order explicit instead of relying on set iteration order.
train_inds = sorted(set(range(len(labels))) - set(test_inds))
assert len(train_inds) + len(test_inds) == len(labels)

train_data = data_mat[train_inds,]
train_labels = y[train_inds]

test_data = data_mat[test_inds,]
test_labels = y[test_inds]

print("train data:", train_data.shape)
print("test data:", test_data.shape)

train data: (2500, 4501)
test data: (500, 4501)


In [56]:
## fit a logistic regression model
import sklearn
from sklearn.linear_model import SGDClassifier

# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and
# later releases reject the old spelling, so pick the name that the installed
# version understands.  Non-numeric version parts (e.g. "1rc1") are skipped,
# which falls back to the legacy name.
sk_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2] if part.isdigit())
logistic_loss = "log_loss" if sk_version >= (1, 1) else "log"

# penalty=None requests an unregularised fit; current scikit-learn documents
# None (rather than the string "none") for this.
# NOTE(review): random_state is left unset, as before -- confirm whether the
# run should be pinned to a fixed seed for reproducibility.
clf = SGDClassifier(loss=logistic_loss, penalty=None)
clf.fit(train_data, train_labels)

# Learned weight vector and intercept of the fitted linear model.
w = clf.coef_[0,:]
b = clf.intercept_

preds_train = clf.predict(train_data)
preds_test = clf.predict(test_data)

# Count the rows whose predicted sign disagrees with the sign of the label.
errs_train = np.sum((preds_train > 0.0) != (train_labels > 0.0))
errs_test = np.sum((preds_test > 0.0) != (test_labels > 0.0))

# Report the misclassification rate on each split.
print("Training error: ", float(errs_train)/len(train_labels))
print("Testing error: ", float(errs_test)/len(test_labels))

Training error:  0.0096
Testing error:  0.192
