In [174]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import os
import re
from collections import defaultdict

In [175]:
train_data = pd.read_csv('advanced_trainset.csv')
train_data['Encode'] = train_data.Sentiment.apply(lambda x: 1 if x == 'positive' else -1 if x == 'negative' else 0)
train_data.head()

Unnamed: 0,Sentence,Sentiment,Encode
0,According to the Finnish-Russian Chamber of Co...,neutral,0
1,The Swedish buyout firm has sold its remaining...,neutral,0
2,$SPY wouldn't be surprised to see a green close,positive,1
3,Shell's $70 Billion BG Deal Meets Shareholder ...,negative,-1
4,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative,-1


In [176]:
sentiment_counts = train_data['Sentiment'].value_counts().reset_index()
sentiment_counts

Unnamed: 0,index,Sentiment
0,neutral,2363
1,positive,1383
2,negative,636


In [177]:
import spacy
from spacy import displacy
nlp = spacy.load('en')

def subject(sentence):
    doc = nlp(sentence)
    subject_toks = [tok for tok in doc if (tok.dep_ == 'nsubj' or tok.pos_ == 'propn')]
    return subject_toks

In [178]:
train_data['Subject'] = train_data['Sentence'].apply(subject)

In [179]:
train_data

Unnamed: 0,Sentence,Sentiment,Encode,Subject
0,According to the Finnish-Russian Chamber of Co...,neutral,0,[companies]
1,The Swedish buyout firm has sold its remaining...,neutral,0,[Swedish]
2,$SPY wouldn't be surprised to see a green close,positive,1,[SPY]
3,Shell's $70 Billion BG Deal Meets Shareholder ...,negative,-1,[]
4,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative,-1,"[Company, results]"
...,...,...,...,...
4377,Investments in product development stood at 6....,neutral,0,[Investments]
4378,HSBC Says Unit to Book $585 Million Charge on ...,negative,-1,[HSBC]
4379,RISING costs have forced packaging producer Hu...,negative,-1,[costs]
4380,"In the building and home improvement trade , s...",neutral,0,[sales]


In [180]:
import string
nltk.download('stopwords')
nltk.download('punkt')

stops = set(stopwords.words('english'))
# special cases
stops.add("'s")
stops.add("``")
stops.add("--")

stops.remove('not')
stops.remove('against')
from nltk.tokenize import word_tokenize
from string import punctuation

def clean(sentence):
    # delete stopwords
    temp = ' '.join(filter(lambda x: x not in stops, sentence.split()))
    # Remove punctuation
    temp = temp.translate(str.maketrans('', '', punctuation))
    # Delete short words with 3 or less characters
    # temp = (lambda x: ' '.join([w for w in x.split() if len(w) > 3]))(temp)
    # Change all to lower case
    temp = temp.lower()
    # Delete numbers
    temp = re.sub(r'[0-9]', '', temp)
    # Delete excessive spaces and return
    return re.sub('. ', ' ', temp)

def remove_punctuation(text):
    result = [w for w in text if w not in string.punctuation.replace('-', '')]
    return ''.join(result)

train_data['Sentence'] = train_data['Sentence'].apply(remove_punctuation)

def remove_stopwords(text):
    stopword = nltk.corpus.stopwords.words('english')
    stopword.remove('not')
    result = [w for w in nltk.word_tokenize(text) if w not in stopword]
    return ' '.join(result)

train_data['Sentence'] = train_data['Sentence'].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [209]:
train_data

Unnamed: 0,Sentence,Sentiment,Encode,Subject
0,accordin finnishrussia chambe commerc majo con...,neutral,0,[companies]
1,th swedis buyou fir sol remainin percen stak ...,neutral,0,[Swedish]
2,sp wouldn surprise se gree close,positive,1,[SPY]
3,shell billio b dea meet shareholde skepticism,negative,-1,[]
4,ss communication securit cor stoc exchang rele...,negative,-1,"[Company, results]"
...,...,...,...,...
4377,investment produc developmen stoo ml eur mln,neutral,0,[Investments]
4378,hsb say uni boo millio charg settlement,negative,-1,[HSBC]
4379,risin cost force packagin produce huhtamak ax ...,negative,-1,[costs]
4380,i buildin hom improvemen trad sale decrease e...,neutral,0,[sales]


In [215]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(max_features = 2000, ngram_range = (1, 2), norm = 'l2')
vector.fit(train_data['Sentence'])
vector_X = vector.transform(train_data['Sentence'])

In [217]:
tfidf_matrix = vector.fit_transform(train_data['Sentence'])
tfidf_tokens = vector.get_feature_names()
df_tfidf = pd.DataFrame(data = tfidf_matrix.toarray(), columns = tfidf_tokens)
print(df_tfidf)

      aap   ab  abl  abou  acces  accordanc  accordin  accordin finnis  \
0     0.0  0.0  0.0   0.0    0.0        0.0  0.371839              0.0   
1     0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
2     0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
3     0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
4     0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
...   ...  ...  ...   ...    ...        ...       ...              ...   
4377  0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
4378  0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
4379  0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
4380  0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   
4381  0.0  0.0  0.0   0.0    0.0        0.0  0.000000              0.0   

      account  accounte  ...  yearonyea eu  yearonyear  years  yesterda  yho  \
0         0.0       0.0  ...   


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [222]:
print(np.nonzero(tfidf_matrix))
print(tfidf_matrix[np.nonzero(tfidf_matrix)])
print(np.mean(tfidf_matrix[np.nonzero(tfidf_matrix)]))

(array([   0,    0,    0, ..., 4381, 4381, 4381], dtype=int32), array([1523, 1195,  636, ...,  688,  980,  774], dtype=int32))
[[0.41316714 0.29094031 0.32019275 ... 0.25251404 0.27613971 0.18930853]]
0.3018120516106185


In [213]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C = 10, solver = 'lbfgs', max_iter = 10000, random_state = 1)
clf = model.fit(vector_X, train_data['Sentiment'])
clf.score(vector_X, train_data['Sentiment'])

0.9290278411684163

In [214]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(vector_X, train_data['Sentiment'], test_size = 0.3, random_state = 1)
clf = model.fit(X_train, y_train)
predicted = clf.predict(X_test)
print('Logistic Regression Accuracy: ', metrics.accuracy_score(y_test, predicted))

Logistic Regression Accuracy:  0.6494296577946768


In [202]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
clf = model.fit(vector_X, train_data['Sentiment'])
clf.score(vector_X, train_data['Sentiment'])

0.5460976722957553

In [203]:
X_train, X_test, y_train, y_test = train_test_split(vector_X, train_data['Sentiment'], test_size = 0.3, random_state = 1)
clf = model.fit(X_train, y_train)
predicted = clf.predict(X_test)
print('Naive Bayes Accuracy: ', metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy:  0.5300380228136882
