In [1]:
#!pip install scikit-learn

In [2]:
#!pip install -U spacy

In [3]:
#!python -m spacy download en

In [4]:
#!python -m spacy download en_core_web_sm

In [5]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

In [6]:
nlp = spacy.load('en_core_web_sm')

In [7]:
text = "this is the first. this is the second. this is the third"

In [8]:
doc = nlp(text)

In [9]:
text

'this is the first. this is the second. this is the third'

In [10]:
for token in doc:
    print(token)

this
is
the
first
.
this
is
the
second
.
this
is
the
third


In [11]:
doc = nlp(text)

In [12]:
for sent in doc.sents:
    print(sent)

this is the first.
this is the second.
this is the third


In [13]:
stopwords = list(STOP_WORDS)

In [14]:
for token in doc:
    if token.is_stop == False:
        print(token)

.
second
.


In [15]:
doc = nlp("run running runs runner")

In [16]:
for lem in doc:
    print(lem.text, lem.lemma_)

run run
running run
runs run
runner runner


In [38]:
data_yelp = pd.read_csv('../trainingDataForSentiment/yelp_labelled.txt', sep='\t', header = None)
data_amazon = pd.read_csv('../trainingDataForSentiment/amazon_cells_labelled.txt', sep='\t', header = None)
data_imdb = pd.read_csv('../trainingDataForSentiment/imdb_labelled.txt', sep='\t', header = None)

reddit_test = pd.read_csv('../trainingDataForSentiment/reddit_test.txt', sep='\t', header = None)


In [18]:
columns_name = ['Review', 'Sentiment']
data_yelp.columns = columns_name
data_amazon.columns = columns_name
data_imdb.columns = columns_name
reddit_test.columns = columns_name

In [19]:
print(data_yelp.shape)
print(data_amazon.shape)
print(data_imdb.shape)
print(reddit_test.shape)

(1000, 2)
(1000, 2)
(748, 2)
(10, 2)


In [20]:
data = data_yelp.append([data_amazon, data_imdb], ignore_index=True)
data = shuffle(data)
print(data.head)

<bound method NDFrame.head of                                                  Review  Sentiment
2554            Meredith M was better than all right.            1
384   WILL NEVER EVER GO BACK AND HAVE TOLD MANY PEO...          0
1922                                     Happy so far!.          1
1809  I have a Verizon LG phone and they work well t...          1
2191  ) What makes this story different are the terr...          1
...                                                 ...        ...
2541       What on earth is Irons doing in this film?            0
2180  This totally UNfunny movie is so over the top ...          0
300   Good beer & drink selection and good food sele...          1
1197                                        Bad Choice.          0
752   Level 5 spicy was perfect, where spice didn't ...          1

[2748 rows x 2 columns]>


In [21]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [22]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [23]:
punct = string.punctuation 

In [24]:
 punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower()
        tokens.append(temp)
    cleaned_tokens = []    
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [26]:
text_data_cleaning("Hello how are you. Like this video")

['hello', 'like', 'video']

In [27]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC()

In [28]:
X_reddit = reddit_test['Review']
Y_reddit = reddit_test['Sentiment']

X = data['Review']
y = data['Sentiment']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [30]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [31]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f9fbd9cfee0>)),
                ('clf', LinearSVC())])

In [32]:
y_pred = clf.predict(X_test)
type(reddit_test)
#type(X_test)

pandas.core.frame.DataFrame

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       266
           1       0.82      0.80      0.81       284

    accuracy                           0.81       550
   macro avg       0.81      0.81      0.81       550
weighted avg       0.81      0.81      0.81       550



In [34]:
confusion_matrix(y_test, y_pred)

array([[216,  50],
       [ 56, 228]])

In [35]:
clf.predict(["What do you guys Think on NOK will it or will it not. I think it will.","I LIKE NOK pt 2.","Be careful. Air Canada filed for bankruptcy the last recession. New management was structured with the same company name, but the existing shareholders were wiped out.","Remember when hertz filed for bankruptcy and you missed out on all those gainz? This is your shot. ","Do not invest in the stock of a bankrupt company. Bankruptcy is an indication of insolvency, meaning that the fair value of the liabilities (both recorded and unrecorded) exceed the fair value of the assets. It is very rare (not never, but very rare) that there are sufficient funds to fully compensate the secured and unsecured creditors, nevermind equity.","How did you arrive at 20% clean energy ETFs? I personally wouldn't allocate more than 1- 2% of my portfolio of those ETFs if I was rebalancing.","Yes, the god of losing 1/3rd of your money in 3 months. Here's to all the poor saps that bought into ARKK at the highs.","RIP my portfolio of G and K","Roku was a steal today. People thought it would pull a Netflix despite ad spending (Goog/FB) and HBO growth hitting records.","yay penny stocks!"])

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 1])

In [39]:
def predictSentiment(textToPredict):
    test = clf.predict([textToPredict])
    return test[0]

In [37]:
print(predictSentiment("This is cool"))

1
