In [465]:
import string

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy import displacy

In [466]:
# Loading the small English spaCy pipeline ("en_core_web_sm") as `nlp`;
# dataCleaning() below calls it to tokenize and lemmatize each sentence
nlp = spacy.load("en_core_web_sm")

In [467]:
# Collect spaCy's English stopword set into a list (consulted by dataCleaning
# when filtering tokens) and report how many entries it holds
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = [*STOP_WORDS]
print(len(stopwords))

326


In [468]:
# Loading the labelled headlines dataset: a tab-separated text file with no
# header row, so pandas assigns the default integer column names 0 and 1
data = pd.read_csv('./datasets/headlines_labelled.txt',
                        sep='\t', header= None)
data.head()

Unnamed: 0,0,1
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1
2,Don t Use Telegram s New People Nearby Feature.,0
3,UPDATE 1 Bitcoin hits one week low as rising U...,0
4,Jack Dorsey criticized proposed cryptocurrency...,0


In [469]:
# Adding column names to the dataframe: column 0 holds the headline text
# ('Review') and column 1 its 0/1 label ('Sentiment').
# Note: this renames the columns of `data` in place.
columnName = ['Review','Sentiment']
data.columns = columnName
data.head()

Unnamed: 0,Review,Sentiment
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1
2,Don t Use Telegram s New People Nearby Feature.,0
3,UPDATE 1 Bitcoin hits one week low as rising U...,0
4,Jack Dorsey criticized proposed cryptocurrency...,0


In [470]:
# Number of (rows, columns) in the dataset
print(data.shape)

(278, 2)


In [471]:
# Sentiment distribution in the dataset (number of rows per label)
data.Sentiment.value_counts()

1    174
0    104
Name: Sentiment, dtype: int64

In [472]:
# Counting the null entries in each column of the dataset
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [473]:
# ASCII punctuation characters as a single string; dataCleaning() uses it to
# drop punctuation tokens
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [474]:
def dataCleaning(sentence):
    """Tokenize a sentence into normalized tokens for the TF-IDF vectorizer.

    The sentence is run through the module-level spaCy pipeline ``nlp``. Each
    token is replaced by its lowercased, stripped lemma, then punctuation and
    stopwords are filtered out.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        The remaining tokens in document order. An empty list is returned when
        the sentence yields no tokens (the previous implementation raised
        ``UnboundLocalError`` in that case because the result list was only
        created inside the token loop).
    """
    tokens = []
    for token in nlp(sentence):
        if token.lemma_ != '-PRON-':
            tokens.append(token.lemma_.lower().strip())
        else:
            # '-PRON-' is a placeholder lemma (emitted by older spaCy releases
            # for pronouns); fall back to the lowercased surface form instead.
            tokens.append(token.lower_)

    # `punct` is a plain string, so `tok not in punct` is a substring test:
    # it removes single punctuation characters and also empty strings left
    # over after strip(). `stopwords` is the module-level stopword list.
    return [tok for tok in tokens if tok not in punct and tok not in stopwords]

In [475]:
# Sanity check: run the tokenizer on one sample headline
dataCleaning("UPDATE 1 Bitcoin slides more than 5 after topping 40000 for first time")

['update', '1', 'bitcoin', 'slide', '5', '40000', 'time']

In [476]:
# Splitting the data into train (80%) and test (20%) sets.
# A fixed random_state makes the shuffle deterministic, so the split and the
# metrics computed from it are reproducible across "Restart & Run All".
X = data['Review']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape,y_test.shape)

(222,) (56,)


In [477]:
# Creating the model and pipeline:
#   1. TF-IDF features, using dataCleaning as the tokenizer
#   2. a linear support-vector classifier trained on those features
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [478]:
# Training the model: fits the TF-IDF vectorizer and then the SVM on the
# training split
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x00000231807CACA0>)),
                ('svm', LinearSVC())])

In [479]:
# Predicting labels for the held-out test split
y_pred = pipe.predict(X_test)

In [480]:
# Printing the classification report (per-label precision/recall/F1) and the
# confusion matrix for the test-set predictions
print(classification_report(y_test,y_pred))
print("\n\n")
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.58      0.63        19
           1       0.80      0.86      0.83        37

    accuracy                           0.77        56
   macro avg       0.74      0.72      0.73        56
weighted avg       0.76      0.77      0.76        56




[[11  8]
 [ 5 32]]


In [481]:
# Testing on an ad-hoc input: predict the label of a single hand-written
# headline (predict expects an iterable of strings, hence the list)
pipe.predict(["Bitcoin enters corrective period generating weekly losses of 3 80"])

array([0], dtype=int64)