In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
import string

In [10]:
# Column labels for the two fields of the headlines file
columnName = ['Headlines', 'Sentiment']

# Read the tab-separated labelled headlines; header=None makes pandas treat
# the first line as data rather than as column names
data = pd.read_csv(
    './datasets/headlines_labelled.txt',
    sep='\t',
    header=None,
)
data.columns = columnName

# Preview the first rows
data.head()

Unnamed: 0,Headlines,Sentiment
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1
2,Don t Use Telegram s New People Nearby Feature.,0
3,UPDATE 1 Bitcoin hits one week low as rising U...,0
4,Jack Dorsey criticized proposed cryptocurrency...,0


In [31]:
# Sentiment scoring uses NLTK's VADER analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Start from the analyzer's stock lexicon
vader = SentimentIntensityAnalyzer()

# Market-specific terms and the valence assigned to each of them.
# NOTE(review): VADER appears to look tokens up one word at a time, so the
# multi-word keys below (e.g. 'record high', 'one week low') probably never
# match a headline -- verify against the NLTK implementation.
# NOTE(review): these magnitudes are far larger than the stock VADER valences
# (roughly -4..+4), which is presumably why matched headlines show compound
# scores pinned near +/-0.997 -- confirm this saturation is intended.
new_words = {
    'crushes': 10, 'beats': 5, 'misses': -5, 'trouble': -10,
    'falls': -100, 'slides': -50, 'record high': 15, 'low': -15,
    'one week low': -30, 'worth more': 5, 'digital gold': 5, 'high': 15,
    'cryptocurrency fund': 10, 'up': 5, 'soars': 70, 'rebound': 20,
    'pullback': -40, 'slumps': -60, 'jumps': 50, 'record low': -100,
    'soaring': 70, 'bearish': -50, 'bullish': 50, 'bulls': 10,
    'bears': -10, 'hodl': 10, 'pulls back': -40, 'selloff': -70,
    'retrace': -70, 'drop': -50, 'buying': 10, 'selling': -10,
    'rally': 15, 'bounces': 20, 'testing support': -5, 'climb': 5,
    'rise': 20, 'crashes': -100, 'crash': -100, 'downward': -30,
}

# Merge the custom terms into the analyzer's lexicon; entries that already
# exist there are overwritten with the values above
vader.lexicon.update(new_words)

print('ok!')

ok!


In [32]:
# Score every headline with VADER; each result is a dict with the keys
# 'neg', 'neu', 'pos' and 'compound'
scores = data['Headlines'].apply(vader.polarity_scores)

# Expand the dicts into one column per score. Reusing data's index keeps every
# score row attached to its own headline in the join below, even when `data`
# does not carry the default 0..n-1 index.
scores_df = pd.DataFrame(scores.tolist(), index=data.index)

# Append the score columns to the headlines (index-aligned join)
scored_news = data.join(scores_df)

In [37]:
# Show the headlines together with their VADER score columns
scored_news

Unnamed: 0,Headlines,Sentiment,neg,neu,pos,compound
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0,0.836,0.164,0.000,-0.9970
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1,0.000,1.000,0.000,0.0000
2,Don t Use Telegram s New People Nearby Feature.,0,0.000,1.000,0.000,0.0000
3,UPDATE 1 Bitcoin hits one week low as rising U...,0,0.381,0.238,0.381,0.0000
4,Jack Dorsey criticized proposed cryptocurrency...,0,0.200,0.800,0.000,-0.3612
...,...,...,...,...,...,...
278,Guggenheim CIO expects Bitcoin to drop to 20 000,0,0.864,0.136,0.000,-0.9970
279,drop,0,1.000,0.000,0.000,-0.9970
280,Bitcoin Price Could Retrace to $20K This Year:...,0,0.917,0.083,0.000,-0.9995
281,Why did Bitcoin fall below $33K? Coinbase whal...,0,0.000,1.000,0.000,0.0000


In [34]:
# Disabled duplicate of the CSV export that a later cell performs once the label columns exist.
#scored_news.to_csv(r'.\sentiment.csv', index=False)

In [48]:
# Translate the numeric Sentiment column into text labels:
# values above 0 become 'pos', everything else becomes 'neg'
scored_news['label'] = (scored_news['Sentiment'] > 0).map({True: 'pos', False: 'neg'})

In [49]:
# Derive the VADER-based label from the compound score: 0 and above counts
# as 'pos' (so a headline with a compound of exactly 0 is labelled 'pos'),
# anything below 0 counts as 'neg'
scored_news['given_label'] = scored_news['compound'].ge(0).map({True: 'pos', False: 'neg'})

In [51]:
# Share of headlines whose VADER-derived label equals the annotated label
accuracy_score(
    y_true=scored_news['label'],
    y_pred=scored_news['given_label'],
)

0.7455830388692579

In [46]:
# Write the scored headlines to sentiment.csv in the working directory,
# without the index column. A forward slash is used because a backslash path
# such as r'.\sentiment.csv' only means "current directory" on Windows;
# on other systems the backslash becomes part of the file name.
scored_news.to_csv('./sentiment.csv', index=False)

In [47]:
# Re-display the scored frame.
# NOTE(review): the saved output of this cell has no 'label'/'given_label'
# columns and its execution count (47) precedes the labelling cells (48, 49),
# so it looks like it was last run before those columns were added -- re-run
# the notebook top to bottom to refresh it.
scored_news

Unnamed: 0,Headlines,Sentiment,neg,neu,pos,compound
0,UPDATE 1 Bitcoin slides more than 5 after topp...,0,0.836,0.164,0.000,-0.9970
1,UPDATE 1 Bitcoin trades near Sunday record of ...,1,0.000,1.000,0.000,0.0000
2,Don t Use Telegram s New People Nearby Feature.,0,0.000,1.000,0.000,0.0000
3,UPDATE 1 Bitcoin hits one week low as rising U...,0,0.381,0.238,0.381,0.0000
4,Jack Dorsey criticized proposed cryptocurrency...,0,0.200,0.800,0.000,-0.3612
...,...,...,...,...,...,...
278,Guggenheim CIO expects Bitcoin to drop to 20 000,0,0.864,0.136,0.000,-0.9970
279,drop,0,1.000,0.000,0.000,-0.9970
280,Bitcoin Price Could Retrace to $20K This Year:...,0,0.917,0.083,0.000,-0.9995
281,Why did Bitcoin fall below $33K? Coinbase whal...,0,0.000,1.000,0.000,0.0000


In [53]:
# Cross-tabulate annotated labels against VADER-derived labels. By
# scikit-learn's convention rows are the true classes and columns the
# predicted ones, with classes in sorted order ('neg' before 'pos').
label_confusion = confusion_matrix(
    y_true=scored_news['label'],
    y_pred=scored_news['given_label'],
)
print(label_confusion)

[[ 59  49]
 [ 23 152]]
