In [61]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
from collections import Counter

### load & inspect data

In [62]:
dm = pd.read_excel('Sentiment_Coin_Labelling.xlsx')
dm = dm[["Text","Sentiment", "Coin"]]
dm["Coin"].value_counts()

ETH             422
BTC             421
DOGE            316
crypto_space      6
Name: Coin, dtype: int64

In [63]:
dm["Sentiment"].value_counts()

bullish    144
bearish     80
neutral     41
Name: Sentiment, dtype: int64

### build coin model

In [64]:
data = dm.dropna(subset=["Coin", "Sentiment"])[["Text", "Coin", "Sentiment"]]

In [65]:
data

Unnamed: 0,Text,Coin,Sentiment
0,could outperform bitcoin and is watching it ve...,BTC,bearish
1,has a slightly more nuanced opinion than many ...,BTC,bearish
2,be more um sound from a monetary perspective t...,ETH,bullish
3,so in bitcoin hard forks are extremely rare so...,ETH,bearish
4,market and but nevertheless the minority a hun...,ETH,bullish
...,...,...,...
456,hodlers that ended up capitulating at a loss m...,BTC,bullish
457,okay let's get to the most recent altcoin news...,DOGE,bearish
458,he doesn't want the power so he has burned alm...,DOGE,bearish
461,end of may take a look at your screen this is ...,ETH,bullish


In [66]:
corpus = list(data['Text'].values)
coin = list(data['Coin'].values)
sentiment = list(data['Sentiment'].values)

In [67]:
# sample 30% class as validation set

In [68]:
X_train, X_test, y_train, y_test = train_test_split(corpus, coin, test_size=0.3, stratify=coin, random_state=1)

In [69]:
Counter(y_test)

Counter({'ETH': 30, 'BTC': 27, 'DOGE': 21, 'crypto_space': 2})

In [70]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
"evaluation Acc.:{:.3f}".format(np.mean(predicted == y_test))

'evaluation Acc.:0.812'

#### build sentiment model

In [55]:
Counter(sentiment)

Counter({'bearish': 80, 'bullish': 144, 'neutral': 41})

In [56]:
X_train, X_test, y_train, y_test = train_test_split(corpus, sentiment, test_size=0.3, stratify=sentiment, random_state=1)

In [72]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
"evaluation Acc.:{:.3f}".format(np.mean(predicted == y_test))

'evaluation Acc.:0.562'