In [1]:
# utilities
import re
import numpy as np
import pandas as pd
import string

from tqdm import tqdm
tqdm.pandas()


# plotting
import seaborn as sns
#from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import confusion_matrix, classification_report






# Helper that prints a fitted model's evaluation metrics and plots its confusion matrix
def model_Evaluate(model, X=None, y=None):
    """Print a classification report and draw a confusion-matrix heatmap.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X, y : optional
        Features and true labels to evaluate on. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` are used, so existing
        ``model_Evaluate(clf)`` calls behave exactly as before.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied; mixing a caller-supplied
        half with the global half would silently evaluate mismatched data.

    Notes
    -----
    The annotation grid is reshaped to 2x2, so this helper only works for a
    two-class problem.
    """
    if (X is None) != (y is None):
        raise ValueError("Pass both X and y, or neither to use X_test / y_test.")
    if X is None:
        # Fall back to the split created in the training cell.
        X, y = X_test, y_test

    # Predict values for the evaluation dataset
    y_pred = model.predict(X)

    # Print the evaluation metrics for the dataset.
    print(classification_report(y, y_pred))

    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y, y_pred)

    categories  = ['Negative','Positive']
    # Row-major order of a binary confusion matrix: rows = actual, cols = predicted.
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    cell_text = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    cell_text = np.asarray(cell_text).reshape(2,2)

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cf_matrix, annot = cell_text, cmap = 'Blues',fmt = '',
                xticklabels = categories, yticklabels = categories)

    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

#spacy clean text
import spacy #load spacy
# The parser, tagger and NER pipes are disabled when loading the model.
# NOTE(review): in spaCy 3.x the default English lemmatizer relies on
# tagger/attribute_ruler output - confirm lemmas are still produced with this
# pipe list on the installed spaCy version.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])
# Stored as a set: membership is tested once per token in text_preprocessing_spacy.
stops = set(stopwords.words("english"))

# Raw string so the regex escapes (\w, \/, \S) are not treated as (invalid)
# Python string escapes; compiled once because it is applied to every document.
# Matches @mentions, any character that is not alphanumeric/space/tab, and URLs.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def regex_magic(x):
    """Replace every noise match in ``x`` with a space and collapse whitespace."""
    return ' '.join(_NOISE_PATTERN.sub(" ", x).split())

def text_preprocessing_spacy(comment, remove_stopwords):
    """Turn a raw comment into a single space-joined string of lemmas.

    The text is lower-cased, ``http...`` links are removed, ``regex_magic`` is
    applied, and the result is run through the ``nlp`` pipeline. Each token's
    stripped lemma is kept when it is non-empty and, if ``remove_stopwords``
    is truthy, not present in ``stops``.
    """
    lowered = comment.lower()
    without_links = re.sub(r'http\S+', '', lowered)
    doc = nlp(regex_magic(without_links))

    stripped_lemmas = (token.lemma_.strip() for token in doc)
    kept = [
        lemma
        for lemma in stripped_lemmas
        if lemma and not (remove_stopwords and lemma in stops)
    ]
    return " ".join(kept)


In [None]:
# Importing the dataset
# The file is read without a header row, so column names are supplied here.
# usecols keeps only the two columns used below instead of loading all six
# and discarding four afterwards.
# NOTE(review): absolute, machine-specific path - other cells use '../Datasets/'.
sent140 = pd.read_csv(r"/home/s192851/CryptoSent_Heisenberg/Datasets/sentiment140.csv",
                      encoding="ISO-8859-1" , names=["sentiment", "ids", "date", "flag", "user", "text"],
                      usecols=["sentiment", "text"])
# Recode the label value 4 as 1.
sent140['sentiment'] = sent140['sentiment'].replace(4,1)
#preprocessing the text (stop words removed)
sent140['text_clean']=sent140.text.progress_apply(lambda x: text_preprocessing_spacy(x,True))

In [None]:
# Hold out 5% of the cleaned texts for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(sent140['text_clean'],
                                                    sent140.sentiment,
                                                    test_size = 0.05, random_state = 42)
# NOTE: despite its name, `word2vec` is a TF-IDF vectorizer (unigrams + bigrams,
# capped at 50,000 features). The name is kept because get_sentiment refers to it.
word2vec = TfidfVectorizer(ngram_range=(1,2), max_features=50000)
# fit_transform learns the vocabulary and vectorizes the training texts in one
# pass instead of analysing the training corpus twice.
X_train = word2vec.fit_transform(X_train)
X_test  = word2vec.transform(X_test)

#Logistic regression classifier
clf = LogisticRegression(C=2,n_jobs=-1,max_iter=10000)
#Train the model
clf.fit(X_train, y_train)
# Reports on the X_test / y_test split created above.
model_Evaluate(clf)

              precision    recall  f1-score   support

           0       0.79      0.76      0.78     39999
           1       0.77      0.80      0.79     40001

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000



In [None]:
# Getting the sentiment of a single cleaned text (tweet or comment)
def get_sentiment(text, threshold=.6):
    """Score one text with the fitted vectorizer (``word2vec``) and model (``clf``).

    Parameters
    ----------
    text : str
        A single document; it is wrapped in a one-element list before being
        vectorized. Presumably it should already be cleaned the same way as
        the training texts - verify against the caller.
    threshold : float, default 0.6
        A class is reported only when its probability is strictly greater
        than this value. The negative class is checked first.

    Returns
    -------
    tuple
        ``(neg, pos, sentiment)`` where ``neg`` / ``pos`` are the two values
        from ``clf.predict_proba`` and ``sentiment`` is a string label.
    """
    features = word2vec.transform([text])
    neg, pos = clf.predict_proba(features)[0]
    if neg > threshold:
        sentiment = 'Negative'
    elif pos > threshold:
        sentiment = 'Positive'
    else:
        # NOTE(review): 'Netural' is a misspelling of 'Neutral'. It is kept
        # byte-for-byte because this label is already stored in the saved
        # comments pickle; rename only together with every consumer.
        sentiment = 'Netural'
    return neg, pos, sentiment

def list2doc(x):
    """Join an iterable of string tokens into one space-separated document."""
    return ' '.join(x)

In [30]:
# Clean every raw comment body (stop words removed) into a new `body_clean` column.
# NOTE(review): `bit` is not created anywhere above this cell in the visible
# notebook (it is only loaded from the pickle in a later cell) - confirm
# where the raw frame comes from before a Restart & Run All.
bit['body_clean'] = bit['body'].progress_apply(
    lambda raw_body: text_preprocessing_spacy(raw_body, True)
)


100%|██████████| 15061011/15061011 [49:00<00:00, 5122.46it/s]


In [33]:
# Save the `bit` frame to the shared comments pickle file.
bit.to_pickle('../Datasets/comments.pickle')

In [5]:

# Load the comments frame back from the pickle file into `bit`.
# Pickle files can execute code on load - only read files you produced yourself.
bit=pd.read_pickle('../Datasets/comments.pickle')

In [6]:
# Score each cleaned comment; every cell of the new column holds the
# (neg, pos, label) tuple returned by get_sentiment.
bit['sentiment_scores_lr'] = bit['body_clean'].progress_apply(get_sentiment)

100%|██████████| 15061011/15061011 [3:27:06<00:00, 1211.96it/s]


In [7]:
# Write `bit` back to the same pickle path, replacing the previously saved file.
bit.to_pickle('../Datasets/comments.pickle')

In [8]:
# Re-read the saved pickle and show it as the cell output (the result is not assigned).
pd.read_pickle('../Datasets/comments.pickle')

Unnamed: 0_level_0,author,author_fullname,author_flair_text,body,distinguished,id,is_submitter,no_follow,link_id,parent_id,...,subreddit_id,award_name,award_description,award_count,award_coin_price,award_coin_reward,created,author_created,body_clean,sentiment_scores_lr
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 01:00:20,CommunistAndy,t2_1c4y3398,"Crypto Expert | QC: ETH 22, BCH 20, BUTT 3",DOGE!,,eczazky,False,True,t3_ab4caa,t1_ecz56lx,...,t5_2wlj3,Empty,Empty,Empty,Empty,Empty,2019-01-01 01:00:20,2018-05-09 15:25:59,doge,"(0.3881488102866917, 0.6118511897133083, Posit..."
2019-01-01 01:00:55,IGotThisYo,t2_f5ass,,You must have copied it from somewhere? You sh...,,eczb12y,False,True,t3_ab98uf,t1_ecz0sty,...,t5_2s3qj,Empty,Empty,Empty,Empty,Empty,2019-01-01 01:00:55,2014-02-06 08:19:25,must copy somewhere know copy,"(0.37687888336342323, 0.6231211166365768, Posi..."
2019-01-01 01:01:41,ggori,t2_504hnn,,&amp;#x200B;\n\nHappy new year boys &lt;3!,,eczb2xb,False,True,t3_aa0clv,t3_aa0clv,...,t5_2s3qj,Empty,Empty,Empty,Empty,Empty,2019-01-01 01:01:41,2017-06-16 20:40:43,amp x200b happy new year boy lt 3,"(0.12090860555734628, 0.8790913944426537, Posi..."
2019-01-01 01:02:05,antilex,t2_132607,,https://www.youtube.com/watch?v=xdJaDqm1RY4\n\...,,eczb3xw,False,True,t3_ab6yu2,t1_ecz4mtz,...,t5_2s3qj,Empty,Empty,Empty,Empty,Empty,2019-01-01 01:02:05,2016-11-25 06:07:06,dude piece chicken shit scum scour away tough ...,"(0.26370068406783476, 0.7362993159321652, Posi..."
2019-01-01 01:02:06,smellsliketuna,t2_4rqpy,,&gt;Free speech doesn’t apply to what you say ...,,eczb3z4,False,True,t3_ab2olb,t1_eczao8v,...,t5_2si5v,Empty,Empty,Empty,Empty,Empty,2019-01-01 01:02:06,2011-01-27 00:20:01,gt free speech apply say coworker customer emp...,"(0.3277071673173454, 0.6722928326826546, Posit..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-01 01:59:54,kaykurokawa,t2_ap8ql,,"If the politicians were smarter and younger, t...",,h3mfvqx,False,False,t3_oania1,t3_oania1,...,t5_2s3qj,Empty,Empty,Empty,Empty,Empty,2021-07-01 01:59:54,2013-02-23 06:42:59,politician smart young actually try work way a...,"(0.5153111155991015, 0.4846888844008985, Netural)"
2021-07-01 01:59:55,JohnnyTsunami1999,t2_3b0in4k,"Platinum | QC: CC 94, ADA 65",It’s the easiest staking user experience I’m a...,,h3mfvti,False,False,t3_ob6jz2,t3_ob6jz2,...,t5_2wlj3,Empty,Empty,Empty,Empty,Empty,2021-07-01 01:59:55,2018-01-12 01:02:14,easy stake user experience aware forget though...,"(0.4533763414602753, 0.5466236585397247, Netural)"
2021-07-01 01:59:55,DDD420247,t2_4f8aqtur,,"It's not worth arguing over it, my pops is the...",,h3mfvu0,False,True,t3_oawc0n,t3_oawc0n,...,t5_2s3qj,Empty,Empty,Empty,Empty,Empty,2021-07-01 01:59:55,2020-10-03 05:35:28,worth argue pop way stubborn heck go change mi...,"(0.297838898932427, 0.702161101067573, Positive)"
2021-07-01 01:59:56,Wonzky,t2_15g8bh,Bronze,"High market cap, flexible staking, marketed a ton",,h3mfvw6,False,True,t3_ob6jz2,t3_ob6jz2,...,t5_2wlj3,Empty,Empty,Empty,Empty,Empty,2021-07-01 01:59:56,2017-02-16 07:53:20,high market cap flexible stake market ton,"(0.11167811297384034, 0.8883218870261597, Posi..."
