In [None]:
!git clone --depth=1 https://github.com/twintproject/twint.git
!cd /content/twint && pip3 install . -r requirements.txt
!pip install autocorrect
!pip install nltk


In [2]:
import nltk
# Download the NLTK stop-word corpus; stopwords.words('english') is read from it
# in the import cell below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import nest_asyncio
nest_asyncio.apply()

import twint
import pandas as pd
import numpy as np
import re
from html.parser import HTMLParser
import itertools
from autocorrect import Speller
from textblob import TextBlob
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

# English stop words, loaded once so cleanTwt does not re-read the corpus per tweet.
cachedStopWords = stopwords.words('english')

# English spell-checker instance; cleanTwt calls it on the whole tweet text.
spell = Speller(lang='en')


PREPROCESSING - function that prepares tweets for sentiment analysis

In [4]:
def cleanTwt(t):
    """Normalise the text of one tweet so it can be scored for sentiment.

    The steps run in this order: strip @mentions, '#' signs, the leftover
    retweet prefix and URLs; decode HTML entities; turn line breaks into
    spaces; split CamelCase runs into separate words; lower-case; cap any
    run of a repeated character at two; spell-correct; drop English stop words.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet as lower-case words separated by single spaces.
    """
    # Local import: the notebook's import cell only brings in html.parser.
    import html

    t = re.sub(r'@[A-Za-z0-9_]+', '', t)  # remove mentions
    t = re.sub(r'#', '', t)  # drop the '#' sign but keep the hashtag text
    t = re.sub(r'RT : ', '', t)  # what is left of "RT @user: " once the mention is gone
    # Match up to the next whitespace so links containing '-', '_', '?', '=' etc.
    # are removed whole instead of leaving a fragment behind.
    t = re.sub(r'https?://\S+', '', t)
    # HTMLParser.unescape was removed in Python 3.9; html.unescape is its replacement.
    t = html.unescape(t)  # decode entities such as &amp;
    # Replace line breaks with a space (not ''), otherwise the last word of one
    # line is glued to the first word of the next.
    t = re.sub(r'\n', ' ', t)
    t = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)", t) if s])  # split joined words, e.g. "BitcoinMining"
    t = t.lower()
    # Standardise elongated words: no character may appear more than twice in a row.
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t))
    t = spell(t)  # spellcheck
    # split()/join also collapses any whitespace left by the removals above.
    t = ' '.join([word for word in t.split() if word not in cachedStopWords])  # remove stop words
    return t
    

FEATURE EXTRACTION AND SELECTION
 
get subjectivity and polarity of tweets

In [5]:
def getSub(t):
    """Return the TextBlob subjectivity score of the text ``t``."""
    sentiment = TextBlob(t).sentiment
    return sentiment.subjectivity
 
def getPol(t):
    """Return the TextBlob polarity score of the text ``t``."""
    sentiment = TextBlob(t).sentiment
    return sentiment.polarity
    

CLASSIFICATION function

of tweets as Negative, Positive, or Neutral

In [6]:
def getSen(val):
    """Map a polarity score to a sentiment class.

    Returns 1 for a positive score, -1 for a negative score and 0 for
    anything else (a score of exactly zero, or a value that compares as
    neither above nor below zero).
    """
    if val > 0:
        return 1  # Positive
    if val < 0:
        return -1  # Negative
    return 0  # Neutral


Dictionary of people and institutions that influence Bitcoin

In [7]:
# Maps a display name (used for the output file names) to the Twitter handle
# that is scraped for that person or institution.
twitter_accounts = {
    'Elon_Musk': 'elonmusk',
    'Gabriel_Makhlouf': 'makhloufgabs',
    'European_Central_Bank': 'ecb',
    'Janet_Yellen': 'SecYellen',
    'Raj_Dhamodharan': 'raj_dhamodharan',
    'Bank_of_New_York_Mellon': 'BNYMellon',
    'Michael_Saylor': 'michael_saylor',
}


In [None]:
# for real_name, username in twitter_accounts.items():  # for testing
#     print(real_name, '-', username)

Main loop that produces CSV files with the sentiment analysis of tweets

In [8]:
# For every account: scrape its tweets mentioning "bitcoin" into a raw CSV,
# score each tweet, and write one "<real_name>.csv" holding the summed
# sentiment value per date.
for real_name, username in twitter_accounts.items():
    # Built once and used for both the scraper output and the read-back below.
    raw_csv = f"raw_tweets_{username}.csv"

    # Scraper configuration.
    config = twint.Config()
    config.Username = username  # from the twitter_accounts dict
    config.Lang = "en"
    config.Store_csv = True
    config.Search = "bitcoin"
    config.Custom["tweet"] = ["username", "date", "tweet"]  # columns written to the CSV
    config.Output = raw_csv  # raw tweets are stored here
    twint.run.Search(config)  # run search

    try:
        tweets = pd.read_csv(raw_csv)
    except FileNotFoundError:
        # No raw file means the search stored nothing for this user; leave an
        # empty marker file so the gap is visible next to the other outputs.
        file_name = f'!!{real_name}_no_bitcoin_tweets.txt'
        with open(file_name, 'w'):
            pass
        continue

    tweets['cleaned_tweets'] = tweets['tweet'].apply(cleanTwt)  # clean tweets for sentiment analysis

    tweets['subjectivity'] = tweets['cleaned_tweets'].apply(getSub)  # subjectivity and polarity of each tweet
    tweets['polarity'] = tweets['cleaned_tweets'].apply(getPol)

    tweets['sentiment'] = tweets['polarity'].apply(getSen)  # -1 negative, 0 neutral, 1 positive

    # Keep only date and sentiment, selected and renamed by name. The previous
    # positional `drop(..., 1)` is rejected by pandas 2.x, and assigning
    # `df.columns` by position breaks if the CSV has any other column.
    daily = tweets[['date', 'sentiment']].rename(
        columns={'date': 'Date', 'sentiment': 'Value'}
    )

    daily = daily.groupby(['Date'], as_index=False).sum()  # one row per date, values summed

    file_csv = real_name + ".csv"  # per-person CSV with 'Date' and 'Value' columns
    # The default index column is still written, so the file layout is unchanged.
    daily.to_csv(file_csv, encoding='utf-8')


1404132183254523905 2021-06-13 17:42:54 +0000 <elonmusk> @Cointelegraph This is inaccurate. Tesla only sold ~10% of holdings to confirm BTC could be liquidated easily without moving market.  When there’s confirmation of reasonable (~50%) clean energy usage by miners with positive future trend, Tesla will resume allowing Bitcoin transactions.
1401091921746006017 2021-06-05 08:21:59 +0000 <elonmusk> @lexfridman @VitalikButerin @ethereum @Bitcoin @dogecoin @ShibainuCoin @IOHK_Charles @Cardano @chainlink @0xPolygon I pretty much agree with Vitalik
1400620080090730501 2021-06-04 01:07:04 +0000 <elonmusk> #Bitcoin 💔  https://t.co/lNnEfMdtJf
1396914548167233537 2021-05-24 19:42:36 +0000 <elonmusk> Spoke with North American Bitcoin miners. They committed to publish current &amp; planned renewable usage &amp; to ask miners WW to do so. Potentially promising.
1395472799020421120 2021-05-20 20:13:36 +0000 <elonmusk> @TeslaGong @wintonARK Bitcoin hashing (aka mining) energy usage is starting to ex

  


[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
1392076383199891463 2021-05-11 11:17:27 +0000 <ecb> Panetta: All financial activities should be aligned with climate objectives, including crypto-assets. Bitcoin alone consumes more electricity than the Netherlands. Controlling and limiting the environmental impact of crypto-assets should be part of the global discussion 3/3
1211646226862673922 2019-12-30 13:52:26 +0000 <ecb> We like to speak to you through a wide range of channels. This year our Chief Economist Philip R. Lane and his predecessor Peter Praet answered questions live on Twitter about central bank independence, bitcoin and how they prepare for Governing Council meetings.  https://t.co/vkRuIJ9XPE
1159378394972413952 2019-08-08 08:18:44 +0000 <ecb> (THREAD) What characterises crypto-assets such as #bitcoin? What risks do they pose to the financial system? And how can publicly available data help the ECB monitor this phenomenon? Read on and see 