# Installing requirements

In [None]:
# Python packages used by this notebook (aiohttp and pandas are pinned to exact versions).
!pip install googletransx
!pip install aiohttp==3.7.0
!pip install emoji
!pip install pytrends
!pip install pandas==0.25.1

# Install twint from a shallow clone of its GitHub repository, then return to the parent directory.
# NOTE(review): twint is installed twice (pip and setup.py) — confirm both steps are needed.
!git clone --depth=1 https://github.com/twintproject/twint.git
%cd twint
!pip3 install . -r requirements.txt
!python setup.py install
%cd ..

# Select the Java 8 runtime as the system-wide `java` alternative.
!sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java

In [None]:
# Print the metadata (version, install location, dependants) of the installed nltk package.
!pip show nltk

Name: nltk
Version: 3.2.5
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
Author-email: stevenbird1@gmail.com
License: Apache License, Version 2.0
Location: /usr/local/lib/python3.7/dist-packages
Requires: six
Required-by: textblob


In [None]:
# Archive the installed textblob and nltk package directories into zip files
# in the current working directory.
!zip -r "textblob.zip" "/usr/local/lib/python3.7/dist-packages/textblob"
!zip -r "nltk.zip" "/usr/local/lib/python3.7/dist-packages/nltk"

In [None]:
# import reqs
import nest_asyncio
import googletransx
import twint 
import pandas as pd
import os, sys
#import findspark
from textblob import TextBlob
#from pyspark.sql.functions import udf
import seaborn as sns
import re
import emoji
from tqdm import tqdm
import requests
import json
from pytrends.request import TrendReq
import numpy as np
import time
import requests
from pandas.core.algorithms import unique

# Environment paths for the Java / Spark installation (originally for the optimus requirements).
# NOTE(review): findspark is commented out and nothing visible here uses Spark — confirm these are still needed.
#findspark.init(spark_home="/content/spark-2.4.1-bin-hadoop2.7")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

# Disable pandas' chained-assignment warning so it is not printed to the console.
pd.options.mode.chained_assignment = None

# Patch asyncio via nest_asyncio to avoid event-loop errors when running inside the notebook.
nest_asyncio.apply()

# Scraping tweets using Twint (no API needed)

In [None]:
# some useful functions for twint
def available_columns():
    """Return the column index of twint's pandas output frame (`twint.output.panda.Tweets_df`)."""
    tweets_frame = twint.output.panda.Tweets_df
    return tweets_frame.columns

def twint_to_pandas(columns):
    """Return the requested `columns` of twint's pandas output frame (`twint.output.panda.Tweets_df`)."""
    tweets_frame = twint.output.panda.Tweets_df
    selected = tweets_frame[columns]
    return selected

# Class for disabling printings in console
class HiddenPrints:
    """Context manager that redirects `sys.stdout` to the null device.

    Everything printed inside the `with` block is discarded. On exit the
    stream that was active on entry is restored, even if the block raised.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle so __exit__ closes exactly the file opened here,
        # not whatever `sys.stdout` happens to point to at exit time.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing the handle fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [None]:
# function to get sentiment 
def apply_blob(sentence, threshold=0.25):
    """Score one sentence with TextBlob and bucket its polarity.

    Args:
        sentence: text to analyse.
        threshold: absolute polarity at or above which the sentence is
            labelled positive (1.0) or negative (-1.0); otherwise neutral (0.0).

    Returns:
        Tuple `(label, polarity, subjectivity)`, where `polarity` and
        `subjectivity` are the first and second items of
        `TextBlob(sentence).sentiment`. `label` is always a float so the
        three classes share one type.
    """
    polarity, subjectivity = TextBlob(sentence).sentiment
    if polarity >= threshold:
        label = 1.0   # Positive
    elif polarity <= -threshold:
        label = -1.0  # Negative
    else:
        label = 0.0   # Neutral
    return label, polarity, subjectivity

# tweets processing + getting sentiments for each tweet
def tweets_sentiment(df_pd):
    """Clean the tweet texts and add sentiment features to the frame.

    The frame is modified in place and also returned. When it holds no
    non-null tweets it is returned unchanged (no feature columns added).

    Cleaning steps, in order: replace links with a space, drop the characters
    ``@$%0-9:/.;=*``, turn emoji into words, collapse runs of spaces, and
    replace underscores with spaces.

    Added columns: `sentiment` (raw polarity), `sentiment_round` (label from
    `apply_blob`), `objectivity` (1 - subjectivity) and `sent_obj_prod`
    (polarity * objectivity).

    Args:
        df_pd: DataFrame with a string `tweet` column.

    Returns:
        The same DataFrame object.
    """
    if df_pd["tweet"].count() == 0:
        return df_pd

    # Link matcher written as one pattern: when the pattern is spread over
    # several lines without re.VERBOSE, the line breaks and indentation become
    # literal characters and e.g. "www." links are never matched.
    link_pattern = re.compile(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'''
        r'''(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'''
        r'''(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    )

    def clean(text):
        # remove links
        text = link_pattern.sub(" ", text)
        # remove unwanted chars
        text = re.sub('[@$%0123456789:/.:;=*]', '', text)
        # convert emoji into words
        text = emoji.demojize(text, delimiters=("", " "))
        # remove multiple spaces
        text = re.sub(' +', ' ', text)
        # remove _
        return re.sub('_', ' ', text)

    # Assign the whole column at once; writing through df["tweet"].iloc[i] is
    # chained indexing and may silently update a temporary copy.
    df_pd["tweet"] = [clean(text) for text in df_pd["tweet"]]

    scores = [apply_blob(text) for text in df_pd["tweet"]]

    df_pd["sentiment"] = [polarity for _, polarity, _ in scores]
    df_pd["sentiment_round"] = [label for label, _, _ in scores]
    df_pd["objectivity"] = [1 - subjectivity for _, _, subjectivity in scores]
    df_pd["sent_obj_prod"] = [polarity * (1 - subjectivity)
                              for _, polarity, subjectivity in scores]

    return df_pd


def tweets_collect(keyword, limit=1, min_likes=1000, min_replies=500, 
                   min_retweets=50, popular_tweets=True, hide_output=True, 
                   since=None, until=None):
    """Run one twint search and return the English tweets as a DataFrame.

    Args:
        keyword: search string (`Config.Search`).
        limit: number of tweets to search for (`Config.Limit`).
        min_likes, min_replies, min_retweets: minimum interaction counts.
        popular_tweets: popular vs. recent tweets (`Config.Popular_tweets`).
        hide_output: value for `Config.Hide_output`; stdout is additionally
            silenced with `HiddenPrints` while the search runs.
        since, until: optional bounds, format '2016-05-02 00:00:00'; each is
            only set on the config when it is not None.

    Returns:
        DataFrame with columns date, username, tweet, nlikes, nreplies,
        nretweets. An empty frame with the same columns is returned when
        twint's output frame has no columns (no tweets found).
    """
    result_columns = ["date", "username", "tweet", "nlikes", "nreplies", 
                      "nretweets"]

    # creating object for search config
    c = twint.Config()

    # specifying tweets language
    c.Lang = "en"

    # entering keywords to search for in tweets
    c.Search = keyword

    # number of tweets to search
    c.Limit = limit

    # adding conditions on interaction counts (Like, Retweet, Reply)
    c.Min_likes = min_likes
    c.Min_replies = min_replies
    c.Min_retweets = min_retweets

    # get popular / recent tweets
    c.Popular_tweets = popular_tweets

    # adding date period for tweets searching
    if since is not None:
        c.Since = since
    if until is not None:
        c.Until = until

    # no writing in console
    c.Hide_output = hide_output

    # store results in twint's pandas output frame
    c.Pandas = True

    # running the scrape with stdout silenced
    with HiddenPrints():
        twint.run.Search(c)

    # in case 0 tweets found: same schema as the non-empty result below
    if len(available_columns()) == 0:
        return pd.DataFrame(columns=result_columns, dtype=object)

    # Transform tweets to pandas DF; "language" is needed only for filtering
    df_pd = twint_to_pandas(result_columns + ["language"])

    # Saving only English tweets
    df_pd = df_pd[df_pd.language == "en"]

    return df_pd.drop(['language'], axis=1)

Functions for scraping tweets over a date range and aggregating them into daily features

In [None]:
def tweet_data_collect(keyword, since, until, limit = 100, min_likes=0, 
                       min_replies=0, min_retweets=0, popular_tweets=True, 
                       n_rescans = 5, n_retries=10):
    """Collect tweets for every day between `since` and `until`.

    The period is split into calendar days. Each day is searched `n_rescans`
    times and the scan that found the most tweets is kept. A failing search
    is retried up to `n_retries` times with exponential back-off
    (2, 4, 8, ... seconds).

    Args:
        keyword: search string passed to `tweets_collect`.
        since, until: period bounds accepted by `pd.date_range`.
        limit, min_likes, min_replies, min_retweets, popular_tweets:
            forwarded to `tweets_collect`.
        n_rescans: number of scans per day.
        n_retries: number of retries after a failed search.

    Returns:
        DataFrame with all collected tweets. If the retries are exhausted,
        collection stops and the tweets gathered so far are returned, so the
        result is always a DataFrame and earlier days are not lost.
    """
    # one entry per calendar day of the requested period
    daterange = pd.date_range(since, until, freq='D').strftime('%Y-%m-%d')

    # Result schema. tweets_collect drops "language" from non-empty results,
    # so this column carries no values for collected tweets.
    column_names = ["date", "username", "tweet", "nlikes", "nreplies", 
                    "nretweets", "language"]
    # Frames are gathered in a list and concatenated once at the end instead
    # of re-copying the growing result on every day.
    day_frames = [pd.DataFrame(columns = column_names, dtype=object)]

    for day in tqdm(daterange):

        since_cur = day + ' 00:00:00'
        until_cur = day + ' 23:59:59'
        best_df = pd.DataFrame()
        best_count = 0
        blocked = False

        for _ in range(n_rescans):

            temp_df = None

            for attempt in range(n_retries + 1):
                try:
                    temp_df = tweets_collect(keyword = keyword, limit=limit, 
                                             min_likes=min_likes, 
                                             min_replies=min_replies, 
                                             min_retweets=min_retweets, 
                                             popular_tweets=popular_tweets, 
                                             since=since_cur, 
                                             until=until_cur)
                except Exception:
                    if attempt >= n_retries:
                        break
                    sleep_time = 2 * 2 ** attempt
                    print(f'Waiting {sleep_time:.0f} seconds', flush=True)
                    time.sleep(sleep_time)
                else:
                    break

            if temp_df is None:
                # every attempt failed: give up, but keep what we have
                print('Blocked by twitter still')
                blocked = True
                break

            # if found more data than before -> save more data
            n_found_tweets = temp_df['tweet'].count()
            if n_found_tweets > best_count:
                best_df = temp_df
                best_count = n_found_tweets

        day_frames.append(best_df)
        if blocked:
            break

    return pd.concat(day_frames)


def tweet_feature_collect(pd_df):
    """Aggregate per-tweet rows into one feature row per calendar day.

    Every day from the earliest to the latest tweet date gets a row, including
    days without tweets. Tweets are assigned to days by the calendar date of
    their timestamp, so tweets at exactly 00:00:00 or 23:59:59 are counted and
    the last day is never dropped.

    Args:
        pd_df: DataFrame with columns date, username, tweet, nlikes, nreplies,
            nretweets, sentiment, sentiment_round, objectivity, sent_obj_prod.
            `date` must be parseable by `pd.to_datetime`.

    Returns:
        DataFrame with a `date` column ('YYYY-MM-DD') and the features:
        day_* (sums), mean_* (rounded means), tweets_vol, unique_users_num,
        interactivity ((likes + replies + retweets) / tweets), and the mean /
        sum sentiment and objectivity metrics. For days without tweets the
        counts are 0 and the ratio / sentiment features are NaN. An empty
        input yields an empty frame with the same columns.
    """
    feature_names = ['day_likes', 'day_replies', 'day_retweets',
                     'mean_likes', 'mean_replies', 'mean_retweets',
                     'tweets_vol', 'unique_users_num', 'interactivity',
                     'mean_sentiment_round', 'mean_sentiment', 'sum_sentiment',
                     'mean_objectivity', 'mean_sent_obj_prod']
    result_columns = ['date'] + feature_names

    if len(pd_df) == 0 or pd_df['date'].count() == 0:
        return pd.DataFrame(columns=result_columns)

    # Calendar day (midnight) of every tweet. Normalising before building the
    # range keeps the final day even when the first tweet is later in the day
    # than the last one.
    tweet_days = pd.to_datetime(pd_df['date']).dt.normalize()
    daterange = pd.date_range(tweet_days.min(), tweet_days.max(), freq='D')

    rows = []
    for day in tqdm(daterange):

        # positional mask: independent of duplicated index labels
        df = pd_df[(tweet_days == day).values]
        n_tweets = df['tweet'].count()

        row = {'date': str(day.date())}

        if n_tweets == 0:
            for name in ('day_likes', 'day_replies', 'day_retweets',
                         'mean_likes', 'mean_replies', 'mean_retweets',
                         'tweets_vol', 'unique_users_num'):
                row[name] = 0
            for name in ('interactivity', 'mean_sentiment_round',
                         'mean_sentiment', 'sum_sentiment',
                         'mean_objectivity', 'mean_sent_obj_prod'):
                row[name] = np.nan
            rows.append(row)
            continue

        # total (sum) day interactivity params
        row['day_likes'] = df['nlikes'].sum()
        row['day_replies'] = df['nreplies'].sum()
        row['day_retweets'] = df['nretweets'].sum()

        # average day interactivity params
        row['mean_likes'] = round(df['nlikes'].mean())
        row['mean_replies'] = round(df['nreplies'].mean())
        row['mean_retweets'] = round(df['nretweets'].mean())

        # total day tweets num and unique users num
        row['tweets_vol'] = n_tweets
        row['unique_users_num'] = len(df['username'].unique())

        # interactions per tweet
        row['interactivity'] = (row['day_likes'] + row['day_replies'] +
                                row['day_retweets']) / n_tweets

        # day sentiment scores (both raw and rounded)
        row['mean_sentiment_round'] = df['sentiment_round'].mean()
        row['mean_sentiment'] = df['sentiment'].mean()
        row['sum_sentiment'] = df['sentiment'].sum()

        # subjectivity metrics
        row['mean_objectivity'] = df['objectivity'].mean()
        row['mean_sent_obj_prod'] = df['sent_obj_prod'].mean()

        rows.append(row)

    return pd.DataFrame(rows, columns=result_columns)

# Finally collecting tweets

In [None]:
# Search parameters for this run: keyword, period bounds and the `limit`
# forwarded to every per-day search.
keyword = 'flex coin'
#since = "2014-01-01"
#until = "2022-05-24"
since = "2021-01-01 00:00:00"
until = "2021-12-31 23:59:59"
limit = 10000

# Step 1: scrape the raw tweets for the whole period and save them.
tweet_df = tweet_data_collect(keyword, since, until, limit = limit, min_likes=0, 
                              min_replies=0, min_retweets=0, popular_tweets=True)

tweet_df.to_csv('tweets_flex_2021.csv')

# Step 2: clean the tweet texts and add the per-tweet sentiment columns.
df_result = tweets_sentiment(tweet_df)

df_result.to_csv('tweets_flex_sent_2021.csv')

# Step 3: aggregate the tweets into one feature row per day.
df_final = tweet_feature_collect(df_result)

df_final.to_csv('tweets_flex_features_2021.csv')

  0%|          | 0/365 [00:00<?, ?it/s]

Waiting 2 seconds
Waiting 4 seconds
Waiting 8 seconds
Waiting 16 seconds
Waiting 32 seconds
Waiting 64 seconds
Waiting 128 seconds
Waiting 256 seconds
Waiting 512 seconds
Waiting 1024 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
Waiting 2 seconds
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


Concatenating the per-year feature DataFrames

In [None]:
# Load the per-year feature files (2014 through 2022) and stack them into one frame.
tweets_all = pd.concat([pd.read_csv(f'tweets_features_{year}.csv', lineterminator='\n')
                        for year in range(2014, 2023)])

# NOTE(review): the inputs cover 2014-2022 but the output file name says
# 2014_2021 — confirm the intended name before renaming.
tweets_all.to_csv('tweets_features_2014_2021.csv')

Adding Fear / Greed market index features

In [None]:
# Url for Crypto Fear & Greed Index:
url = "https://api.alternative.me/fng/?limit=0"

# Making a get request; fail early on a hung connection or an HTTP error
# instead of trying to parse an error response below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Crypto fear and greed data into a dataframe:
fg_crypto_df = pd.DataFrame(response.json()['data'])

# Change timestamp values (seconds since epoch) into dates:
fg_crypto_df['timestamp'] = pd.to_datetime(pd.to_numeric(fg_crypto_df['timestamp']), unit="s")

# Change column names:
fg_crypto_df.columns = ['Value', 'Label', 'Date', 'Time Until Update']

# Change value into a numeric column:
fg_crypto_df['Value'] = pd.to_numeric(fg_crypto_df['Value'])

fg_crypto_df = fg_crypto_df.drop(['Time Until Update'], axis = 1)

# Keep only index values before the cut-off date.
fg_crypto_df = fg_crypto_df[fg_crypto_df.Date < pd.Timestamp('2022-05-14')]

# Attach the index to the tweet features by calendar date rather than by row
# position: positional alignment shifts every earlier row when either series
# has a missing day, and fails when the index history is longer than the
# tweet history.
fg_by_day = (fg_crypto_df
             .assign(day=fg_crypto_df['Date'].dt.strftime('%Y-%m-%d'))
             .drop_duplicates('day')
             .set_index('day'))
tweet_days = pd.to_datetime(tweets_all['date']).dt.strftime('%Y-%m-%d')

# Days without an index value get a blank label and a NaN value.
tweets_all['fear_greed_labels'] = tweet_days.map(fg_by_day['Label']).fillna('').values
tweets_all['fear_greed_values'] = tweet_days.map(fg_by_day['Value']).values

tweets_all.to_csv('tweets_features_2014_2022_fg.csv')