In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the tweets CSV inside the read-only Kaggle input directory.
file_path = '../input/bitcoin-tweets/Bitcoin_tweets.csv'

# A comma is read_csv's default delimiter, so it does not need to be spelled out.
df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

In [None]:
# Keep only tweets posted from three Twitter clients:
# the web app, the Android app and the iPhone app.
sources = ['Twitter Web App', 'Twitter for Android', 'Twitter for iPhone']

# Strip surrounding whitespace before matching so that a stray space in either
# the raw data or the list above cannot silently exclude a whole client.
# .copy() detaches the result from the original frame so that the column
# assignment in the next cell does not raise SettingWithCopyWarning.
df = df[df['source'].str.strip().isin(sources)].copy()

In [None]:
# Encode user_verified as an integer flag: 1 when the value reads 'True', 0 for anything else.
def func(x):
    """Return 1 if x equals the string 'True', otherwise 0."""
    if x == 'True':
        return 1
    return 0

# Every value is converted to its string form first, then turned into the flag.
df['user_verified'] = df['user_verified'].map(str).map(func)

In [None]:
# Count how many tweets carry each value of the user_verified flag (1 or 0).
df.user_verified.value_counts()

In [None]:
# Drop these four columns from the frame.
# Rebinding df (rather than inplace=True) avoids mutating a filtered frame in
# place, which can raise SettingWithCopyWarning and brings no performance gain.
df = df.drop(columns=['is_retweet', 'user_friends', 'user_favourites', 'source'])

----

### Let's explore the user descriptions, tweets and hashtags, limiting the exploration to verified users

In [None]:
# Work on verified accounts only. .copy() yields an independent frame, so the
# columns added to df_verified in later cells do not trigger
# SettingWithCopyWarning.
df_verified = df[df.user_verified == 1].copy()
cols = ['user_description', 'text', 'hashtags']

# Show full cell contents instead of truncating long text columns.
pd.set_option('max_colwidth', None)

# df_verified already contains verified users only, so no second filter is needed.
df_verified[cols]

In [None]:
# Let's preprocess the tweets

import re 
import nltk

# Start from NLTK's English stopwords and extend them with common crypto words
# that don't add much information.
stopwords = nltk.corpus.stopwords.words('english')

# 'ethereum' is the correct spelling; the original misspelling 'etherum' is kept
# as well in case it occurs in the tweets.
crypto_words = ['btc', 'bitcoin', 'eth', 'etherum', 'ethereum', 'crypto']

stopwords = stopwords + crypto_words

def preprocess_tweet(tweet, stopwords):
    """Normalise a raw tweet into a string of lowercase alphanumeric tokens.

    Steps: lowercase, drop stopwords, remove @mentions, URLs and every
    character that is not an ASCII letter, digit, space or tab, then drop
    stopwords again and collapse whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty string instead of raising.
    stopwords : iterable of str
        Lowercase words to remove. A list, set or frozenset is accepted.

    Returns
    -------
    str
        The cleaned tweet, tokens separated by single spaces.
    """
    if not isinstance(tweet, str):
        return ''

    # Set membership is O(1); reuse the caller's set when one is passed in.
    stopword_set = stopwords if isinstance(stopwords, (set, frozenset)) else set(stopwords)

    def drop_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stopword_set)

    # First pass while punctuation is intact, so that stopwords containing an
    # apostrophe (e.g. "don't") still match as a whole token.
    tweet = drop_stopwords(tweet.lower())

    # Raw string avoids invalid escape sequences. "@\w+" also consumes
    # underscores, so a handle such as @john_doe is removed entirely instead
    # of leaving "doe" behind.
    tweet = re.sub(r"(@\w+)|(\w+://\S+)|([^0-9A-Za-z \t])", " ", tweet)

    # Second pass: stripping punctuation exposes stopwords that were attached
    # to symbols ("#bitcoin", "btc,"), which the first pass could not match.
    return drop_stopwords(tweet)

In [None]:
# Store the cleaned version of every tweet in a new column.
df_verified['preprocess_tweets'] = df_verified['text'].apply(preprocess_tweet, stopwords=stopwords)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def words_cloud(df, col):
    """Display a word cloud built from every value of df[col].

    Each value is converted to str and all values are joined into one text.
    The module-level ``stopwords`` list is passed to WordCloud, and at most
    70 words are drawn.

    Parameters
    ----------
    df : DataFrame-like
        Object indexable by ``col``.
    col : str
        Name of the column holding the text.
    """
    text = ' '.join(str(comment) for comment in df[col])

    wordcloud = WordCloud(stopwords=stopwords, width=800, height=400,
                          background_color="white", max_words=70).generate(text)

    # Create the figure with its size before drawing: changing
    # rcParams['figure.figsize'] after imshow only affects later figures.
    plt.figure(figsize=(20, 20))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Draw the word cloud of the preprocessed tweets in df_verified.
words_cloud(df_verified, 'preprocess_tweets')

In [None]:
import itertools

def vocab(df, col, nb_words, stopwords):
    """Return the nb_words most frequent non-stopword tokens of df[col].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of a column of whitespace-separated tokens.
    nb_words : int
        Maximum number of words to return.
    stopwords : iterable of str
        Tokens to exclude from the result.

    Returns
    -------
    dict
        Token -> occurrence count, ordered from most to least frequent.
        Holds fewer than nb_words entries only when the column contains
        fewer distinct non-stopword tokens.
    """
    stopword_set = set(stopwords)

    # One row per token; empty or missing texts become NaN, which
    # value_counts ignores. This avoids the wide temporary frame that
    # split(expand=True) builds.
    counts = df[col].str.split().explode().value_counts()

    # Filter stopwords BEFORE truncating, so that stopwords among the most
    # frequent tokens do not eat into the nb_words requested.
    counts = counts[~counts.index.isin(stopword_set)]

    return counts.head(nb_words).to_dict()
   
   
   
def plot_words(vocab):
    """Draw a bar chart of word frequencies.

    Parameters
    ----------
    vocab : dict
        Word -> count mapping; bars are drawn in the mapping's iteration order.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    ax.bar(list(vocab.keys()), list(vocab.values()), width=0.3, color='g')

    # Categorical bars are centred on 0..n-1; pad half a slot on each side so
    # the first bar is not cut in half. max(..., 1) avoids identical limits
    # when vocab is empty.
    ax.set_xlim(-0.5, max(len(vocab), 1) - 0.5)
    ax.tick_params(axis='x', labelrotation=90, labelsize=14)

    # Show only once everything has been drawn.
    plt.show()

In [None]:
# Bar chart of the most frequent non-stopword tokens (40 requested) in the preprocessed tweets.
plot_words(vocab(df_verified, 'preprocess_tweets', 40, stopwords))

## Sentiment Analysis

Let's use a pretrained sentiment-analysis model from the Hugging Face `transformers` library.

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly so results stay reproducible if the library
# changes its default. This is the model the 'sentiment-analysis' pipeline
# falls back to when none is given.
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Sanity-check the classifier on a single example tweet before applying it to the whole column.
classifier('Ark Invest believes Tesla’s purchase of billions in bitcoin is a tipping point for the digital asset as it relates')

In [None]:
def get_sentiment_score(tweet):
    """Return the 'score' field of the classifier's first prediction for one tweet."""
    return classifier(tweet)[0]['score']

def get_sentiment_label(tweet):
    """Return the 'label' field of the classifier's first prediction for one tweet."""
    return classifier(tweet)[0]['label']

# Run the model once per tweet and read both fields from the same prediction.
# Mapping the two helpers above over the column separately would run inference
# twice for every tweet. The helpers are kept for single-tweet use.
_tweets = df_verified['preprocess_tweets'].tolist()
_predictions = classifier(_tweets) if _tweets else []

df_verified['sentiment_score'] = [prediction['score'] for prediction in _predictions]
df_verified['sentiment_label'] = [prediction['label'] for prediction in _predictions]

In [None]:
# Show the preprocessed tweets next to their sentiment score and label.
df_verified[['preprocess_tweets','sentiment_score','sentiment_label']]

In [None]:
# Summary statistics of the sentiment_score column.
df_verified.sentiment_score.describe()

In [None]:
# Share of each sentiment label (normalize=True returns proportions instead of counts).
df_verified.sentiment_label.value_counts(normalize=True)