In [None]:
import re
import nltk
import tweepy
import numpy as np
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter


In [None]:
# Twitter handles to analyse, grouped by category. Each handle is read from a
# CSV file named "<handle>.csv" inside the folder of its category.
influencer = ['elonmusk', 'VitalikButerin', 'BillyM2k']
news = ['BBCBreaking', 'TIME', 'cnnbrk', 'WSJ', 'washingtonpost', 'nytimes', 'BBCWorld', 'TheEconomist', 'Reuters']
cryptocurrency = ['binance', 'CoinDesk', 'crypto', 'ForbesCrypto', 'dogecoin', 'dogecoin_devs', 'ethereum', 'Bitcoin', 'BTCTN']

# Load every CSV into a module-level DataFrame named after the handle
# (e.g. `elonmusk`); the preprocessing cell looks the frames up via globals().
for path, accounts in (
    ('/content/drive/MyDrive/2021-W_IAB/influencer/', influencer),
    ('/content/drive/MyDrive/2021-W_IAB/news/', news),
    ('/content/drive/MyDrive/2021-W_IAB/cryptocurrency/', cryptocurrency),
):
    for account in accounts:
        globals()[account] = pd.read_csv(path + account + '.csv')


In [None]:
# Clean The Data
def cleantext(text):
    """Strip Twitter-specific noise from a single tweet string.

    Removes, in this order: @mentions, the '#' symbol (the tag word itself is
    kept), the retweet marker "RT" with its trailing whitespace, and
    http/https links.

    Args:
        text: Raw tweet text (str).

    Returns:
        The text with the patterns above deleted. Whitespace left behind by a
        removal is not collapsed or trimmed.
    """
    # Handles may contain underscores (e.g. @dogecoin_devs), so '_' must be part
    # of the character class; otherwise a dangling "_devs" is left in the text.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    # Drop only the hash sign so the hashtag word still counts as a token.
    text = re.sub(r"#", "", text)
    # \b restricts the match to a standalone "RT" token; without it the tail of
    # upper-case words such as "SMART " or "START " would be eaten as well.
    text = re.sub(r"\bRT\s+", "", text)
    # Remove hyperlinks up to the next whitespace.
    text = re.sub(r"https?:\/\/\S+", "", text)
    return text
    
# Preprocessing Text Data
def preprocessing(text):
    """Rebuild one-line tweets from multi-line scraped text entries.

    Each entry is split on newlines. Entries that split into exactly three
    lines are skipped altogether. For every other entry a line is kept only
    when it is non-empty, does not start with '이', and does not end with a
    digit or one of the characters '천만년월일수글기과다자표'
    (presumably scraper metadata such as dates and counters - confirm against
    the raw CSVs). Kept lines are joined with single spaces and trimmed.

    Args:
        text: Iterable of strings, one per scraped entry.

    Returns:
        List of rebuilt tweet strings; an entry whose lines are all filtered
        out contributes an empty string.
    """
    rejected_endings = '천만년월일수글기과다자표0123456789'
    sentences = []
    for entry in text:
        lines = entry.split('\n')
        if len(lines) == 3:
            continue
        kept_lines = [
            line
            for line in lines
            if line != '' and line[0] != '이' and line[-1] not in rejected_endings
        ]
        sentences.append(' '.join(kept_lines).strip())
    return sentences


# Clean and flatten the scraped text of every account, processing influencers
# first, then news outlets, then cryptocurrency accounts. For each handle this
# creates two module-level names: `<handle>_text` (list of tweet strings) and
# `<handle>_df` (single-column DataFrame tagged with the handle in `.name`).
for account in influencer + news + cryptocurrency:
    account_raw = globals()[account]
    account_raw['Embedded_text'] = account_raw['Embedded_text'].apply(cleantext)

    account_tweets = preprocessing(account_raw['Embedded_text'])
    globals()[f'{account}_text'] = account_tweets

    account_frame = pd.DataFrame({'Tweets': account_tweets})
    account_frame.name = account  # ad-hoc attribute read later as the word-cloud title
    globals()[f'{account}_df'] = account_frame

# Handle -> DataFrame lookups, one dictionary per account group.
influencer_dfs = {account: globals()[f'{account}_df'] for account in influencer}
news_dfs = {account: globals()[f'{account}_df'] for account in news}
cryptocurrency_dfs = {account: globals()[f'{account}_df'] for account in cryptocurrency}

In [None]:
def calc_subj(tweet):
    """Return the TextBlob subjectivity score computed for ``tweet``."""
    analysis = TextBlob(tweet)
    return analysis.sentiment.subjectivity
 
def calc_pola(tweet):
    """Return the TextBlob polarity score computed for ``tweet``."""
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

def sentiment(polarity):
    """Map a numeric polarity score to a sentiment label.

    Args:
        polarity: Numeric polarity value.

    Returns:
        'Positive' when the score is above zero, 'Netural' when it is exactly
        zero, and 'Negative' for anything else.
    """
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        # NOTE(review): the label is spelled 'Netural' (sic); kept unchanged
        # because other code may compare against this exact string.
        return 'Netural'
    return 'Negative'


def add_sentiment(df):
    """Add sentiment columns to ``df`` in place.

    Reads the 'Tweets' column and writes three new columns:
    'Subjectivity' and 'Polarity' (per-tweet scores from ``calc_subj`` and
    ``calc_pola``) and 'Sentiment' (the label ``sentiment`` assigns to each
    polarity value).

    Args:
        df: DataFrame with a 'Tweets' column of strings.

    Returns:
        None; ``df`` is modified directly.
    """
    tweets = df['Tweets']
    df['Subjectivity'] = tweets.apply(calc_subj)
    df['Polarity'] = tweets.apply(calc_pola)
    df['Sentiment'] = df['Polarity'].apply(sentiment)


# elonmusk_df['Subjectivity'] = elonmusk_df.Tweets.apply(calc_subj)
# elonmusk_df['Polarity'] = elonmusk_df.Tweets.apply(calc_pola)
# # elonmusk_df['Sentiment'] = elonmusk_df.Polarity.apply(sentiment) 
# add_sentiment(elonmusk_df)
# elonmusk_df

In [None]:
# let's see how sentiment is distributed

def plot_sentiment(df):
    """Plot the sentiment distribution of ``df`` and return a summary table.

    Draws a two-panel figure: on the left a bar chart of how many rows carry
    each 'Sentiment' label, on the right a scatter of 'Polarity' against
    'Subjectivity'.

    Args:
        df: DataFrame with 'Sentiment', 'Polarity' and 'Subjectivity' columns.

    Returns:
        DataFrame indexed by sentiment label with a 'Count' column and a
        'Percentage' column (share of rows, rounded to whole percent).
    """
    fig, (ax_counts, ax_scatter) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: number of tweets per sentiment label.
    label_counts = df['Sentiment'].value_counts()
    label_counts.plot(kind='bar', color='red', ax=ax_counts)
    ax_counts.set_title('Sentiment Classification')
    ax_counts.set_ylabel('Count')

    # Right panel: one point per tweet.
    ax_scatter.scatter(df['Polarity'], df['Subjectivity'], color='red')
    ax_scatter.set_title('Sentiment Analysis')
    ax_scatter.set_xlabel('Polarity')
    ax_scatter.set_ylabel('Subjectivity')

    # value_counts() orders labels the same way with and without normalize,
    # so the percentages can be attached positionally.
    label_shares = df['Sentiment'].value_counts(normalize=True) * 100
    df_sentiment = pd.DataFrame({'Count': label_counts})
    df_sentiment['Percentage'] = [round(share) for share in list(label_shares)]
    return df_sentiment


# plot_sentiment(elonmusk_df)

In [None]:
# Next, let's see which words each account uses most often

# Fetch the NLTK stop-word corpus, then collect the English stop words into a
# set that the word cloud uses as its filter.
nltk.download('stopwords')
stpwrd = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

# word cloud
def Word_cloud(data, title, mask=None):
    """Render ``data`` as a word cloud and display it.

    Args:
        data: Single string containing all the text to visualise.
        title: Title shown above the figure.
        mask: Optional mask passed straight through to ``WordCloud``.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    # Fixed random_state keeps the layout the same between runs.
    cloud_builder = WordCloud(
        scale=3,
        random_state=21,
        colormap='autumn',
        mask=mask,
        stopwords=stpwrd,
        collocations=True,
    )
    cloud_image = cloud_builder.generate(data)

    plt.figure(figsize=(20, 12))
    plt.imshow(cloud_image)
    plt.axis('off')
    plt.title(title)
    plt.show()

# plot it
def plot_wordcloud(df):
    """Show a word cloud built from every tweet in ``df``.

    Args:
        df: DataFrame with a 'Tweets' column of strings and a ``name``
            attribute, which is used as the figure title.
    """
    # Merge all tweets into one space-separated string for WordCloud.
    corpus = ' '.join(df['Tweets'])
    Word_cloud(corpus, df.name)

# plot_wordcloud(elonmusk_df)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# influencer_dfs = dict(zip(influencer, [globals()[f'{account}_df'] for account in influencer]))
# news_dfs = dict(zip(news, [globals()[f'{account}_df'] for account in news]))
# cryptocurrency_dfs = dict(zip(cryptocurrency, [globals()[f'{account}_df'] for account in cryptocurrency]))

# Handle -> list of preprocessed tweet strings, one dictionary per group.
influencer_texts = {account: globals()[f'{account}_text'] for account in influencer}
news_texts = {account: globals()[f'{account}_text'] for account in news}
cryptocurrency_texts = {account: globals()[f'{account}_text'] for account in cryptocurrency}

# One combined mapping across all three groups.
all_texts = {**influencer_texts, **news_texts, **cryptocurrency_texts}

# Flat list of every tweet, plus a lower-cased copy for case-insensitive
# keyword matching.
all_texts_list = []
for text in all_texts.values():
    all_texts_list.extend(text)
all_texts_list_lower = [sentence.lower() for sentence in all_texts_list]


In [None]:

# Keywords matched case-insensitively
keywords_cryptocurrency = [
    'Cryptocurrency', 'Bitcoin', 'Ethereum', 'Coinbase', 'Doge', 'NFT', 'Binance',
]
keywords_news = [
    'Price', 'Market', 'Trading', 'Exchange', 'Invest', 'Wallet', 'Stock', 'Money',
    'Digital', 'Liquidity', 'Crisis', 'Tapering', 'Regulation', 'crackdown', 'plunge',
]
keywords_influencer = ['Reddit', 'Meme', 'SpaceX']
keywords_insensitive = [*keywords_cryptocurrency, *keywords_news, *keywords_influencer]

# Keywords matched case-sensitively
keywords_sensitive = ['DeFi', 'Fed', 'AMC', 'GME']


def key_count(texts: list) -> dict:
    """Count how often each whitespace-separated token occurs in ``texts``.

    Args:
        texts: List of sentence strings.

    Returns:
        Plain dict mapping each token to its number of occurrences, with keys
        in order of first appearance.
    """
    tally = Counter()
    for sentence in texts:
        tally.update(sentence.split())
    return dict(tally)


# (token, count) pairs for the whole corpus, most frequent first:
# `_i` is built from the lower-cased tweets, `_s` from the original casing.
allwords_frequency_i = sorted(
    key_count(all_texts_list_lower).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
allwords_frequency_s = sorted(
    key_count(all_texts_list).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# A keyword's frequency is the summed count of every token that contains it as
# a substring (so 'Doge' also picks up 'dogecoin').
keywords_frequency_i = {}
for word in keywords_insensitive:
    lowered = word.lower()
    keywords_frequency_i[word] = sum(
        count for token, count in allwords_frequency_i if lowered in token
    )

keywords_frequency_s = {}
for word in keywords_sensitive:
    keywords_frequency_s[word] = sum(
        count for token, count in allwords_frequency_s if word in token
    )

# keywords_frequency_i
# keywords_frequency_s


In [None]:
# results = []
# specific = [a for a in nytimes_text if 'crypto' in a]
# if len(specific) != 0:
#     for sentence in specific:
#         pol_score = SIA().polarity_scores(sentence) # run analysis
#         pol_score['sentence'] = sentence # add headlines for viewing
#         results.append(pol_score)
#     sentiment_df = pd.DataFrame(results)
#     mean_score = sentiment_df['compound'].mean()
# else:
#     mean_score = 0

# sentiment_df


In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# results = []

def _mean_compound(sentences, analyzer=None):
    """Return the mean VADER 'compound' score of ``sentences`` (0 if empty)."""
    if len(sentences) == 0:
        return 0
    if analyzer is None:
        # Build the analyzer once per call rather than once per sentence.
        analyzer = SIA()
    compound_scores = [
        analyzer.polarity_scores(sentence)['compound'] for sentence in sentences
    ]
    return pd.Series(compound_scores).mean()


def sentiment_score_s(keyword, texts=None, analyzer=None):
    """Mean compound sentiment of the tweets containing ``keyword`` (case-sensitive).

    Args:
        keyword: Substring to look for, matched with its exact casing.
        texts: Optional list of sentences to search. Defaults to the
            notebook-level ``all_texts_list``.
        analyzer: Optional object exposing ``polarity_scores(sentence)`` that
            returns a dict with a 'compound' entry. Defaults to a fresh
            ``SentimentIntensityAnalyzer``; pass one in to share it between calls.

    Returns:
        Mean 'compound' score over the matching sentences, or 0 when no
        sentence contains the keyword.
    """
    if texts is None:
        texts = all_texts_list
    specific = [sentence for sentence in texts if keyword in sentence]
    return _mean_compound(specific, analyzer)


def sentiment_score_i(keyword, texts=None, analyzer=None):
    """Mean compound sentiment of the tweets containing ``keyword`` (case-insensitive).

    Args:
        keyword: Substring to look for; casing is ignored.
        texts: Optional list of sentences to search. Defaults to the
            notebook-level ``all_texts_list_lower``.
        analyzer: Optional object exposing ``polarity_scores(sentence)`` that
            returns a dict with a 'compound' entry. Defaults to a fresh
            ``SentimentIntensityAnalyzer``; pass one in to share it between calls.

    Returns:
        Mean 'compound' score over the matching sentences, or 0 when no
        sentence contains the keyword.
    """
    if texts is None:
        # NOTE(review): by default the lower-cased tweets are what gets scored;
        # confirm that scoring lower-cased rather than original text is intended.
        texts = all_texts_list_lower
    needle = keyword.lower()
    # Lower-casing each sentence is a no-op for the default list and makes the
    # match case-insensitive for caller-supplied texts.
    specific = [sentence for sentence in texts if needle in sentence.lower()]
    return _mean_compound(specific, analyzer)

# Mean compound sentiment score per keyword (case-sensitive set first, then
# the case-insensitive set).
keywords_score_s = {keyword: sentiment_score_s(keyword) for keyword in keywords_sensitive}
keywords_score_i = {keyword: sentiment_score_i(keyword) for keyword in keywords_insensitive}

# Weight every keyword's score by how often the keyword occurs in the corpus.
total_score_i = sum(
    keywords_frequency_i[keyword] * keywords_score_i[keyword]
    for keyword in keywords_insensitive
)
total_score_s = sum(
    keywords_frequency_s[keyword] * keywords_score_s[keyword]
    for keyword in keywords_sensitive
)

total_frequency = sum(keywords_frequency_i.values()) + sum(keywords_frequency_s.values())

# Frequency-weighted average of the keyword scores. When no keyword occurs at
# all there is nothing to average, so report a neutral 0.0 instead of raising
# ZeroDivisionError.
if total_frequency:
    sentiment_index = (total_score_i + total_score_s) / total_frequency
else:
    sentiment_index = 0.0

sentiment_index

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


0.1842824764645486

In [None]:
keywords_score_i

{'Binance': 0.34750585874799317,
 'Bitcoin': 0.19700462214005024,
 'Coinbase': 0.21542004950495056,
 'Crisis': -0.5831675324675336,
 'Cryptocurrency': 0.1734257394084734,
 'Digital': 0.27021996849153207,
 'Doge': 0.39118876669285063,
 'Ethereum': 0.21105377813504833,
 'Exchange': 0.15917568602425025,
 'Invest': 0.054725876129601184,
 'Liquidity': 0.257896875,
 'Market': 0.17467612724757942,
 'Meme': 0.34800296296296307,
 'Money': 0.12798212137780235,
 'NFT': 0.31426872964169417,
 'Price': 0.1278562123039808,
 'Reddit': 0.3438477272727272,
 'Regulation': 0.11928305304010342,
 'SpaceX': 0.1682567415730338,
 'Stock': 0.20057065813528352,
 'Tapering': 0.1599730769230769,
 'Trading': 0.1842981735159817,
 'Wallet': 0.2392320588235294,
 'crackdown': -0.11713301587301601,
 'plunge': -0.14178160000000015}