In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
def balance_donal(df, user_name, cleaned=False, n_keep=100, random_state=None):
    '''
    Downsample the tweets of one over-represented account.

    The current data have too many tweets posted by donald trump, pmarca and
    kaiynne. To balance the user_name classes we randomly keep `n_keep` of the
    target account's rows and drop the rest (most other accounts have around
    100 rows, hence the default).

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table. Rows are dropped by index label.
    user_name : str
        When `cleaned` is False: the value to match in the 'user_name' column.
        When `cleaned` is True: the name of the indicator column whose value
        is 1 on the target account's rows.
    cleaned : bool, default False
        Selects which of the two lookups above is used.
    n_keep : int, default 100
        Number of target rows to retain.
    random_state : int or None, default None
        Seed for a dedicated generator, for reproducible sampling. With None
        the global NumPy random state is used.

    Returns
    -------
    pandas.DataFrame
        The frame with the surplus target rows removed. If the target account
        has `n_keep` rows or fewer, `df` is returned unchanged.
    '''
    if cleaned:
        is_target = df[user_name] == 1
    else:
        is_target = df['user_name'] == user_name

    target_index = df.index[is_target]
    n_to_drop = len(target_index) - n_keep

    # Only sample when there is a surplus: a negative size makes
    # np.random.choice raise ValueError.
    if n_to_drop > 0:
        candidates = target_index.to_numpy()
        if random_state is None:
            drop_indices = np.random.choice(candidates, size=n_to_drop, replace=False)
        else:
            rng = np.random.default_rng(random_state)
            drop_indices = rng.choice(candidates, size=n_to_drop, replace=False)
        df = df.drop(drop_indices)

    # Report how many target rows remain and the resulting frame shape.
    if cleaned:
        remaining = len(df[df[user_name] == 1])
    else:
        remaining = len(df[df['user_name'] == user_name])
    print(remaining)
    print(df.shape)
    return df

In [None]:
# Metric columns to correlate against the other columns, keyed by the
# dataset type ('avg', 'gas', 'prices', 'transac').
selected_columns = dict(
    avg=['avg_fee', 'avg_value'],
    gas=['avg_gas_price_in_wei', 'avg_value_in_wei'],
    prices=['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap'],
    transac=['avg_count'],
)

def get_top10_corr(df, selected_columns, drop_columns):
    '''
    Show the correlation matrix of `df` and, for every selected column, plot
    and collect the (up to) ten columns most strongly correlated with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with each other.
    selected_columns : list of str
        Target columns. They are excluded from the candidate rows, so targets
        are never ranked against themselves or against each other.
    drop_columns : list of str
        Columns removed from `df` before the correlation is computed.

    Returns
    -------
    list
        Labels of the top-ranked columns for every target, concatenated in
        target order. A label appears once per target it ranks for, so the
        list may contain duplicates.
    '''
    corr_matrix = df.drop(columns=drop_columns).corr()
    display(corr_matrix.style.background_gradient(cmap='coolwarm'))

    # Keep the target columns only, drop the target rows (self-correlations),
    # and rank on magnitude so strong negative correlations count as well.
    strength = corr_matrix[selected_columns].drop(index=selected_columns).abs()

    top_labels = []
    for target in selected_columns:
        strongest = strength[target].sort_values(ascending=False).head(10)
        top_labels += strongest.index.tolist()

        # One bar chart per target column.
        plt.figure()
        strongest.plot(kind='bar')
        plt.title(f'Top 10 Correlations with {target}')
        plt.tight_layout()
        plt.show()

    return top_labels
#get_top10_corr(merged_df, selected_columns)

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Load the raw tweets and keep only those posted after 2020-01-01.
tweet = pd.read_csv("../datasets/tweets/tweets_all2.csv")
tweet['date'] = pd.to_datetime(tweet['date'])
tweet = tweet[tweet.date > datetime(2020,1,1)]

# Downsample the three over-represented accounts.
# NOTE: by default balance_donal samples with the global NumPy random state,
# so the retained rows differ between runs unless a seed is set beforehand.
for heavy_user in ('realDonaldTrump', 'pmarca', 'kaiynne'):
    tweet = balance_donal(tweet, heavy_user)

# Tweets per account after balancing. The counts are computed once so the bar
# labels (x) and heights (y) come from the same Series; the frame is already
# date-filtered above, so no second filter is needed.
tweets_per_user = tweet.user_name.value_counts()
fig = go.Figure(data=[
    go.Bar(x=tweets_per_user.index, y=tweets_per_user.values)
])

fig.show()

# Analyse the correlation between blockchain fees and whether a particular user_name has tweeted

In [None]:
chains = ['bit', 'doge', 'eth', 'polygon', 'solana', 'optimisim', 'tron', 'fanthom', 'cronos', 'avalanche', 'arbitrum']
types = ['gas', 'transac', 'avg']

dfs = []       # every network frame that was loaded, in listing order
df_names = []  # matching file names without their extension

directory = '../datasets/network/network_data'

# Cleaned tweets: the three over-represented accounts are balanced through
# their 'user_name_<account>' indicator columns (cleaned=True).
tweet = pd.read_csv("../datasets/tweets/cleaned_tweets2.csv")
tweet.drop(columns=['Unnamed: 0'], inplace=True)
# Keep only the first 10 characters of the timestamp before parsing,
# so tweets can be joined to the network data on 'date'.
tweet['date'] = pd.to_datetime(tweet['date'].map(lambda x : x[:10]))
tweet = tweet[tweet['date'] > datetime(2020, 1, 1)]
for indicator_column in ('user_name_realDonaldTrump', 'user_name_pmarca', 'user_name_kaiynne'):
    tweet = balance_donal(tweet, indicator_column, True)

# How often each column lands in a top-10 correlation ranking.
# Later cells keep adding to this dict.
top_influecers = {}

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue

    print(filename)
    df = pd.read_csv(file_path, index_col=False)
    df.drop(columns=['Unnamed: 0'], inplace=True)
    df['date'] = pd.to_datetime(df['date'])

    dataset_name = os.path.splitext(filename)[0]
    dfs.append(df)
    df_names.append(dataset_name)

    merged_df = pd.merge(df, tweet, on=['date'], how='left')
    # Dates without a matching tweet row come out of the left join as NaN;
    # fill them with 0.
    merged_df.fillna(0, inplace=True)

    # The second '_'-separated token of the file name selects the target
    # columns, e.g. 'avg', 'gas' or 'transac'.
    data_type = dataset_name.split("_")[1]

    # get_top10_corr computes the correlation matrix itself, so it is not
    # computed separately here.
    total = get_top10_corr(merged_df, selected_columns[data_type], ['date'])
    for user in total:
        top_influecers[user] = top_influecers.get(user, 0) + 1

In [None]:
# Display the dataset names accumulated in `df_names` by the most recently run loading loop.
df_names

# Analyse the correlation between prices and whether a particular user_name has tweeted

In [None]:
chains = ['bit', 'doge', 'eth', 'polygon', 'solana', 'optimisim', 'tron', 'fanthom', 'cronos', 'avalanche', 'arbitrum']
types = ['gas', 'transac', 'avg']

dfs = []       # every price frame that was loaded, in listing order
df_names = []  # matching file names without their extension

directory = '../datasets/network/prices'

# NOTE: this cell reuses `tweet` (cleaned and balanced tweets) and keeps adding
# to `top_influecers`; both are created by the fee-correlation cell, which
# must therefore be run first.

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue

    print(filename)
    df = pd.read_csv(file_path, index_col=False)
    # Price files carry their day in 'Start'; expose it as 'date' for the join.
    df['date'] = pd.to_datetime(df['Start'])
    dfs.append(df)
    df_names.append(os.path.splitext(filename)[0])

    merged_df = pd.merge(df, tweet, on=['date'], how='left')
    # Dates without a matching tweet row come out of the left join as NaN;
    # fill them with 0.
    merged_df.fillna(0, inplace=True)

    total = get_top10_corr(merged_df, selected_columns['prices'], ['date', 'Start', 'End'])
    for user in total:
        top_influecers[user] = top_influecers.get(user, 0) + 1


# Correlation between one-day changes in prices / blockchain fees and tweets made by a user_name

In [None]:
import os

directory = '../datasets/network/merged'

# NOTE: this rebinds `selected_columns` without the 'prices' key, so the
# price-correlation cell (which reads selected_columns['prices']) cannot be
# re-run after this cell without first re-running the original definition.
selected_columns = { 'avg' : ['avg_fee', 'avg_value'], 'gas' : ['avg_gas_price_in_wei', 'avg_value_in_wei'],
                     'transac' : ['avg_count']}

drop_columns = {'avg' : ['date'], 'gas' : ['date'], 'transac' : ['date']}

# NOTE(review): the original comment said this directory holds both csv and
# parquet files, yet every file is read with pd.read_csv - confirm that only
# csv files are actually present.
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue

    print(filename)
    df = pd.read_csv(file_path)
    df['date'] = pd.to_datetime(df['date'])
    # diff() below compares consecutive rows, so rows must be in date order.
    df.sort_values(by='date', inplace=True)

    # The second '_'-separated token of the file name picks the target columns;
    # any token that is not a key falls back to the price columns.
    file_type = filename.split('_')[1]
    cols = selected_columns.get(file_type, ['Open','High','Low','Close','Volume'])
    # Copy the configured list: appending to the list stored in `drop_columns`
    # would make that shared config grow with every file processed.
    del_cols = list(drop_columns.get(file_type, ['Start','End','date', 'Market Cap']))
    del_cols.append('Unnamed: 0')

    sel = []
    for col in cols:
        # Change versus the previous row, then a 1/0 flag for "went up".
        # The first row has no predecessor: its diff is NaN and its flag is 0.
        df[f'{col}_diff'] = df[col].astype(float).diff()
        df[f'{col}_direction'] = (df[f'{col}_diff'] > 0).astype(int)
        # Only the direction flag is kept; the raw value and its diff are dropped.
        del_cols.append(f'{col}_diff')
        del_cols.append(col)
        sel.append(f'{col}_direction')

    df.drop(columns=del_cols, inplace=True)

    # get_top10_corr computes the correlation matrix itself.
    total = get_top10_corr(df, sel, [])
    for user in total:
        top_influecers[user] = top_influecers.get(user, 0) + 1



In [None]:
# Keep only the counters whose key contains 'user_name', dropping every other
# column that made a top-10 ranking.
top_influecers = {k: v for k, v in top_influecers.items() if 'user_name' in k}

# The ten most frequent entries, highest count first.
top10_influecers = dict(sorted(top_influecers.items(), key=lambda item: item[1], reverse=True)[:10])
user = list(top10_influecers.keys())
val = list(top10_influecers.values())

plt.bar(user, val)
plt.xticks(rotation=45, ha='right')
# Label every bar with its count. Iterating over the actual entries avoids an
# IndexError when fewer than ten entries are available.
for name, count in zip(user, val):
    plt.text(name, count, count, ha='center')
plt.show()