In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt

In [2]:
# Elon database csv
# parse_dates alone converts created_at to datetimes; the separate date_parser
# argument is deprecated in pandas 2.x and added nothing here.
elon_df = pd.read_csv('elon_twitter_database/elon_sentsubj.csv', parse_dates=['created_at'])
elon_df.head()

Unnamed: 0,id,tweet,created_at,noun_phrases,polarity,subjectivity
0,1409968558877003776,@torybruno @Dynetics Should I bring a few spar...,2021-06-29 13:14:35,"['@ torybruno @', 'dynetics', 'spare engines …']",-0.25625,0.39375
1,1409968188620673030,@torybruno @Dynetics Sure,2021-06-29 13:13:06,"['@ torybruno @', 'dynetics sure']",0.5,0.888889
2,1409964008682098688,@PassionFlix @ToscaMusk I liked your movie “Ho...,2021-06-29 12:56:30,"['passionflix', 'toscamusk', 'movie “', 'holly...",0.75,0.8
3,1409963681807544320,@nextspaceflight There is the internal goal if...,2021-06-29 12:55:12,"['@ nextspaceflight', 'internal goal', 'obviou...",0.095238,0.278571
4,1409960205920649220,@torybruno @Dynetics When do the engines arriv...,2021-06-29 12:41:23,"['@ torybruno @', 'dynetics']",0.0,0.0


In [3]:
# Put the tweets in chronological order using their created_at timestamp
elon_df = elon_df.sort_values('created_at')

In [4]:
# Function to flag whether a tweet's noun phrases mention any of the given subjects
def is_subject(subject_field, subject_set):
    """Return 1 if any noun phrase in ``subject_field`` is in ``subject_set``, else 0.

    Parameters
    ----------
    subject_field : str
        A list of phrases serialized as text, e.g. ``"['tesla', 'model s']"``.
        Anything that is not a string (such as the NaN pandas uses for a
        missing cell) is treated as "no phrases" and yields 0.
    subject_set : iterable of str
        Lower-case keywords to look for.

    Returns
    -------
    int
        1 when at least one phrase equals one of the keywords, otherwise 0.

    Notes
    -----
    Matching is on whole phrases, not substrings: ``'tesla stock'`` does not
    match the keyword ``'tesla'``.
    """
    # Missing cells arrive as float NaN / None and have no string methods.
    if not isinstance(subject_field, str):
        return 0

    # Phrases containing an apostrophe are repr'd with double quotes, so both
    # quote characters are removed before comparing.
    phrases = {
        phrase.lower().replace("'", '').replace('"', '').strip()
        for phrase in subject_field.strip().strip('[]').split(', ')
    }
    return 0 if phrases.isdisjoint(subject_set) else 1

In [5]:
# Keyword sets passed to is_subject: one set of noun phrases per topic
tesla_fields = {'tesla', 'autopilot'}
btc_fields = {'btc', 'bitcoin'}
eth_fields = {'ethereum', 'eth'}
doge_fields = {'dogecoin', 'doge', 'egod'}
# The general crypto set is every coin-specific keyword plus a few generic terms
crypto_fields = btc_fields | eth_fields | doge_fields | {'crypto', 'moon', 'currency'}

In [6]:
# Add one 1/0 flag column per topic by running is_subject over each tweet's noun phrases.
topic_keywords = {
    'is_tesla': tesla_fields,
    'is_doge': doge_fields,
    'is_btc': btc_fields,
    'is_crypto': crypto_fields,
}
for flag_column, keywords in topic_keywords.items():
    elon_df[flag_column] = elon_df['noun_phrases'].apply(func=is_subject, subject_set=keywords)

In [7]:
# DataFrames for each variable I want to plot
# These definitions must be live: crypto_tweets is read further down, and with
# the lines commented out the notebook only ran on leftover kernel state.
# The is_* columns hold 1/0, so compare against 1; .copy() makes each subset an
# independent frame rather than a slice of elon_df.
# NOTE(review): "dodge_tweets" keeps the original spelling in case other cells use it.
tesla_tweets: DataFrame = elon_df.loc[elon_df['is_tesla'] == 1].copy()
dodge_tweets: DataFrame = elon_df.loc[elon_df['is_doge'] == 1].copy()
btc_tweets: DataFrame = elon_df.loc[elon_df['is_btc'] == 1].copy()
crypto_tweets: DataFrame = elon_df.loc[elon_df['is_crypto'] == 1].copy()

In [8]:
def load_coin_prices(csv_path):
    """Load one cleaned coin price CSV for use with the tweet timestamps.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that has at least the columns 'Open Time', 'Close Time'
        and 'Close'. 'Open Time' is read as epoch milliseconds.

    Returns
    -------
    pandas.DataFrame
        The file's contents with 'Open Time' converted to datetimes and the
        'Close' and 'Close Time' columns removed.
    """
    prices = pd.read_csv(csv_path)
    # Only 'Open Time' is converted; 'Close Time' is dropped on the next line.
    prices['Open Time'] = pd.to_datetime(prices['Open Time'], unit='ms')
    return prices.drop(columns=['Close', 'Close Time'])


# Forward slashes resolve on Windows, macOS and Linux; the previous
# backslash-separated paths only resolved on Windows.

# Setting up bitcoin dataframe
bitcoin = 'bitcoin_database/Cleaned/BTCUSDT.csv'
bitcoin_df = load_coin_prices(bitcoin)

# Setting up dogecoin dataframe
dogecoin = 'dogecoin_database/Cleaned/DOGEUSDT.csv'
dogecoin_df = load_coin_prices(dogecoin)

In [9]:
# Function that returns the tweet row extended with the coin's price change around it
def find_intervals(df_row, target_coin: DataFrame, interval: str = '5 minutes', plusminus='after'):
    """Attach a coin's price move around one tweet to that tweet's row.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    df_row : pandas.Series
        One tweet; must contain 'created_at'.
    target_coin : DataFrame
        Price data with an 'Open Time' datetime column and an 'Open' column.
    interval : str
        Window length, in any form ``pd.Timedelta`` accepts.
    plusminus : {'after', 'before'}
        Whether the window starts at the tweet ('after') or ends at it ('before').

    Returns
    -------
    pandas.Series
        A copy of ``df_row`` with three extra entries, named
        ``'<interval> beginning value'``, ``'<interval> end value'`` and
        ``'<interval> change pct'``. The change is (end - beginning) / beginning
        of 'Open' and is a fraction, not multiplied by 100. If no complete
        price row falls inside the window, the result is instead a row with the
        original labels and every value set to None, so a later ``dropna()``
        removes it.

    Raises
    ------
    ValueError
        If ``plusminus`` is neither 'after' nor 'before'.
    """
    td = pd.Timedelta(interval)
    timestamp = df_row['created_at']

    # between() needs left <= right, so the 'before' window runs from
    # (timestamp - td) up to timestamp; the reverse order matches nothing.
    if plusminus == 'after':
        window_start, window_end = timestamp, timestamp + td
    elif plusminus == 'before':
        window_start, window_end = timestamp - td, timestamp
    else:
        raise ValueError(f"plusminus must be 'after' or 'before', got {plusminus!r}")

    # Price rows inside the window (both ends inclusive), minus any with missing values
    in_window = target_coin['Open Time'].between(window_start, window_end)
    coin_df = target_coin.loc[in_window].dropna()

    # No usable price data: blank the whole row instead of raising.
    # Other errors (e.g. a missing 'Open' column) are no longer swallowed.
    if coin_df.empty:
        return df_row.apply(lambda _: None)

    first_open = coin_df['Open'].iloc[0]
    last_open = coin_df['Open'].iloc[-1]

    # Work on a copy so the caller's row is not modified
    result = df_row.copy()
    result[interval + ' beginning value'] = first_open
    result[interval + ' end value'] = last_open
    result[interval + ' change pct'] = (last_open - first_open) / first_open
    return result

In [14]:
# Defining the intervals I want for the price-change columns
intervals = ['5 minutes', '30 minutes']
intervals_plusminus = [(window, 'after') for window in intervals]

# Both result frames start from the crypto tweets; each pass runs
# find_intervals on every row for one window, once per coin
crypto_btcval_tweets = crypto_tweets
crypto_dogeval_tweets = crypto_tweets
for interval, plusminus in intervals_plusminus:
    crypto_btcval_tweets = crypto_btcval_tweets.apply(
        find_intervals,
        axis=1,
        target_coin=bitcoin_df,
        interval=interval,
        plusminus=plusminus,
    )
    crypto_dogeval_tweets = crypto_dogeval_tweets.apply(
        find_intervals,
        axis=1,
        target_coin=dogecoin_df,
        interval=interval,
        plusminus=plusminus,
    )

# Discard every row that has a missing value in any column
crypto_btcval_tweets = crypto_btcval_tweets.dropna()
crypto_dogeval_tweets = crypto_dogeval_tweets.dropna()

In [15]:
# The row-wise apply above returns the columns in a different order
# (NOTE(review): probably because find_intervals returns rows with differing
# labels and pandas aligns them - confirm), so put them back in a fixed order.
# Columns not listed here are left out of the export.
export_columns = [
    'id', 'created_at', 'tweet', 'noun_phrases', 'polarity', 'subjectivity',
    'is_btc', 'is_doge', 'is_crypto', 'is_tesla',
    '5 minutes beginning value', '5 minutes end value', '5 minutes change pct',
    '30 minutes beginning value', '30 minutes end value', '30 minutes change pct',
]
crypto_btcval_tweets = crypto_btcval_tweets[export_columns]
crypto_dogeval_tweets = crypto_dogeval_tweets[export_columns]

# Write both result tables to CSV without the index column
crypto_btcval_tweets.to_csv('crypto_btcval_tweets.csv', index=False)
crypto_dogeval_tweets.to_csv('crypto_dogeval_tweets.csv', index=False)