In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import pipeline, TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import torch

# NLTK resources (only if using NLTK for sentiment analysis)
nltk.download('vader_lexicon')

# Set display options to prevent scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

# Set a style for Seaborn visualizations
sns.set(style="whitegrid")

# Optional: Verify that TensorFlow is using Metal acceleration
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))


  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/reebal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Tesla Stock Data
tesla_data = pd.read_csv('data/TSLA.csv')

# Elon Musk Tweets
tweets_data = pd.read_csv('data/elonmusk.csv')

In [3]:
tesla_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-12,13.54,13.63,13.28,13.48,13.48,89254500
1,2015-01-13,13.55,13.84,13.39,13.62,13.62,67159500
2,2015-01-14,12.39,13.01,12.33,12.85,12.85,173278500
3,2015-01-15,12.97,13.05,12.67,12.79,12.79,78247500
4,2015-01-16,12.71,12.97,12.64,12.87,12.87,54048000


In [4]:
tweets_data.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1282939902531796993,1282933079431151618,1594711683000,2020-07-14,07:28:03,UTC,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'user_id': '44196397', 'username': 'elonmusk...",,,,
1,1282844872571904000,1282801938111791104,1594689026000,2020-07-14,01:10:26,UTC,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'user_id': '44196397', 'username': 'elonmusk...",,,,
2,1282805559834492929,1282758532312584193,1594679653000,2020-07-13,22:34:13,UTC,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'user_id': '44196397', 'username': 'elonmusk...",,,,
3,1282800187308572672,1282671714657157120,1594678372000,2020-07-13,22:12:52,UTC,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'user_id': '44196397', 'username': 'elonmusk...",,,,
4,1282800078000803840,1282739486816964615,1594678346000,2020-07-13,22:12:26,UTC,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'user_id': '44196397', 'username': 'elonmusk...",,,,


In [5]:
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9286 entries, 0 to 9285
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               9286 non-null   int64  
 1   conversation_id  9286 non-null   int64  
 2   created_at       9286 non-null   int64  
 3   date             9286 non-null   object 
 4   time             9286 non-null   object 
 5   timezone         9286 non-null   object 
 6   user_id          9286 non-null   int64  
 7   username         9286 non-null   object 
 8   name             9286 non-null   object 
 9   place            0 non-null      float64
 10  tweet            9286 non-null   object 
 11  mentions         9286 non-null   object 
 12  urls             9286 non-null   object 
 13  photos           9286 non-null   object 
 14  replies_count    9286 non-null   int64  
 15  retweets_count   9286 non-null   int64  
 16  likes_count      9286 non-null   int64  
 17  hashtags      

In [6]:
tweets_data.drop(['id',
                    'conversation_id',
                    'created_at',
                    'timezone',
                    'user_id',
                    'username',
                    'name',
                    'mentions',
                    'place',
                    'urls',
                    'photos',
                    'hashtags',
                    'cashtags',
                    'link',
                    'retweet',
                    'quote_url',
                    'video',
                    'reply_to',
                    'near',
                    'geo',
                    'source',
                    'user_rt_id',
                    'user_rt',
                    'retweet_id',
                    'retweet_date',
                    'translate',
                    'trans_src',
                    'trans_dest'
                ], axis=1, inplace=True)

In [7]:
tweets_data.head()

Unnamed: 0,date,time,tweet,replies_count,retweets_count,likes_count
0,2020-07-14,07:28:03,Cute,222,176,7883
1,2020-07-14,01:10:26,Wow,346,180,4241
2,2020-07-13,22:34:13,Reusability is essential. A rocket that is sin...,102,126,1551
3,2020-07-13,22:12:52,Wild times!,438,415,13421
4,2020-07-13,22:12:26,We’re being extra paranoid. Maximizing probabi...,128,155,4238


In [8]:
# Tesla Data: Convert 'Date' to datetime format and sort by date
tesla_data['Date'] = pd.to_datetime(tesla_data['Date'])
tesla_data = tesla_data.sort_values(by='Date')

# Tweets Data: Convert 'date' and 'time to to date and time format and sort by date than time
tweets_data['date'] = pd.to_datetime(tweets_data['date'])  # Convert 'date' to datetime format
tweets_data['time'] = pd.to_datetime(tweets_data['time'], format='%H:%M:%S').dt.time  # Convert 'time' to time format
tweets_data = tweets_data.sort_values(by=['date', 'time'])  # Sort by both 'date' and 'time'


In [9]:
tweets_data.head(50)

Unnamed: 0,date,time,tweet,replies_count,retweets_count,likes_count
9285,2015-01-30,03:25:33,"If you are curious about the P85D, you can sch...",243,462,796
9284,2015-02-08,18:38:58,Launching our 1st deep space mission today. He...,354,3026,4198
9283,2015-02-08,19:43:58,Rocket reentry will be much tougher this time ...,241,1154,2176
9282,2015-02-08,23:13:28,Air Force tracking radar went down. Launch pos...,205,1479,1147
9281,2015-02-08,23:17:57,Prob good though. Will give us time to replace...,99,452,940
9280,2015-02-10,16:12:46,"""What Are The Civilian Applications?"" https://...",390,647,719
9279,2015-02-10,19:53:37,Extreme wind shear over Cape Canaveral. Feels ...,78,346,616
9278,2015-02-10,23:12:58,Launch postponed to tomorrow due to high winds...,82,587,986
9277,2015-02-11,01:35:29,Dragon splashdown off the California coast pic...,162,1888,3807
9276,2015-02-11,03:24:08,Coming home pic.twitter.com/FmrmYs6R6V,204,2289,4897


In [10]:
tesla_data.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
count,2372,2372.0,2372.0,2372.0,2372.0,2372.0,2372.0
mean,2019-09-26 22:39:51.905564928,107.08,109.41,104.57,107.04,107.04,113526808.07
min,2015-01-12 00:00:00,9.49,10.33,9.4,9.58,9.58,10620000.0
25%,2017-05-18 18:00:00,16.87,17.18,16.64,16.92,16.92,66855825.0
50%,2019-09-26 12:00:00,23.8,24.23,23.44,23.78,23.78,93598550.0
75%,2022-02-02 06:00:00,210.01,215.65,205.48,210.14,210.14,131854100.0
max,2024-06-13 00:00:00,411.47,414.5,405.67,409.97,409.97,914082000.0
std,,109.88,112.3,107.2,109.78,109.78,75566868.97


In [11]:
tweets_data.describe()

Unnamed: 0,date,replies_count,retweets_count,likes_count
count,9286,9286.0,9286.0,9286.0
mean,2018-10-23 22:46:29.748007680,512.96,2282.84,18734.8
min,2015-01-30 00:00:00,0.0,0.0,21.0
25%,2018-02-22 00:00:00,43.25,43.0,943.0
50%,2019-01-27 00:00:00,103.0,129.0,2341.5
75%,2019-10-25 00:00:00,371.0,955.75,11598.5
max,2020-07-14 00:00:00,49529.0,384289.0,1682551.0
std,,1720.32,10546.78,59295.79


In [12]:
# Check for missing values in Tesla stock data
print(tesla_data.isnull().sum())

# Check for missing values in Tweets data
print(tweets_data.isnull().sum())

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
date              0
time              0
tweet             0
replies_count     0
retweets_count    0
likes_count       0
dtype: int64


In [13]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize input
inputs = tokenizer("I love using TensorFlow with Metal!", return_tensors="tf")

# Perform inference
outputs = model(**inputs)

# Get sentiment
sentiment = tf.argmax(outputs.logits, axis=1).numpy()
print("Sentiment:", sentiment)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2024-08-23 20:19:25.652492: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-23 20:19:25.652517: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Sentiment: [1]


In [14]:
# Check if GPU (Metal) is available
device = 0 if torch.cuda.is_available() else -1

# Hugging Face pipeline for sentiment analysis (without `clean_up_tokenization_spaces`)
sentiment_pipeline = pipeline('sentiment-analysis', device=device)

# Apply sentiment analysis to each tweet
tweets_data['sentiment'] = tweets_data['tweet'].apply(lambda tweet: sentiment_pipeline(tweet)[0]['label'])


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [15]:
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9286 entries, 9285 to 0
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            9286 non-null   datetime64[ns]
 1   time            9286 non-null   object        
 2   tweet           9286 non-null   object        
 3   replies_count   9286 non-null   int64         
 4   retweets_count  9286 non-null   int64         
 5   likes_count     9286 non-null   int64         
 6   sentiment       9286 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 580.4+ KB


In [16]:
# Mapping sentiment labels to numeric values
sentiment_mapping = {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0}
tweets_data['sentiment_numeric'] = tweets_data['sentiment'].map(sentiment_mapping)

# Aggregation of tweets per day
daily_tweets = tweets_data.groupby(tweets_data['date'].dt.date).agg({
    'sentiment_numeric': 'mean',  # Apply mean on numeric sentiment column
    'likes_count': 'sum',
    'retweets_count': 'sum',
    'replies_count': 'sum'
}).reset_index()

# Rename the date column for clarity
daily_tweets.rename(columns={'date': 'Date'}, inplace=True)
daily_tweets['Date'] = pd.to_datetime(daily_tweets['Date'])

In [17]:
daily_tweets.head()

Unnamed: 0,Date,sentiment_numeric,likes_count,retweets_count,replies_count
0,2015-01-30,-1.0,796,462,243
1,2015-02-08,0.0,8461,6111,899
2,2015-02-10,-1.0,2321,1580,550
3,2015-02-11,0.0,20254,10893,1773
4,2015-02-12,-1.0,2660,1467,244


In [18]:
daily_tweets.describe()

Unnamed: 0,Date,sentiment_numeric,likes_count,retweets_count,replies_count
count,1360,1360.0,1360.0,1360.0,1360.0
mean,2018-04-11 13:14:07.058823424,-0.01,127920.09,15587.12,3502.45
min,2015-01-30 00:00:00,-1.0,34.0,3.0,2.0
25%,2017-02-15 18:00:00,-0.4,8524.0,1244.5,395.75
50%,2018-06-23 12:00:00,0.0,38859.5,4483.5,1412.0
75%,2019-07-08 06:00:00,0.33,147692.25,14580.5,3949.25
max,2020-07-14 00:00:00,1.0,2458081.0,397381.0,98612.0
std,,0.63,222267.41,32659.22,6561.42


In [19]:
# Merge the two datasets on the 'Date' column
merged_data = pd.merge(tesla_data, daily_tweets, on='Date', how='outer')

In [20]:
merged_data.head(50)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
0,2015-01-12,13.54,13.63,13.28,13.48,13.48,89254500.0,,,,
1,2015-01-13,13.55,13.84,13.39,13.62,13.62,67159500.0,,,,
2,2015-01-14,12.39,13.01,12.33,12.85,12.85,173278500.0,,,,
3,2015-01-15,12.97,13.05,12.67,12.79,12.79,78247500.0,,,,
4,2015-01-16,12.71,12.97,12.64,12.87,12.87,54048000.0,,,,
5,2015-01-20,12.92,12.94,12.47,12.8,12.8,67548000.0,,,,
6,2015-01-21,12.64,13.25,12.63,13.1,13.1,62295000.0,,,,
7,2015-01-22,13.13,13.55,13.01,13.44,13.44,61753500.0,,,,
8,2015-01-23,13.35,13.57,13.22,13.42,13.42,51579000.0,,,,
9,2015-01-26,13.46,13.91,13.4,13.77,13.77,48517500.0,,,,


In [21]:
subset_merged_data = merged_data.iloc[102:151]
subset_merged_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
102,2015-05-19,16.56,16.73,16.41,16.48,16.48,55113000.0,-1.0,2484.0,2615.0,302.0
103,2015-05-20,16.48,16.52,16.09,16.29,16.29,56334000.0,,,,
104,2015-05-21,16.2,16.44,16.16,16.37,16.37,29559000.0,,,,
105,2015-05-22,16.36,16.57,16.33,16.52,16.52,33346500.0,-1.0,2005.0,1160.0,1031.0
106,2015-05-26,16.51,16.8,16.43,16.5,16.5,52480500.0,,,,
107,2015-05-27,16.57,16.63,16.37,16.5,16.5,51123000.0,1.0,2313.0,1188.0,235.0
108,2015-05-28,16.47,16.79,16.34,16.76,16.76,54709500.0,,,,
109,2015-05-29,16.73,16.86,16.63,16.72,16.72,56839500.0,,,,
110,2015-06-01,16.76,16.77,16.5,16.63,16.63,37576500.0,1.0,241.0,39.0,38.0
111,2015-06-02,16.59,16.63,16.42,16.56,16.56,32022000.0,,,,


In [22]:
merged_data.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
count,2784,2372.0,2372.0,2372.0,2372.0,2372.0,2372.0,1360.0,1360.0,1360.0,1360.0
mean,2019-07-08 07:03:06.206896640,107.08,109.41,104.57,107.04,107.04,113526808.07,-0.01,127920.09,15587.12,3502.45
min,2015-01-12 00:00:00,9.49,10.33,9.4,9.58,9.58,10620000.0,-1.0,34.0,3.0,2.0
25%,2017-04-30 06:00:00,16.87,17.18,16.64,16.92,16.92,66855825.0,-0.4,8524.0,1244.5,395.75
50%,2019-05-16 12:00:00,23.8,24.23,23.44,23.78,23.78,93598550.0,0.0,38859.5,4483.5,1412.0
75%,2021-09-07 06:00:00,210.01,215.65,205.48,210.14,210.14,131854100.0,0.33,147692.25,14580.5,3949.25
max,2024-06-13 00:00:00,411.47,414.5,405.67,409.97,409.97,914082000.0,1.0,2458081.0,397381.0,98612.0
std,,109.88,112.3,107.2,109.78,109.78,75566868.97,0.63,222267.41,32659.22,6561.42


In [23]:
# Define the date range
start_date = '2015-01-30'
end_date = '2020-07-14'

# Filter the data to keep only rows within the desired date range
filtered_data = merged_data[(merged_data['Date'] >= start_date) & (merged_data['Date'] <= end_date)]

In [24]:
# Replace NaN values in the specified columns with 0 for filtered_data using .loc
filtered_data.loc[:, ['sentiment_numeric', 'likes_count', 'retweets_count', 'replies_count']] = filtered_data.loc[:, ['sentiment_numeric', 'likes_count', 'retweets_count', 'replies_count']].fillna(0)


In [25]:
filtered_data.head(50)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
13,2015-01-30,13.6,13.83,13.53,13.57,13.57,45105000.0,-1.0,796.0,462.0,243.0
14,2015-02-02,13.6,14.13,13.55,14.06,14.06,62238000.0,0.0,0.0,0.0,0.0
15,2015-02-03,14.21,14.69,14.08,14.56,14.56,72393000.0,0.0,0.0,0.0,0.0
16,2015-02-04,14.55,14.77,14.45,14.57,14.57,49581000.0,0.0,0.0,0.0,0.0
17,2015-02-05,14.66,15.03,14.64,14.73,14.73,52843500.0,0.0,0.0,0.0,0.0
18,2015-02-06,14.8,14.89,14.43,14.49,14.49,48658500.0,0.0,0.0,0.0,0.0
19,2015-02-08,,,,,,,0.0,8461.0,6111.0,899.0
20,2015-02-09,14.36,14.53,14.13,14.5,14.5,52086000.0,0.0,0.0,0.0,0.0
21,2015-02-10,14.5,14.7,14.33,14.42,14.42,80857500.0,-1.0,2321.0,1580.0,550.0
22,2015-02-11,14.15,14.32,13.82,14.19,14.19,146536500.0,0.0,20254.0,10893.0,1773.0


In [26]:
# Iterate through the rows of the DataFrame
index = 0

while index < len(filtered_data):
    # Check if 'Open' is NaN at the current index
    if pd.isna(filtered_data.iloc[index]['Open']):
        # Initialize variables for aggregation
        sentiment_sum = 0
        likes_sum = 0
        retweets_sum = 0
        replies_sum = 0
        count = 0
        
        # Start at the current index and keep track of the first valid row after NaNs
        next_valid_index = index
        
        # Aggregate the values over consecutive NaN rows in 'Open'
        while next_valid_index < len(filtered_data) and pd.isna(filtered_data.iloc[next_valid_index]['Open']):
            sentiment_sum += filtered_data.iloc[next_valid_index]['sentiment_numeric']
            likes_sum += filtered_data.iloc[next_valid_index]['likes_count']
            retweets_sum += filtered_data.iloc[next_valid_index]['retweets_count']
            replies_sum += filtered_data.iloc[next_valid_index]['replies_count']
            count += 1
            next_valid_index += 1
        
        # Check if the next row with non-NaN 'Open' exists
        if next_valid_index < len(filtered_data):
            # Add the sentiment_numeric from the valid next day to the aggregation
            sentiment_sum += filtered_data.iloc[next_valid_index]['sentiment_numeric']
            likes_sum += filtered_data.iloc[next_valid_index]['likes_count']
            retweets_sum += filtered_data.iloc[next_valid_index]['retweets_count']
            replies_sum += filtered_data.iloc[next_valid_index]['replies_count']
            count += 1
            
            # Set the sentiment_numeric value to the mean of the aggregated values
            filtered_data.iloc[next_valid_index, filtered_data.columns.get_loc('sentiment_numeric')] = sentiment_sum / count
            # Set the sums for the other columns
            filtered_data.iloc[next_valid_index, filtered_data.columns.get_loc('likes_count')] = likes_sum
            filtered_data.iloc[next_valid_index, filtered_data.columns.get_loc('retweets_count')] = retweets_sum
            filtered_data.iloc[next_valid_index, filtered_data.columns.get_loc('replies_count')] = replies_sum
        
        # Drop all NaN rows from the DataFrame (we don't need them anymore)
        filtered_data = filtered_data.drop(filtered_data.index[index:next_valid_index])
        
        # Continue from the next valid index
        index = next_valid_index
    else:
        index += 1


In [33]:
filtered_data.head(50)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
13,2015-01-30,13.6,13.83,13.53,13.57,13.57,45105000.0,-1.0,796.0,462.0,243.0
14,2015-02-02,13.6,14.13,13.55,14.06,14.06,62238000.0,0.0,0.0,0.0,0.0
15,2015-02-03,14.21,14.69,14.08,14.56,14.56,72393000.0,0.0,0.0,0.0,0.0
16,2015-02-04,14.55,14.77,14.45,14.57,14.57,49581000.0,0.0,0.0,0.0,0.0
17,2015-02-05,14.66,15.03,14.64,14.73,14.73,52843500.0,0.0,0.0,0.0,0.0
18,2015-02-06,14.8,14.89,14.43,14.49,14.49,48658500.0,0.0,0.0,0.0,0.0
20,2015-02-09,14.36,14.53,14.13,14.5,14.5,52086000.0,0.0,8461.0,6111.0,899.0
21,2015-02-10,14.5,14.7,14.33,14.42,14.42,80857500.0,-1.0,2321.0,1580.0,550.0
22,2015-02-11,14.15,14.32,13.82,14.19,14.19,146536500.0,0.0,20254.0,10893.0,1773.0
23,2015-02-12,12.9,13.54,12.89,13.53,13.53,234744000.0,-1.0,2660.0,1467.0,244.0


In [28]:
# Calculate the number of zeros and non-zeros in 'likes_count' column
likes_count_zeros = (filtered_data['likes_count'] == 0).sum()
likes_count_non_zeros = (filtered_data['likes_count'] != 0).sum()

# Calculate the total number of rows
total_likes_count = len(filtered_data)

# Calculate percentages
likes_count_zeros_percentage = (likes_count_zeros / total_likes_count) * 100
likes_count_non_zeros_percentage = (likes_count_non_zeros / total_likes_count) * 100

# Display the results
print(f"Number of rows with 0 likes_count: {likes_count_zeros}")
print(f"Number of rows with non-zero likes_count: {likes_count_non_zeros}")
print(f"Percentage of rows with 0 likes_count: {likes_count_zeros_percentage:.2f}%")
print(f"Percentage of rows with non-zero likes_count: {likes_count_non_zeros_percentage:.2f}%")


Number of rows with 0 likes_count: 356
Number of rows with non-zero likes_count: 1018
Percentage of rows with 0 likes_count: 25.91%
Percentage of rows with non-zero likes_count: 74.09%


In [29]:
filtered_data.describe()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sentiment_numeric,likes_count,retweets_count,replies_count
count,1374,1373.0,1373.0,1373.0,1373.0,1373.0,1373.0,1374.0,1374.0,1374.0,1374.0
mean,2017-10-21 08:20:57.641921536,21.18,21.61,20.75,21.2,21.2,116025499.27,-0.0,126616.68,15428.3,3466.76
min,2015-01-30 00:00:00,9.49,10.33,9.4,9.58,9.58,10620000.0,-1.0,0.0,0.0,0.0
25%,2016-06-10 18:00:00,15.1,15.31,14.81,15.06,15.06,62526000.0,-0.2,0.0,0.0,0.0
50%,2017-10-19 12:00:00,17.99,18.33,17.73,18.01,18.01,89580000.0,0.0,17087.5,2489.5,772.0
75%,2019-03-04 18:00:00,22.37,22.82,22.0,22.34,22.34,134556000.0,0.19,107996.75,13109.0,3385.5
max,2020-07-14 00:00:00,110.6,119.67,98.07,102.98,102.98,914082000.0,1.0,2829985.0,397381.0,98612.0
std,,11.47,11.9,11.03,11.49,11.49,87168059.56,0.49,274839.22,36955.91,7840.6


In [34]:
filtered_data.to_csv('data/filtered_merged_data.csv', index=False)

In [35]:
filtered_data.to_pickle('data/filtered_merged_data.pkl')