# Tesla Stock Prediction Preprocessing

Preprocessing has been moved to this separate file for efficiency: it is run once and the resulting dataframe is saved to a file, so it does not need to be repeated unless the preprocessing itself changes.

I use Trump and Elon Musk tweet data as part of the prediction. I did some low-effort sentiment analysis using the nltk Python library (nltk = Natural Language Toolkit) and the VADER pre-trained sentiment analysis tool. I also make my own attempt at analyzing the tweets by checking whether they contain certain Tesla-related keywords. In addition, I use Google Trends data for the keyword "Tesla" as another feature. The result of this preprocessing is saved as "preprocessedTesla_MoreRecent.csv".

In [49]:
import pandas as pd
import numpy as np

In [50]:
# Daily TSLA price history exported from Yahoo Finance.
# The first CSV column becomes the index and 'Date' is parsed to datetimes,
# so later cells can merge on the index by date.
df_tsla = pd.read_csv(
    "TSLA_MoreRecent.csv",
    index_col=0,
    parse_dates=['Date'],
)
print(df_tsla.shape)  # (rows, columns) sanity check
df_tsla.head()

(2897, 6)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-01,5.0,5.184,4.054,4.392,4.392,41094000
2010-07-02,4.6,4.62,3.742,3.84,3.84,25699000
2010-07-06,4.0,4.0,3.166,3.222,3.222,34334500
2010-07-07,3.28,3.326,2.996,3.16,3.16,34608500
2010-07-08,3.228,3.504,3.114,3.492,3.492,38557000


In [51]:
# GM price history from GM.csv, loaded the same way as the TSLA file:
# first column as the index, 'Date' parsed to datetimes.
df_GM = pd.read_csv(
    "GM.csv",
    index_col=0,
    parse_dates=['Date'],
)
print(df_GM.shape)  # (rows, columns) sanity check
df_GM.head()

(2798, 6)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-11-19,34.150002,34.5,33.110001,34.259998,26.479338,107842000
2010-11-22,34.200001,34.48,33.810001,34.080002,26.340218,36650600
2010-11-23,33.950001,33.990002,33.189999,33.25,25.698721,31170200
2010-11-24,33.73,33.799999,33.220001,33.48,25.876482,26138000
2010-11-26,33.41,33.810001,33.209999,33.799999,26.123804,12301200


In [52]:
# Google Trends data for the keyword "Tesla, Inc."
#
# Column layout of the CSV, as described in dataDownload.py:
#   'Tesla, Inc.'           daily search volume, already scaled and comparable
#                           through time
#   'Tesla, Inc._unscaled'  original daily data fetched month by month; it is
#                           comparable within a month but not across months
#   'Tesla, Inc._monthly'   original monthly data fetched at once, backfilled
#                           so that no NaN are present
#   'scale'                 the scale used to obtain the scaled daily data
# The file also carries an 'isPartial' column, which is dropped as well.
#
# Only the scaled 'Tesla, Inc.' column is needed; it is renamed
# 'Tesla Inc Trend' for clarity.
df_teslaIncTrend = pd.read_csv(
    "teslaIncTrend_MoreRecent.csv",
    parse_dates=['date'],
    index_col=0,
)
print(df_teslaIncTrend.shape)  # shape of the raw file, before columns are dropped

df_teslaIncTrend = (
    df_teslaIncTrend
    .drop(columns=['Tesla, Inc._unscaled', 'Tesla, Inc._monthly', 'isPartial', 'scale'])
    .rename(columns={'Tesla, Inc.': 'Tesla Inc Trend'})
)
df_teslaIncTrend.head()

(1461, 5)


Unnamed: 0_level_0,Tesla Inc Trend
date,Unnamed: 1_level_1
2018-01-01,
2018-01-02,
2018-01-03,
2018-01-04,
2018-01-05,


In [53]:
# Google Trends series for the search term "Tesla".
# The raw file is indexed by its first column with 'date' parsed to datetimes.
df_tslaTrend = pd.read_csv(
    "teslaTrend_MoreRecent.csv",
    index_col=0,
    parse_dates=['date'],
)
print(df_tslaTrend.shape)  # shape of the raw file, before columns are dropped

# Keep only the scaled daily column and give it a descriptive name.
df_tslaTrend = (
    df_tslaTrend
    .drop(columns=['Tesla_unscaled', 'Tesla_monthly', 'isPartial', 'scale'])
    .rename(columns={'Tesla': 'Tesla Trend'})
)
df_tslaTrend.head()

(4202, 5)


Unnamed: 0_level_0,Tesla Trend
date,Unnamed: 1_level_1
2010-07-01,6.0
2010-07-02,4.26
2010-07-03,2.82
2010-07-04,2.52
2010-07-05,2.28


In [54]:
# Google Trends data for the keyword "Elon Musk"
# (the CSV columns are named after that keyword: 'Elon Musk', 'Elon Musk_unscaled', ...).
df_muskTrend = pd.read_csv(
    "muskTrend_MoreRecent.csv",
    parse_dates=['date'],
    index_col=0,
)
print(df_muskTrend.shape)  # shape of the raw file, before columns are dropped

# Keep only the 'Elon Musk' column and rename it 'Musk Trend'.
df_muskTrend = (
    df_muskTrend
    .drop(columns=['Elon Musk_unscaled', 'Elon Musk_monthly', 'isPartial', 'scale'])
    .rename(columns={'Elon Musk': 'Musk Trend'})
)
df_muskTrend.head()

(4202, 5)


Unnamed: 0_level_0,Musk Trend
date,Unnamed: 1_level_1
2010-07-01,0.0
2010-07-02,0.0
2010-07-03,0.0
2010-07-04,0.0
2010-07-05,0.0


In [55]:
# Google Trends series for the search term "GM".
df_GMTrend = pd.read_csv(
    "GMTrend_MoreRecent.csv",
    index_col=0,
    parse_dates=['date'],
)
print(df_GMTrend.shape)  # shape of the raw file, before columns are dropped

# Keep only the 'GM' column and rename it 'GM Trend'.
df_GMTrend = (
    df_GMTrend
    .drop(columns=['GM_unscaled', 'GM_monthly', 'isPartial', 'scale'])
    .rename(columns={'GM': 'GM Trend'})
)
df_GMTrend.head()

(2557, 5)


Unnamed: 0_level_0,GM Trend
date,Unnamed: 1_level_1
2015-01-01,48.91
2015-01-02,73.0
2015-01-03,55.48
2015-01-04,48.18
2015-01-05,56.94


In [56]:
# Google Trends series for the search term "Electric vehicle".
df_EVTrend = pd.read_csv(
    "EVTrend_MoreRecent.csv",
    index_col=0,
    parse_dates=['date'],
)
print(df_EVTrend.shape)  # shape of the raw file, before columns are dropped

# Keep only the 'Electric vehicle' column and rename it 'EV Trend'.
df_EVTrend = (
    df_EVTrend
    .drop(columns=['Electric vehicle_unscaled', 'Electric vehicle_monthly', 'isPartial', 'scale'])
    .rename(columns={'Electric vehicle': 'EV Trend'})
)
df_EVTrend.head()

(2557, 5)


Unnamed: 0_level_0,EV Trend
date,Unnamed: 1_level_1
2015-01-01,8.14
2015-01-02,14.74
2015-01-03,7.26
2015-01-04,2.86
2015-01-05,9.68


In [57]:
# Google Trends series for the search term "Coronavirus".
df_CoronavirusTrend = pd.read_csv(
    "CoronavirusTrend_MoreRecent.csv",
    index_col=0,
    parse_dates=['date'],
)
print(df_CoronavirusTrend.shape)  # shape of the raw file, before columns are dropped

# Keep only the 'Coronavirus' column and rename it 'Coronavirus Trend'.
df_CoronavirusTrend = (
    df_CoronavirusTrend
    .drop(columns=['Coronavirus_unscaled', 'Coronavirus_monthly', 'isPartial', 'scale'])
    .rename(columns={'Coronavirus': 'Coronavirus Trend'})
)
df_CoronavirusTrend.head()

(2557, 5)


Unnamed: 0_level_0,Coronavirus Trend
date,Unnamed: 1_level_1
2015-01-01,0.0
2015-01-02,0.0
2015-01-03,0.0
2015-01-04,0.0
2015-01-05,0.0


## Preparing Data

### Trimming and merging dataframes

In [58]:
# Combine the TSLA price history with the Google Trends series on the date index.
#
# Each merge is an outer join, so a date missing from any one source shows up
# as NaN in that source's columns. dropna() below then discards every row with
# at least one NaN, leaving only dates that have complete values in all sources.
#
# NOTE: df_teslaIncTrend and df_GM are not merged in this cell. Because of the
# dropna() below, any NaN in an added column removes that whole row.
trend_frames = [
    df_tslaTrend,
    df_muskTrend,
    df_GMTrend,
    df_EVTrend,
    df_CoronavirusTrend,
]

df = df_tsla
for trend_frame in trend_frames:
    df = df.merge(trend_frame, how='outer', left_index=True, right_index=True)

df.index.name = 'Date'
df = df.dropna()

# Calendar features derived from the date index (built once, reused for all four).
dates = pd.DatetimeIndex(df.index)
df = df.assign(**{
    'Month': dates.month,
    'Day of the Month': dates.day,
    'Day of the Week': dates.dayofweek,  # Monday=0 ... Sunday=6
    'Year': dates.year,
})
print(df.shape)
df.head()

(1763, 15)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Tesla Trend,Musk Trend,GM Trend,EV Trend,Coronavirus Trend,Month,Day of the Month,Day of the Week,Year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-02,44.574001,44.650002,42.652,43.862,43.862,23822000.0,7.6,2.35,73.0,14.74,0.0,1,2,4,2015
2015-01-05,42.91,43.299999,41.431999,42.018002,42.018002,26842500.0,6.46,0.9,56.94,9.68,0.0,1,5,0,2015
2015-01-06,42.012001,42.84,40.841999,42.256001,42.256001,31309500.0,8.17,5.0,66.43,11.88,0.0,1,6,1,2015
2015-01-07,42.669998,42.956001,41.956001,42.189999,42.189999,14842000.0,7.79,1.4,62.78,20.46,0.0,1,7,2,2015
2015-01-08,42.562,42.759998,42.001999,42.124001,42.124001,17212500.0,9.31,0.8,64.97,19.14,0.0,1,8,3,2015


In [59]:
# Save the merged feature table so the preprocessing does not have to be
# repeated; the 'Date' index is written out as the first column.
output_csv = 'preprocessedTesla_MoreRecent.csv'
df.to_csv(output_csv)