In [64]:
import pandas as pd
import numpy as np
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error

# For each ticker (company), store all news into a dataframe

In [65]:
# Return one season's financial news dataframe
def season_dataframe(ticker, season):
    with open('./financial_news_long_period/'+ticker+str(season), 'r') as ticker_news:
        content  = ticker_news.read()
    records = json.loads(content)
    df = pd.DataFrame(records)
    return df

In [66]:
# Create dataframe for each ticker, storing its news
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$VRSN", "$GRMN", "$ANET", "$AAL"]
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$GRMN", "$ANET", "$AAL"]
ALL_TICKERS = ["$TSLA", "$INTC", "$PFE", "$SPGI", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$GRMN", "$AAL", "$ANET"]
df = []
for ticker in ALL_TICKERS:
  # Concatenate 8 seasons into one dataframe
  df_temp = pd.DataFrame()
  for i in range(1, 13):
      new_df = season_dataframe(ticker, i)
      df_temp = pd.concat([df_temp, new_df], ignore_index=True)
  df.append(df_temp)

In [67]:
df[6]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url
0,TWTR,2020-12-29 09:30:00,Does Twitter Have a User Growth Problem?,https://cdn.snapi.dev/images/v1/4/p/urlhttps3a...,The Motley Fool,"Near-term user growth may be disappointing, bu...",https://www.fool.com/investing/2020/12/29/does...
1,TWTR,2020-12-29 08:55:08,The source behind the New York Post's dubious ...,https://cdn.snapi.dev/images/v1/t/h/the-source...,Business Insider,The laptop repairman at the center of the New ...,https://www.businessinsider.com/hunter-biden-l...
2,TWTR,2020-12-29 08:29:48,"Apple, Boeing, Nike, Airbnb, DoorDash and More...",https://cdn.snapi.dev/images/v1/s/t/stocks1---...,24/7 Wall Street,"This has been the year of the virus, and like ...",https://247wallst.com/investing/2020/12/29/app...
3,TWTR,2020-12-28 21:00:47,Twitter Sued By Owner Of Repair Shop Who Claim...,https://cdn.snapi.dev/images/v1/t/w/twitter-lo...,Deadline,The owner of a Mac repair shop filed a $500 mi...,https://deadline.com/2020/12/twitter-hunter-bi...
4,TWTR,2020-12-28 14:20:05,Twitter: Periscope Debacle,https://cdn.snapi.dev/images/v1/j/v/fbt2-2.jpg,Seeking Alpha,Twitter plans to shut down Periscope at the en...,https://seekingalpha.com/article/4396497-twitt...
...,...,...,...,...,...,...,...
1857,TWTR,2018-02-28 19:00:00,Jack Dorsey Looks to Tackle Twitter's Toxicity,https://cdn.snapi.dev/images/v1/w/g/wgnuhklea2...,Bloomberg Technology,Bloomberg's Selina Wang discusses Twitter CEO ...,https://www.youtube.com/watch?v=WgNUHklEa20
1858,TWTR,2018-02-22 19:00:00,Twitter Exec Laura Froelich On The Future Of S...,https://cdn.snapi.dev/images/v1/d/s/dsceqz0iwp...,CNBC,"Laura Froelich, Twitter's global director of s...",https://www.youtube.com/watch?v=DsCEQZ0iwPw
1859,TWTR,2018-02-20 19:00:00,Conservative Twitter Users Accuse Company of Bias,https://cdn.snapi.dev/images/v1/b/2/b2xsb1d4fz...,Bloomberg Technology,Bloomberg's Julie Verhage and Sarah Frier disc...,https://www.youtube.com/watch?v=B2xsB1d4FZE
1860,TWTR,2018-02-19 19:00:00,Pressure's on Tech Companies to Tackle Russian...,https://cdn.snapi.dev/images/v1/b/a/bas1axl9o6...,Bloomberg Technology,"Tara Maller, senior policy advisor at the Coun...",https://www.youtube.com/watch?v=BAS1AXl9O64


Usage: df[0] will return $TSLA news from 2018-01-01 to 2020-12-31

# Calculate Polarity of each news, using nltk.sentiment.vader package

In [68]:
# Calculate vader
vader = SentimentIntensityAnalyzer()

# Helper function, which calculates the sentiment and returns compund score
def cal_compound(t):
    return vader.polarity_scores(t)["compound"]

In [69]:
for each_df in df:
  each_df['title_compound'] = each_df['title'].apply(cal_compound)
  each_df['text_compound'] = each_df['text'].apply(cal_compound)

Convert publishedDate to YYYY-MM-DD

In [70]:
# Helper function: for the date conversion
def remove_time(publish_date):
  return publish_date[0:10]

In [71]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(remove_time)

Convert weekend to next Monday

In [72]:
# Helper function: Convert weekend to next Monday
def moveWeekend(publish_date):
  d = datetime.datetime(int(publish_date[0:4]), int(publish_date[5:7]), int(publish_date[8:10]))
  if (d.weekday() == 5):
    return str(d+datetime.timedelta(days=2))[0:10]
  elif (d.weekday() == 6):
    return str(d+datetime.timedelta(days=1))[0:10]
  else:
    return publish_date

In [73]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(moveWeekend)

In [74]:
df[6]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url,title_compound,text_compound
0,TWTR,2020-12-29,Does Twitter Have a User Growth Problem?,https://cdn.snapi.dev/images/v1/4/p/urlhttps3a...,The Motley Fool,"Near-term user growth may be disappointing, bu...",https://www.fool.com/investing/2020/12/29/does...,-0.0258,0.5023
1,TWTR,2020-12-29,The source behind the New York Post's dubious ...,https://cdn.snapi.dev/images/v1/t/h/the-source...,Business Insider,The laptop repairman at the center of the New ...,https://www.businessinsider.com/hunter-biden-l...,-0.5574,-0.6808
2,TWTR,2020-12-29,"Apple, Boeing, Nike, Airbnb, DoorDash and More...",https://cdn.snapi.dev/images/v1/s/t/stocks1---...,24/7 Wall Street,"This has been the year of the virus, and like ...",https://247wallst.com/investing/2020/12/29/app...,0.0000,-0.2500
3,TWTR,2020-12-28,Twitter Sued By Owner Of Repair Shop Who Claim...,https://cdn.snapi.dev/images/v1/t/w/twitter-lo...,Deadline,The owner of a Mac repair shop filed a $500 mi...,https://deadline.com/2020/12/twitter-hunter-bi...,0.0000,-0.5574
4,TWTR,2020-12-28,Twitter: Periscope Debacle,https://cdn.snapi.dev/images/v1/j/v/fbt2-2.jpg,Seeking Alpha,Twitter plans to shut down Periscope at the en...,https://seekingalpha.com/article/4396497-twitt...,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...
1857,TWTR,2018-02-28,Jack Dorsey Looks to Tackle Twitter's Toxicity,https://cdn.snapi.dev/images/v1/w/g/wgnuhklea2...,Bloomberg Technology,Bloomberg's Selina Wang discusses Twitter CEO ...,https://www.youtube.com/watch?v=WgNUHklEa20,0.0000,0.4404
1858,TWTR,2018-02-22,Twitter Exec Laura Froelich On The Future Of S...,https://cdn.snapi.dev/images/v1/d/s/dsceqz0iwp...,CNBC,"Laura Froelich, Twitter's global director of s...",https://www.youtube.com/watch?v=DsCEQZ0iwPw,0.0000,0.0000
1859,TWTR,2018-02-20,Conservative Twitter Users Accuse Company of Bias,https://cdn.snapi.dev/images/v1/b/2/b2xsb1d4fz...,Bloomberg Technology,Bloomberg's Julie Verhage and Sarah Frier disc...,https://www.youtube.com/watch?v=B2xsB1d4FZE,-0.2960,-0.7430
1860,TWTR,2018-02-19,Pressure's on Tech Companies to Tackle Russian...,https://cdn.snapi.dev/images/v1/b/a/bas1axl9o6...,Bloomberg Technology,"Tara Maller, senior policy advisor at the Coun...",https://www.youtube.com/watch?v=BAS1AXl9O64,0.0000,-0.4404


## Save news to files (need to run once)

In [12]:
# for each_df in df:
#   each_df.to_csv(each_df.symbol[0]+'.csv', index=False)

# Calculate title and text's daily polarity mean respectively

In [75]:
polarity = []
# Calcuate mean on daily basis
for each_df in df:
  each_title_mean = each_df.groupby('publishedDate', as_index=False)['title_compound'].mean()
  each_text_mean = each_df.groupby('publishedDate', as_index=False)['text_compound'].mean()
  each_polarity = pd.merge(each_title_mean, each_text_mean, on='publishedDate')
  each_polarity['symbol'] = each_df['symbol']
  each_polarity['title'] = each_df['title']
  each_polarity['text'] = each_df['text']
  polarity.append(each_polarity)

In [76]:
polarity[0]

Unnamed: 0,publishedDate,title_compound,text_compound,symbol,title,text
0,2018-12-24,0.177900,0.000000,TSLA,Tesla to deliver China-made Model Y SUVs this ...,Tesla Inc said on Friday it has started sellin...
1,2018-12-25,0.421500,0.296000,TSLA,2020: Several Chinese Stocks Outperformed Thei...,2020: Several Chinese Stocks Outperformed Thei...
2,2018-12-26,0.000000,0.296000,TSLA,EV Company News For The Month Of December 2020,Global electric car sales records for November...
3,2018-12-27,0.296000,-0.153100,TSLA,"Tesla, Volkswagen, Renault See Strong Share In...",The European market continues to see strong ad...
4,2019-01-01,-0.226300,-0.296000,TSLA,Tech's top seven companies added $3.4 trillion...,"Big Tech got much bigger in 2020, and Tesla jo..."
...,...,...,...,...,...,...
433,2020-12-25,0.148000,0.509500,TSLA,"Tesla Leads EV Race, But These China Tech Upst...","Chinese electric vehicle stocks Nio, Li Auto, ..."
434,2020-12-28,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...
435,2020-12-29,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...
436,2020-12-30,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...


# Merge everyday's closing price with polarity dataframe, but show everyday's closing price

In [77]:
# Merge everyday's closing price with polarity dataframe
index = 0
for ticker in ALL_TICKERS:
  data = pd.read_csv("./stock_price/with_7day_ma_and7days_prediction/compare_previous_day/"+ticker[1:]+".csv")
  data = data[['Date', 'Close']]
  data['publishedDate'] = data['Date']
  data = data[['publishedDate', 'Close']]
  polarity[index] = pd.merge(data, polarity[index], on = 'publishedDate', how = 'left').fillna(0)
  index+=1


In [78]:
# Fill symbol column for each company
for index, each_polarity in enumerate(polarity):
  each_polarity['symbol'] = ALL_TICKERS[index][1:]

In [79]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0
...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...


# Todo: fill title compound/text compound with 1.average all the time 2. average quarter compound etc... (check notes) !!!!!!!

## Calculate average compound for each company (for ALL time)

In [23]:
# Calculate average title compound for each company (for all time)
whole_period_title_compound_average = []
for each_polarity in polarity:
  whole_period_title_compound_average.append(each_polarity[each_polarity["title_compound"]!=0]["title_compound"].mean())

In [24]:
whole_period_title_compound_average

[0.03323769566195808,
 0.12931403181734172,
 0.13340494636969674,
 0.24973572068511193,
 0.2517968560059469,
 0.29102757731958745,
 0.016465804248072386,
 0.09800193358398342,
 0.2738122164502165,
 0.04089187715081693,
 0.17621849109224108]

In [25]:
# Calculate average text compound for each company (for all time)
whole_period_text_compound_average = []
for each_polarity in polarity:
  whole_period_text_compound_average.append(each_polarity[each_polarity["text_compound"]!=0]["text_compound"].mean())

In [26]:
whole_period_text_compound_average

[0.10407983754656298,
 0.20538976545816587,
 0.2530589825946846,
 0.4098589660277388,
 0.3595158229854941,
 0.38954471624266146,
 0.12247267760826815,
 0.21920362665391524,
 0.43225930985680994,
 0.09433116344041344,
 0.2495956969696969]

In [34]:
# Replace 0.00000 value with the average compound value
i = 0
for each_polarity in polarity:
  each_polarity_title_compound = each_polarity['title_compound']
  each_polarity_title_compound = each_polarity_title_compound.mask(each_polarity_title_compound==0).fillna(whole_period_title_compound_average[i])
  each_polarity['title_compound'] = each_polarity_title_compound

  each_polarity_text_compound = each_polarity['text_compound']
  each_polarity_text_compound = each_polarity_text_compound.mask(each_polarity_text_compound==0).fillna(whole_period_text_compound_average[i])
  each_polarity['text_compound'] = each_polarity_text_compound
  i+=1

In [42]:
polarity[1]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text,Close_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,s&p500_close_price
0,2018-01-02,46.849998,0.000000,0.000000,INTC,0,0,0.170058,0.0,0.170058,0.170058,0.170058,0.170058,0.170058,0.170058,2695.810059
1,2018-01-03,45.259998,0.000000,0.000000,INTC,0,0,0.109021,0.0,0.170058,0.170058,0.170058,0.170058,0.170058,0.170058,2713.060059
2,2018-01-04,44.430000,0.000000,0.000000,INTC,0,0,0.077159,0.0,0.109021,0.170058,0.109021,0.109021,0.109021,0.109021,2723.989990
3,2018-01-05,44.740002,0.000000,0.000000,INTC,0,0,0.089060,0.0,0.077159,0.109021,0.170058,0.077159,0.077159,0.077159,2743.149902
4,2018-01-08,44.740002,0.000000,0.000000,INTC,0,0,0.089060,0.0,0.089060,0.077159,0.109021,0.170058,0.089060,0.089060,2747.709961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2020-12-24,47.070000,0.000000,0.000000,INTC,0,0,0.178503,0.0,0.159309,0.143954,0.151248,0.193474,0.315931,0.333973,3703.060059
752,2020-12-28,47.070000,0.130867,-0.035617,INTC,Intel Needs a New CEO to Remain Relevant,INTC stock won't be great again until CEO Robe...,0.178503,0.0,0.178503,0.159309,0.143954,0.151248,0.193474,0.315931,3735.360107
753,2020-12-29,49.389999,0.124417,0.166725,INTC,INTC CLASS ACTION NOTICE: Glancy Prongay & Mur...,LOS ANGELES--(BUSINESS WIRE)--Investors are he...,0.267562,0.0,0.178503,0.178503,0.159309,0.143954,0.151248,0.193474,3727.040039
754,2020-12-30,48.750000,0.055848,0.221378,INTC,Will AMD benefit from Intel's pain?,CNBC's Josh Lipton on AMD earnings and whether...,0.242994,1.0,0.267562,0.178503,0.178503,0.159309,0.143954,0.151248,3732.040039


## Calculate average compound for each company (quarterly) UNFINISHED

In [80]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0
...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...


In [81]:
a=polarity[0]
a

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0
...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...


In [62]:
a.at[0, 'publishedDate'] = "hello"
a

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text
0,hello,64.106003,0.000000,0.000000,TSLA,0,0
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0
...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...


In [34]:
# for each_polarity in polarity:
for year in range(2018, 2021):
  for month in (1, 13):
    fill = []
    sum_title_compound = 0
    sum_text_compound = 0
    for index, row in a.iterrows():
      sum_title_compound = sum_title_compound + row['title_compound']
      sum_text_compound = sum_text_compound + row['text_compound']
      if (row['title_compound'] == 0):
        fill.append(index)
    avg_title_compound = sum_title_compound / 20
    avg_text_compound = sum_text_compound / 20
    for f in fill:
      a[]

publishedDate     2018-01-02
Close                 64.106
title_compound             0
text_compound              0
symbol                  TSLA
title                      0
text                       0
Name: 0, dtype: object
0


# Apply MinMax scaler to Close price

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
for each_polarity in polarity:
  close = each_polarity['Close']
  scaler = MinMaxScaler(feature_range=(0,1))
  close = scaler.fit_transform(np.array(close).reshape(-1,1))
  each_polarity['Close_scaler'] = close

In [20]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text,Close_scaler
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0,0.042265
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0,0.041285
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0,0.040500
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0,0.041085
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0,0.047006
...,...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...,0.934466
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...,0.937332
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...,0.940765
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...,0.983743


# Merge everyday's indicator with polarity dataframe (7 days)

In [21]:
for each_polarity in polarity:
  data = pd.read_csv("./stock_price/with_7day_ma_and7days_prediction/compare_previous_day_7/"+each_polarity.symbol.loc[0]+".csv")
  each_polarity['indicator'] = data['indicator']

In [22]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text,Close_scaler,indicator
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0,0.042265,0.0
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0,0.041285,0.0
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0,0.040500,0.0
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0,0.041085,0.0
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0,0.047006,0.0
...,...,...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...,0.934466,1.0
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...,0.937332,1.0
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...,0.940765,1.0
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...,0.983743,0.0


# Add previous 7 day's scaler to each row

In [23]:
for each_polarity in polarity:
  # Add scaler to first row
  for i in range(0, 6):
    name="scaler-"+str(i+1)
    each_polarity.loc[0, name] = each_polarity.loc[0, 'Close_scaler']
  # Add scaler to the remaining rows
  for i in range(1, len(each_polarity)):
    for s in range(0, 6):
      name="scaler-"+str(s+1)
      if (i-(s+1) >= 0):
        each_polarity.loc[i, name] = each_polarity.loc[i-(s+1), 'Close_scaler']
      else:
        each_polarity.loc[i, name] = each_polarity.loc[i-1, 'Close_scaler']

In [24]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text,Close_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6
0,2018-01-02,64.106003,0.000000,0.000000,TSLA,0,0,0.042265,0.0,0.042265,0.042265,0.042265,0.042265,0.042265,0.042265
1,2018-01-03,63.450001,0.000000,0.000000,TSLA,0,0,0.041285,0.0,0.042265,0.042265,0.042265,0.042265,0.042265,0.042265
2,2018-01-04,62.924000,0.000000,0.000000,TSLA,0,0,0.040500,0.0,0.041285,0.042265,0.041285,0.041285,0.041285,0.041285
3,2018-01-05,63.316002,0.000000,0.000000,TSLA,0,0,0.041085,0.0,0.040500,0.041285,0.042265,0.040500,0.040500,0.040500
4,2018-01-08,67.281998,0.000000,0.000000,TSLA,0,0,0.047006,0.0,0.041085,0.040500,0.041285,0.042265,0.041085,0.041085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,2020-12-24,661.770020,0.158990,0.276630,TSLA,Tesla files to sell $5B in stock while its sha...,Tesla is striking while its share price — and ...,0.934466,1.0,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246
752,2020-12-28,663.690002,0.081800,0.378638,TSLA,7 Top Electric Vehicle Stocks to Buy for 2021,Electric vehicle stocks have been storming hig...,0.937332,1.0,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703
753,2020-12-29,665.989990,0.209004,0.202939,TSLA,Tesla retreats from record highs after announc...,Tesla shares retreated from all-time-highs on ...,0.940765,1.0,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072
754,2020-12-30,694.780029,0.209600,0.323291,TSLA,Tesla's $5B stock offering taps skyrocketing v...,Tesla is capitalizing on a soaring share price...,0.983743,0.0,0.940765,0.937332,0.934466,0.910894,0.902475,0.916686


# Add S&P 500 Index

In [25]:
sp_df = pd.read_csv("./stock_price/with_7day_ma_and7days_prediction/original/S&P500.csv")
def reorder_date(date):
  return date[6:10]+'-'+date[0:5]
sp_df['publishedDate'] = sp_df['Date'].apply(reorder_date)
sp_df = sp_df[['publishedDate', 'Close']]
sp_df = sp_df.iloc[::-1]
sp_df = sp_df.rename(columns= {'Close': "s&p500_close_price"}, inplace=False)
# Convert str to float
sp_df['s&p500_close_price'] = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df


Unnamed: 0,publishedDate,s&p500_close_price
755,2018-01-02,2695.810059
754,2018-01-03,2713.060059
753,2018-01-04,2723.989990
752,2018-01-05,2743.149902
751,2018-01-08,2747.709961
...,...,...
4,2020-12-24,3703.060059
3,2020-12-28,3735.360107
2,2020-12-29,3727.040039
1,2020-12-30,3732.040039


In [26]:
# Merge with polarity dataframe
for index, ep in enumerate(polarity):
  polarity[index] = pd.merge(ep, sp_df, on="publishedDate")

In [27]:
# NO NEED TO RUN
# a = polarity[1]
# a.drop(columns=['scaler-7', 'scaler-6']).tail(50)

In [27]:
polarity[1].tail(50)

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,title,text,Close_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,s&p500_close_price
706,2020-10-21,53.5,0.179068,0.232605,INTC,SK hynix could buy Intel NAND business – Block...,"The WSJ is reporting Korean DRAM, NAND and SSD...",0.425336,0.0,0.422649,0.466795,0.450672,0.438772,0.427255,0.438004,3435.560059
707,2020-10-22,53.900002,0.026352,0.14519,INTC,Intel and partners announce high-performance S...,Intel has been a leader in Ethernet networking...,0.440691,1.0,0.425336,0.422649,0.466795,0.450672,0.438772,0.427255,3453.48999
708,2020-10-23,48.200001,-0.015557,-0.015129,INTC,Intel Nears Deal to Sell NAND Memory Unit to S...,,0.221881,0.0,0.440691,0.425336,0.422649,0.466795,0.450672,0.438772,3465.389893
709,2020-10-26,46.720001,-0.16588,0.25834,INTC,Intel : Nears Deal to Sell NAND Memory Unit to...,,0.165067,0.0,0.221881,0.440691,0.425336,0.422649,0.466795,0.450672,3400.969971
710,2020-10-27,45.639999,-0.1865,0.204133,INTC,Intel's stock spikes into the green after WSJ ...,Shares of Intel Corp. undefined swung higher i...,0.123608,0.0,0.165067,0.221881,0.440691,0.425336,0.422649,0.466795,3390.679932
711,2020-10-28,44.25,-0.01015,0.145175,INTC,WSJ News Exclusive | Intel Nears Deal to Sell ...,The U.S. semiconductor giant is nearing a deal...,0.07025,0.0,0.123608,0.165067,0.221881,0.440691,0.425336,0.422649,3271.030029
712,2020-10-29,44.110001,0.210612,0.325688,INTC,Intel's Shares May Jump After Results Despite ...,Intel's stock may finally be recovering after ...,0.064875,0.0,0.07025,0.123608,0.165067,0.221881,0.440691,0.425336,3310.110107
713,2020-10-30,44.279999,0.4215,0.0,INTC,McAfee IPO: 5 things to know about the securit...,McAfee Corp. plans to return to the public mar...,0.071401,0.0,0.064875,0.07025,0.123608,0.165067,0.221881,0.440691,3269.959961
714,2020-11-02,44.459999,0.02254,0.12384,INTC,How Intel Malaysia Helped Usher in a New Era o...,"In this 3-part blog series, discover how Intel...",0.078311,0.0,0.071401,0.064875,0.07025,0.123608,0.165067,0.221881,3310.23999
715,2020-11-03,44.849998,0.0,0.0,INTC,Intel vs. AMD In Lenovo's IdeaPad Slim 7: A Cl...,We tested Intel Core vs. AMD Ryzen processors ...,0.093282,0.0,0.078311,0.071401,0.064875,0.07025,0.123608,0.165067,3369.159912


# Predict the trend using MLPClassifier model

In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier




In [29]:
@ignore_warnings(category=ConvergenceWarning)
def predict(polarity):
  avg_accuracy = 0
  for each_polarity in polarity:
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator', 'publishedDate', 'symbol', 'title', 'text', 'Close'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=False)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # apply Random Forest Classifier
    """ out_date = each_polarity.publishedDate.values
    nn = RandomForestClassifier(n_estimators=200,criterion='entropy') """

    # apply passive aggressive classifier
    out_date = each_polarity.publishedDate.values
    nn = PassiveAggressiveClassifier()

    nn.fit(X_train, y_train)
    #rf.fit(X_train, y_train)
    y_pred = nn.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    avg_accuracy+=accuracy_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f = open(each_polarity.symbol.iloc[0]+".summaryRandomForest.csv", "w")
    f.write("accuracy\n")
    f.write("{:.2f}\n".format(accuracy))
    f.close()

    f = open(each_polarity.symbol.iloc[0]+".outputRandomForest.csv", "w")
    f.write("date,predicted_indicator\n")
    dd = 506
    for i in range(y_pred.shape[0]):
      f.write("{},{}\n".format(out_date[dd], y_pred[i]))
      dd+=1
    f.close()
  print("AVERAGE ACCURACY: "+str(avg_accuracy/11))

In [31]:
# split train and test data in consecutive period
@ignore_warnings(category=ConvergenceWarning)
def predict_test(polarity):
  avg_accuracy = 0
  for each_polarity in polarity:
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator', 'symbol', 'title', 'text'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=False)
    # print(len(X_test))
    # print(len(X_train))
    print(X_test)
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # X_train = scaler.transform(X_train)
    # X_test = scaler.transform(X_test)

    # # apply Random Forest Classifier
    # """ out_date = each_polarity.publishedDate.values
    # nn = RandomForestClassifier(n_estimators=200,criterion='entropy') """

    # # apply passive aggressive classifier
    # out_date = each_polarity.publishedDate.values
    # nn = PassiveAggressiveClassifier()

    # nn.fit(X_train, y_train)
    # #rf.fit(X_train, y_train)
    # y_pred = nn.predict(X_test)
    # print(accuracy_score(y_test, y_pred))
    # avg_accuracy+=accuracy_score(y_test, y_pred)
    # accuracy = accuracy_score(y_test, y_pred)
    # f = open(each_polarity.symbol.iloc[0]+".summaryRandomForest.csv", "w")
    # f.write("accuracy\n")
    # f.write("{:.2f}\n".format(accuracy))
    # f.close()

    # f = open(each_polarity.symbol.iloc[0]+".outputRandomForest.csv", "w")
    # f.write("date,predicted_indicator\n")
    # for i in range(y_pred.shape[0]):
    #   f.write("{},{}\n".format(out_date[i], y_pred[i]))
    # f.close()
    break
  print("AVERAGE ACCURACY: "+str(avg_accuracy/11))

In [32]:
len(polarity)

11

In [33]:
""" i = 0
for each_polarity in polarity:
    each_polarity = each_polarity.drop('title', axis=1)
    each_polarity = each_polarity.drop('text', axis=1)
    polarity[i] = each_polarity
    i+=1 """

# i = 0
# for each_polarity in polarity:
#     each_polarity.fillna(' ')
#     polarity[i] = each_polarity
#     i+=1

" i = 0\nfor each_polarity in polarity:\n    each_polarity = each_polarity.drop('title', axis=1)\n    each_polarity = each_polarity.drop('text', axis=1)\n    polarity[i] = each_polarity\n    i+=1 "

In [34]:
# i=0
# for each_polarity in polarity:
#     each_polarity = each_polarity.drop('scaler-7', axis=1)
#     polarity[i] = each_polarity
#     i+=1

In [35]:
# polarity[0]

In [36]:
# polarity[1].indicator.to_csv('testesttest.txt', index=False)


In [37]:
# predict_test(polarity)

In [41]:
predict(polarity)

0.732
0.736
0.836
0.624
0.812
0.66
0.72
0.888
0.712
0.672
0.648
AVERAGE ACCURACY: 0.7309090909090908
