In [1]:
import pandas as pd
import numpy as np
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error

# For each ticker (company), store all news into a dataframe

In [2]:
def season_dataframe(ticker, season):
    """Load one season's financial news for a ticker into a DataFrame.

    Reads the JSON file at ``./financial_news/<ticker><season>`` and builds
    the frame from the decoded records.
    """
    news_path = './financial_news/' + ticker + str(season)
    with open(news_path, 'r') as news_file:
        records = json.load(news_file)
    return pd.DataFrame(records)

In [3]:
# Build one news dataframe per ticker; df[k] holds the news for ALL_TICKERS[k].
# Alternative list kept for reference (it additionally contains "$VRSN"):
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$VRSN", "$GRMN", "$ANET", "$AAL"]
ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$GRMN", "$ANET", "$AAL"]
# Single-ticker alternative kept for reference:
#ALL_TICKERS = ["$TSLA"]
df = []
for ticker in ALL_TICKERS:
  # Load seasons 1..8 first and concatenate once: growing a frame with
  # pd.concat inside the loop copies all accumulated rows on every
  # iteration and starts from an empty frame, which pandas deprecates.
  season_frames = [season_dataframe(ticker, i) for i in range(1, 9)]
  df.append(pd.concat(season_frames, ignore_index=True))

Usage: df[0] will return $TSLA news from 2019-01-01 to 2020-12-31

# Calculate the polarity of each news item using the nltk.sentiment.vader package

In [4]:
# Shared VADER analyzer, created once and reused for every scored string
vader = SentimentIntensityAnalyzer()

# Helper function, which calculates the sentiment and returns the compound score
def cal_compound(t):
    """Return the "compound" entry of VADER's polarity scores for ``t``."""
    scores = vader.polarity_scores(t)
    return scores["compound"]

In [5]:
# Score both the headline and the body of every article, ticker by ticker
for each_df in df:
  for source_col in ('title', 'text'):
    each_df[source_col + '_compound'] = each_df[source_col].apply(cal_compound)

Convert publishedDate to YYYY-MM-DD

In [6]:
# Helper function: for the date conversion
def remove_time(publish_date):
  """Keep only the first 10 characters of ``publish_date`` (the YYYY-MM-DD part)."""
  date_part = publish_date[:10]
  return date_part

In [7]:
# Truncate every publishedDate to its date part, ticker by ticker
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].map(remove_time)

Move weekend dates to the following Monday

In [8]:
# Helper function: Convert weekend to next Monday
def moveWeekend(publish_date):
  """Shift a ``YYYY-MM-DD`` date that falls on a weekend to the following Monday.

  Saturday moves forward two days and Sunday one day; for any other
  weekday the input string is returned unchanged.
  """
  day = datetime.date(int(publish_date[0:4]), int(publish_date[5:7]), int(publish_date[8:10]))
  # weekday() counts Monday as 0, so 5 is Saturday and 6 is Sunday
  days_to_monday = {5: 2, 6: 1}.get(day.weekday())
  if days_to_monday is None:
    return publish_date
  return (day + datetime.timedelta(days=days_to_monday)).isoformat()

In [9]:
# Roll weekend publication dates forward to Monday, ticker by ticker
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].map(moveWeekend)

# Calculate the daily mean polarity of titles and texts separately

In [10]:
polarity = []
# Calculate the mean on a daily basis: one frame per ticker, one row per publishedDate
for each_df in df:
  # A single groupby yields both daily means; the columns come out as
  # publishedDate, title_compound, text_compound.
  each_polarity = each_df.groupby('publishedDate', as_index=False)[['title_compound', 'text_compound']].mean()
  # Broadcast the ticker as a scalar. Assigning the whole each_df['symbol']
  # column would align the per-article index against the per-day index,
  # which only gives the right answer by coincidence.
  each_polarity['symbol'] = each_df['symbol'].iloc[0]
  polarity.append(each_polarity)

# Merge each day's closing price with the polarity dataframe

In [11]:
# Parsed price CSVs keyed by ticker, so each file is read from disk only once
_CLOSING_PRICE_FRAMES = {}

# Helper function: Fetch closing price by a specific date and ticker name
def fetchClosingPrice(time, ticker):
  """Return the closing price of ``ticker`` on the date ``time``.

  The prices come from ``./stock_price/<ticker>.csv``, which must have
  ``Date`` and ``Close`` columns. The parsed file is cached per ticker,
  because this helper is called once per row and re-reading the CSV on
  every call is wasted work. Edits to the CSV after the first call are
  therefore not picked up until the cache is reset.

  Parameters:
    time: date value to look up in the ``Date`` column.
    ticker: ticker name, used to build the CSV file name.

  Returns:
    The ``Close`` value of the first row whose ``Date`` equals ``time``,
    or 0 when no row matches.
  """
  if ticker not in _CLOSING_PRICE_FRAMES:
    _CLOSING_PRICE_FRAMES[ticker] = pd.read_csv("./stock_price/"+ticker+".csv")
  prices = _CLOSING_PRICE_FRAMES[ticker]
  matches = prices.loc[prices['Date'].isin([time]), 'Close']
  # 0 is the placeholder for "no price on that date"; callers must treat it as missing
  return 0 if matches.empty else matches.iloc[0].item()

In [12]:
# Attach the ticker's own closing price to every daily polarity row
for each_polarity in polarity:
  ticker_symbol = each_polarity['symbol'].iloc[0]
  each_polarity['close_price'] = each_polarity['publishedDate'].apply(
      lambda day: fetchClosingPrice(day, ticker_symbol))

# Add the S&P 500 closing price to each polarity dataframe

In [13]:
# Load the S&P 500 daily price history; the bare name on the last line
# renders the frame as the cell output
sp_df = pd.read_csv(filepath_or_buffer="./stock_price/S&P500.csv")
sp_df

Unnamed: 0,Date,Open,High,Low,Close
0,12-31-2020,3733.27,3760.20,3726.88,3756.07
1,12-30-2020,3736.19,3744.63,3730.21,3732.04
2,12-29-2020,3750.01,3756.12,3723.31,3727.04
3,12-28-2020,3723.03,3740.51,3723.03,3735.36
4,12-24-2020,3694.03,3703.82,3689.32,3703.06
...,...,...,...,...,...
500,01-08-2019,2568.11,2579.82,2547.56,2574.41
501,01-07-2019,2535.61,2566.16,2524.56,2549.69
502,01-04-2019,2474.33,2538.07,2474.33,2531.94
503,01-03-2019,2491.92,2493.14,2443.96,2447.89


In [14]:
def reorder_date(date):
  """Rearrange an ``MM-DD-YYYY`` string into ``YYYY-MM-DD``."""
  year, month_day = date[6:10], date[0:5]
  return '-'.join((year, month_day))

In [15]:
# Derive the YYYY-MM-DD date column, then keep only the two columns used downstream
sp_df = sp_df.assign(publishedDate=sp_df['Date'].apply(reorder_date))[['publishedDate', 'Close']]

In [16]:
# Reverse the row order (the CSV preview shows the newest date first)
sp_df = sp_df[::-1]

In [17]:
# Give the index's closing price a distinct column name before merging
sp_df = sp_df.rename(columns={'Close': 's&p500_close_price'})

In [18]:
# Make sure the closing price is numeric, downcast to the smallest float dtype
sp_close_numeric = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df = sp_df.assign(**{'s&p500_close_price': sp_close_numeric})

In [20]:
# Join the S&P 500 close onto every ticker's daily polarity frame by date
for position, ticker_polarity in enumerate(polarity):
  polarity[position] = ticker_polarity.merge(sp_df, on="publishedDate")


# Training and Testing Step

In [22]:
# For every ticker: fit a random forest on the daily sentiment means and the
# S&P 500 close, predict the ticker's closing price on a held-out split, and
# write <symbol>.summary.csv and <symbol>.output.csv.
for each_polarity in polarity:
  ticker_symbol = each_polarity.symbol.iloc[0]

  # NOTE(review): close_price is 0 wherever fetchClosingPrice found no row for
  # the date; consider filtering those rows out before training.
  y = each_polarity['close_price']
  X = each_polarity.drop(columns=['close_price','publishedDate','symbol'])
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

  # The split shuffles rows, so the dates that belong to the predictions must
  # be looked up through the row labels kept by y_test. Indexing the full date
  # column by position would pair predictions with the wrong dates.
  out_date = each_polarity.loc[y_test.index, 'publishedDate'].to_numpy()

  # Fit the scaler on the training rows only, then apply it to both splits
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  # apply RandomForestRegressor (seeded so re-running the cell reproduces the files)
  rf = RandomForestRegressor(n_estimators=150, random_state=0)
  rf.fit(X_train, y_train)
  y_pred = rf.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)

  # Last row of the correlation matrix is the target against each feature;
  # report the largest of those coefficients.
  corr = np.corrcoef(X_train, y_train, rowvar=False)[-1, :-1]
  corr = corr.max()

  with open(ticker_symbol+".summary.csv", "w") as f:
    f.write("MSE,correlation\n")
    f.write("{},{:.2f}\n".format(mse, corr))

  with open(ticker_symbol+".output.csv", "w") as f:
    f.write("date,predicted_closing_price\n")
    for test_date, predicted_close in zip(out_date, y_pred):
      f.write("{},{}\n".format(test_date, predicted_close))