In [1]:
import pandas as pd
import numpy as np
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error

In [2]:
def season_dataframe(ticker, season):
    """Load one season of financial news for a ticker into a DataFrame.

    Reads the JSON file ``./financial_news/<ticker><season>`` (path is relative
    to the current working directory) and builds a DataFrame from the parsed
    records.

    Parameters
    ----------
    ticker : str
        Ticker exactly as used in the file name, e.g. ``"$CARR"``.
    season : int or str
        Season suffix appended to the ticker; converted with ``str()``.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame(records)`` for the parsed JSON content.

    Raises
    ------
    FileNotFoundError
        If the season file does not exist.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    path = './financial_news/' + ticker + str(season)
    # The article text contains non-ASCII characters (curly quotes etc.), so do
    # not rely on the platform's default encoding when decoding the file.
    with open(path, 'r', encoding='utf-8') as ticker_news:
        records = json.load(ticker_news)
    return pd.DataFrame(records)

In [None]:
# Testing CARR
# Load the 8 seasonal frames first and concatenate them once. Growing a frame
# with pd.concat inside the loop re-copies all earlier rows on every pass and
# starts from an empty DataFrame, which pandas treats as a special case.
carr_seasons = [season_dataframe("$CARR", i) for i in range(1, 9)]
df_carr = pd.concat(carr_seasons, ignore_index=True)
df_carr

In [9]:
# Predict ALL
ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$VRSN", "$GRMN", "$ANET", "$AAL"]
# `df` is a list of DataFrames (one per ticker, in ALL_TICKERS order), not a single frame.
df = []
for ticker in ALL_TICKERS:
  # Load the 8 seasonal frames and concatenate once per ticker instead of
  # re-concatenating an ever-growing frame inside an inner loop.
  seasons = [season_dataframe(ticker, i) for i in range(1, 9)]
  df.append(pd.concat(seasons, ignore_index=True))

In [27]:
# Inspect the first entry of `df`: the combined frame for ALL_TICKERS[0] ("$TSLA")
df[0]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url
0,TSLA,2020-12-31 23:04:00,Tesla to deliver China-made Model Y SUVs this ...,https://cdn.snapi.dev/images/v1/5/m/m02d202101...,Reuters,Tesla Inc said on Friday it has started sellin...,https://www.reuters.com/article/us-tesla-china...
1,TSLA,2020-12-31 21:44:01,2020: Several Chinese Stocks Outperformed Thei...,https://cdn.snapi.dev/images/v1/f/j/catalog-ma...,Seeking Alpha,2020: Several Chinese Stocks Outperformed Thei...,https://seekingalpha.com/article/4396892-2020-...
2,TSLA,2020-12-31 16:39:55,EV Company News For The Month Of December 2020,https://cdn.snapi.dev/images/v1/l/r/sssik22-c5...,Seeking Alpha,Global electric car sales records for November...,https://seekingalpha.com/article/4396884-ev-co...
3,TSLA,2020-12-31 16:34:55,"Tesla, Volkswagen, Renault See Strong Share In...",https://cdn.snapi.dev/images/v1/v/x/s3xy-14.jpg,Benzinga,The European market continues to see strong ad...,https://www.benzinga.com/news/20/12/18973120/t...
4,TSLA,2020-12-31 16:29:13,Tech's top seven companies added $3.4 trillion...,https://cdn.snapi.dev/images/v1/s/t/stocks23-1...,CNBC,"Big Tech got much bigger in 2020, and Tesla jo...",https://www.cnbc.com/2020/12/31/techs-top-seve...
...,...,...,...,...,...,...,...
5817,TSLA,2019-01-18 19:00:00,Behind Elon Musk's Hiring and Firing Spree,https://cdn.snapi.dev/images/v1/v/i/viwtqd6wkp...,Bloomberg Technology,Elon Musk is cutting Tesla Inc.'s workforce by...,https://www.youtube.com/watch?v=ViwTqD6WKpA
5818,TSLA,2019-01-17 19:00:00,Tesla to cut full time workforce by roughly 7%...,https://cdn.snapi.dev/images/v1/t/8/t8cnbozbfx...,CNBC Television,The Wall Street Journal is reporting that Tesl...,https://www.youtube.com/watch?v=t8CNBOZBFXc
5819,TSLA,2019-01-06 19:00:00,Cramer: New Tesla factory in Shanghai will wor...,https://cdn.snapi.dev/images/v1/w/g/wgegqwogy8...,CNBC Television,CNBC's Jim Cramer discusses his take on the la...,https://www.youtube.com/watch?v=WGEgQWogY8E
5820,TSLA,2019-01-03 19:00:00,Tesla stock drops over missed delivery estimat...,https://cdn.snapi.dev/images/v1/b/e/bebskx74-2...,Fox Business,“Bulls & Bears” panel discusses how Tesla shar...,https://www.youtube.com/watch?v=Bebskx74-2Y


In [None]:
# Polarity
# One shared VADER analyzer instance; cal_compound reads it as a global.
# NOTE(review): NLTK presumably needs the `vader_lexicon` resource to be downloaded
# before this constructor succeeds — confirm on a fresh environment.

vader = SentimentIntensityAnalyzer()

In [None]:
def cal_compound(t):
    """Return the VADER ``compound`` sentiment score for the text ``t``.

    Uses the module-level ``vader`` analyzer and picks the ``"compound"``
    entry out of the scores it returns.
    """
    scores = vader.polarity_scores(t)
    return scores["compound"]

In [None]:
# Score the headline and the body of every article; the results land in
# the new columns title_compound and text_compound.
for source_column in ('title', 'text'):
    df_carr[source_column + '_compound'] = df_carr[source_column].apply(cal_compound)

In [None]:
# Show df_carr, which now carries the title_compound and text_compound columns
df_carr

In [None]:
# Group by date

In [None]:
def remove_time(publish_date):
    """Return the first ten characters of ``publish_date``.

    For a ``YYYY-MM-DD HH:MM:SS`` string this is the ``YYYY-MM-DD`` part;
    shorter strings come back unchanged.
    """
    date_part = publish_date[:10]
    return date_part

In [None]:
# Keep only the calendar date of each article's publishedDate value
date_only = df_carr['publishedDate'].apply(remove_time)
df_carr['publishedDate'] = date_only

In [None]:
# Show df_carr after publishedDate was cut down to its first ten characters
df_carr

In [None]:
# First 20 rows of df_carr before weekend dates are moved
df_carr.head(20)

In [None]:
def moveWeekend(publish_date):
    """Shift a weekend date to the following Monday.

    ``publish_date`` is a string that starts with ``YYYY-MM-DD``. A Saturday
    or Sunday comes back as the next Monday's ``YYYY-MM-DD``; any other day
    is returned exactly as it was passed in.
    """
    year, month, day = publish_date[0:4], publish_date[5:7], publish_date[8:10]
    parsed = datetime.datetime(int(year), int(month), int(day))
    # weekday(): Monday is 0, so 5 is Saturday and 6 is Sunday
    days_to_monday = {5: 2, 6: 1}.get(parsed.weekday())
    if days_to_monday is None:
        return publish_date
    monday = parsed + datetime.timedelta(days=days_to_monday)
    return monday.date().isoformat()
  

In [None]:
# Re-date articles published on a Saturday or Sunday to the following Monday,
# so every publishedDate falls on a weekday
shifted_dates = df_carr['publishedDate'].apply(moveWeekend)
df_carr['publishedDate'] = shifted_dates

In [None]:
# First 20 rows of df_carr after weekend dates were moved to Monday
df_carr.head(20)

In [None]:
# Average the sentiment scores of all articles that share a publishedDate
articles_by_date = df_carr.groupby('publishedDate', as_index=False)
title_mean = articles_by_date['title_compound'].mean()
text_mean = articles_by_date['text_compound'].mean()

In [None]:
# Join the two daily averages into one frame keyed by publishedDate
polarity = title_mean.merge(text_mean, on='publishedDate')

In [None]:
# Daily mean title/text compound scores, one row per publishedDate
polarity

In [None]:
# Tag every daily row with the ticker symbol. polarity has one row per date
# while df_carr has one row per article, so copying the column across would
# pair rows purely by index position; broadcast the single symbol instead.
symbols = df_carr['symbol'].dropna().unique()
assert len(symbols) == 1, "expected df_carr to hold exactly one ticker symbol"
polarity['symbol'] = symbols[0]
polarity

# Merge each day's closing price with the polarity dataframe

In [None]:
# Price files already read in this session, keyed by ticker. Restart the kernel
# (or clear this dict) to pick up a CSV that changed on disk.
_CLOSE_PRICE_CACHE = {}


def _load_price_table(ticker):
    """Return the DataFrame read from ./stock_price/<ticker>.csv, reading it at most once."""
    if ticker not in _CLOSE_PRICE_CACHE:
        _CLOSE_PRICE_CACHE[ticker] = pd.read_csv("./stock_price/"+ticker+".csv")
    return _CLOSE_PRICE_CACHE[ticker]


def fetchClosingPrice(time, ticker):
    """Fetch the closing price of ``ticker`` on the date ``time``.

    Parameters
    ----------
    time : str
        Date to look up; compared for equality against the ``Date`` column.
    ticker : str
        Ticker as used in the price file name ``./stock_price/<ticker>.csv``.

    Returns
    -------
    float or int
        The ``Close`` value of the first row whose ``Date`` equals ``time``,
        or ``0`` when the file has no row for that date.

    Raises
    ------
    FileNotFoundError
        If the ticker's price file does not exist (first lookup only).
    """
    # This function is applied once per row, so parse the CSV once per ticker
    # rather than on every call.
    data = _load_price_table(ticker)
    closes = data.loc[data.Date.isin([time]), 'Close']
    return 0 if closes.empty else closes.iloc[0].item()

In [None]:
# Pull out the date column of polarity for a quick look
publishedDate = polarity['publishedDate']
publishedDate

In [None]:
# Look up the closing price for every date in polarity (fetchClosingPrice
# returns 0 when the date is missing from the price file).
# The ticker is spelled out here: the bare name `ticker` is only the leftover
# loop variable of the "Predict ALL" cell and holds "$AAL" once that loop has
# finished, which would pair CARR sentiment with another stock's prices.
# NOTE(review): assumes ./stock_price/ uses the same "$"-prefixed file names
# as ./financial_news/ — confirm.
polarity['close_price'] = polarity['publishedDate'].apply(fetchClosingPrice, args=("$CARR",))
polarity

Split the data (i.e. polarity) into training and testing sets

In [None]:
# Regression target: the closing price looked up for each date
y = polarity['close_price']

In [None]:
# Features: every polarity column except the target, the date and the symbol
non_feature_columns = ['close_price', 'publishedDate', 'symbol']
X = polarity.drop(columns=non_feature_columns)
X

In [None]:
# Hold out 33% of the daily rows for testing; random_state fixes the shuffle
split_parts = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split_parts
for label, part in zip(('X_train', 'X_test', 'y_train', 'y_test'), split_parts):
    print(label, part.shape)

In [None]:
# Standardize the features with statistics learned from the training rows only,
# then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit a random forest on the scaled sentiment features and score it on the
# held-out rows. random_state pins the forest so that Restart & Run All
# reproduces the same numbers.
rf = RandomForestRegressor(n_estimators=150, random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Correlation of each training feature with the target: with rowvar=False the
# last row of the matrix belongs to y_train, and [:-1] drops its correlation
# with itself. Only the largest coefficient is reported.
corr = np.corrcoef(X_train, y_train, rowvar=False)[-1, :-1]
corr = corr.max()

with open("carr.summary.csv", "w") as f:
    f.write("MSE,correlation\n")
    f.write("{},{:.2f}\n".format(mse, corr))

# y_pred follows the shuffled row order of X_test / y_test, so the matching
# dates have to be looked up through y_test's index. Taking the first
# len(y_pred) dates of polarity would label predictions with unrelated days.
out_date = polarity.loc[y_test.index, 'publishedDate'].values
with open("carr.output.csv", "w") as f:
    # The first column holds dates, so the header names it accordingly.
    f.write("publishedDate,predicted_closing_price\n")
    for date, predicted in zip(out_date, y_pred):
        f.write("{},{}\n".format(date, predicted))