In [1]:
import pandas as pd
import numpy as np
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error

# For each ticker (company), store all news into a dataframe

In [3]:
# Return one season's financial news dataframe
def season_dataframe(ticker, season):
    with open('./financial_news/'+ticker+str(season), 'r') as ticker_news:
        content  = ticker_news.read()
    records = json.loads(content)
    df = pd.DataFrame(records)
    return df

In [4]:
# Create dataframe for each ticker, storing its news
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$VRSN", "$GRMN", "$ANET", "$AAL"]
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$GRMN", "$ANET", "$AAL"]
ALL_TICKERS = ["$TSLA", "$PFE", "$INTC", "$SPGI", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$GRMN", "$ANET", "$AAL"]

df = []
for ticker in ALL_TICKERS:
  # Concatenate 8 seasons into one dataframe
  df_temp = pd.DataFrame()
  for i in range(1, 9):
      new_df = season_dataframe(ticker, i)
      df_temp = pd.concat([df_temp, new_df], ignore_index=True)
  df.append(df_temp)

Usage: df[0] will return $TSLA news from 2019-01-01 to 2020-12-31

# Calculate Polarity of each news, using nltk.sentiment.vader package

In [5]:
# Calculate vader
vader = SentimentIntensityAnalyzer()

# Helper function, which calculates the sentiment and returns compund score
def cal_compound(t):
    return vader.polarity_scores(t)["compound"]

In [6]:
for each_df in df:
  each_df['title_compound'] = each_df['title'].apply(cal_compound)
  each_df['text_compound'] = each_df['text'].apply(cal_compound)

Convert publishedDate to YYYY-MM-DD

In [7]:
# Helper function: for the date conversion
def remove_time(publish_date):
  return publish_date[0:10]

In [8]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(remove_time)

Convert weekend to next Monday

In [9]:
# Helper function: Convert weekend to next Monday
def moveWeekend(publish_date):
  d = datetime.datetime(int(publish_date[0:4]), int(publish_date[5:7]), int(publish_date[8:10]))
  if (d.weekday() == 5):
    return str(d+datetime.timedelta(days=2))[0:10]
  elif (d.weekday() == 6):
    return str(d+datetime.timedelta(days=1))[0:10]
  else:
    return publish_date

In [10]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(moveWeekend)

In [11]:
df[0]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url,title_compound,text_compound
0,TSLA,2020-12-31,Tesla to deliver China-made Model Y SUVs this ...,https://cdn.snapi.dev/images/v1/5/m/m02d202101...,Reuters,Tesla Inc said on Friday it has started sellin...,https://www.reuters.com/article/us-tesla-china...,0.0000,0.1027
1,TSLA,2020-12-31,2020: Several Chinese Stocks Outperformed Thei...,https://cdn.snapi.dev/images/v1/f/j/catalog-ma...,Seeking Alpha,2020: Several Chinese Stocks Outperformed Thei...,https://seekingalpha.com/article/4396892-2020-...,0.0000,0.0000
2,TSLA,2020-12-31,EV Company News For The Month Of December 2020,https://cdn.snapi.dev/images/v1/l/r/sssik22-c5...,Seeking Alpha,Global electric car sales records for November...,https://seekingalpha.com/article/4396884-ev-co...,0.0000,0.8402
3,TSLA,2020-12-31,"Tesla, Volkswagen, Renault See Strong Share In...",https://cdn.snapi.dev/images/v1/v/x/s3xy-14.jpg,Benzinga,The European market continues to see strong ad...,https://www.benzinga.com/news/20/12/18973120/t...,0.6705,0.5106
4,TSLA,2020-12-31,Tech's top seven companies added $3.4 trillion...,https://cdn.snapi.dev/images/v1/s/t/stocks23-1...,CNBC,"Big Tech got much bigger in 2020, and Tesla jo...",https://www.cnbc.com/2020/12/31/techs-top-seve...,0.4939,0.0000
...,...,...,...,...,...,...,...,...,...
5817,TSLA,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,https://cdn.snapi.dev/images/v1/v/i/viwtqd6wkp...,Bloomberg Technology,Elon Musk is cutting Tesla Inc.'s workforce by...,https://www.youtube.com/watch?v=ViwTqD6WKpA,-0.3400,-0.4404
5818,TSLA,2019-01-17,Tesla to cut full time workforce by roughly 7%...,https://cdn.snapi.dev/images/v1/t/8/t8cnbozbfx...,CNBC Television,The Wall Street Journal is reporting that Tesl...,https://www.youtube.com/watch?v=t8CNBOZBFXc,-0.2732,0.0258
5819,TSLA,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,https://cdn.snapi.dev/images/v1/w/g/wgegqwogy8...,CNBC Television,CNBC's Jim Cramer discusses his take on the la...,https://www.youtube.com/watch?v=WGEgQWogY8E,0.0000,0.0000
5820,TSLA,2019-01-03,Tesla stock drops over missed delivery estimat...,https://cdn.snapi.dev/images/v1/b/e/bebskx74-2...,Fox Business,“Bulls & Bears” panel discusses how Tesla shar...,https://www.youtube.com/watch?v=Bebskx74-2Y,-0.5267,-0.1280


# Calculate title and text's daily polarity mean respectively

In [12]:
polarity = []
# Calcuate mean on daily basis
for each_df in df:
  each_title_mean = each_df.groupby('publishedDate', as_index=False)['title_compound'].mean()
  each_text_mean = each_df.groupby('publishedDate', as_index=False)['text_compound'].mean()
  each_polarity = pd.merge(each_title_mean, each_text_mean, on='publishedDate')
  each_polarity['symbol'] = each_df['symbol']
  polarity.append(each_polarity)

In [13]:
polarity[0]

Unnamed: 0,publishedDate,title_compound,text_compound,symbol
0,2019-01-01,-0.226300,-0.296000,TSLA
1,2019-01-03,-0.526700,-0.128000,TSLA
2,2019-01-07,0.000000,0.000000,TSLA
3,2019-01-17,-0.273200,0.025800,TSLA
4,2019-01-18,-0.340000,-0.440400,TSLA
...,...,...,...,...
429,2020-12-25,0.148000,0.509500,TSLA
430,2020-12-28,0.081800,0.378638,TSLA
431,2020-12-29,0.209004,0.202939,TSLA
432,2020-12-30,0.209600,0.323291,TSLA


# Merge everyday's closing price with polarity dataframe, but show everyday's closing price

In [14]:
# Merge everyday's closing price with polarity dataframe
index = 0
for ticker in ALL_TICKERS:
  data = pd.read_csv("./stock_price/compare_previous_day/"+ticker[1:]+".csv")
  data = data[['Date', 'Close']]
  data['publishedDate'] = data['Date']
  data = data[['publishedDate', 'Close']]
  polarity[index] = pd.merge(data, polarity[index], on = 'publishedDate', how = 'left').fillna(0)
  index+=1


In [15]:
# Fill symbol column for each company
for index, each_polarity in enumerate(polarity):
  each_polarity['symbol'] = ALL_TICKERS[index][1:]

In [16]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol
0,2018-12-31,66.559998,0.000000,0.000000,TSLA
1,2019-01-02,62.023998,0.000000,0.000000,TSLA
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA
3,2019-01-04,63.537998,0.000000,0.000000,TSLA
4,2019-01-07,66.991997,0.000000,0.000000,TSLA
...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA
502,2020-12-28,663.690002,0.081800,0.378638,TSLA
503,2020-12-29,665.989990,0.209004,0.202939,TSLA
504,2020-12-30,694.780029,0.209600,0.323291,TSLA


# Apply MinMax scaler to Close price

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
for each_polarity in polarity:
  close = each_polarity['Close']
  scaler = MinMaxScaler(feature_range=(0,1))
  close = scaler.fit_transform(np.array(close).reshape(-1,1))
  each_polarity['today_scaler'] = close

In [19]:
polarity[1]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,today_scaler
0,2018-12-31,41.413662,0.000000,0.000000,PFE,0.926184
1,2019-01-02,41.034157,0.664100,0.000000,PFE,0.901746
2,2019-01-03,39.886147,0.000000,0.000000,PFE,0.827823
3,2019-01-04,40.796963,0.000000,0.000000,PFE,0.886473
4,2019-01-07,41.015179,0.000000,0.000000,PFE,0.900524
...,...,...,...,...,...,...
501,2020-12-24,37.270000,0.117457,0.595100,PFE,0.659361
502,2020-12-28,36.820000,0.084740,0.266908,PFE,0.630384
503,2020-12-29,37.049999,0.137173,0.337327,PFE,0.645194
504,2020-12-30,36.740002,0.305817,0.653600,PFE,0.625233


# Merge everyday's indicator with poliarity dataframe

In [20]:
for each_polarity in polarity:
  data = pd.read_csv("./stock_price/compare_previous_day/"+each_polarity.symbol.loc[0]+".csv")
  each_polarity['indicator'] = data['indicator']

In [21]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,today_scaler,indicator
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0
...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0


# Add previous 7 day's scaler to each row

In [22]:
for each_polarity in polarity:
  # Add scaler to first row
  for i in range(0, 7):
    name="scaler-"+str(i+1)
    each_polarity.loc[0, name] = each_polarity.loc[0, 'today_scaler']
  # Add scaler to the remaining rows
  for i in range(1, len(each_polarity)):
    for s in range(0, 6):
      name="scaler-"+str(s+1)
      if (i-(s+1) >= 0):
        each_polarity.loc[i, name] = each_polarity.loc[i-(s+1), 'today_scaler']
      else:
        each_polarity.loc[i, name] = each_polarity.loc[i-1, 'today_scaler']

In [23]:
polarity[0]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,today_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,scaler-7
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0,0.039157,0.045928,0.039157,0.039157,0.039157,0.039157,
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0,0.036243,0.039157,0.045928,0.036243,0.036243,0.036243,
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0,0.041417,0.036243,0.039157,0.045928,0.041417,0.041417,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246,
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703,
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072,
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0,0.940765,0.937332,0.934466,0.910894,0.902475,0.916686,


# Add S&P 500 Index

In [24]:
sp_df = pd.read_csv("./stock_price/original/S&P500.csv")
def reorder_date(date):
  return date[6:10]+'-'+date[0:5]
sp_df['publishedDate'] = sp_df['Date'].apply(reorder_date)
sp_df = sp_df[['publishedDate', 'Close']]
sp_df = sp_df.iloc[::-1]
sp_df = sp_df.rename(columns= {'Close': "s&p500_close_price"}, inplace=False)
# Convert str to float
sp_df['s&p500_close_price'] = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df


Unnamed: 0,publishedDate,s&p500_close_price
504,2019-01-02,2510.030029
503,2019-01-03,2447.889893
502,2019-01-04,2531.939941
501,2019-01-07,2549.689941
500,2019-01-08,2574.409912
...,...,...
4,2020-12-24,3703.060059
3,2020-12-28,3735.360107
2,2020-12-29,3727.040039
1,2020-12-30,3732.040039


In [25]:
# Merge with polarity dataframe
for index, ep in enumerate(polarity):
  polarity[index] = pd.merge(ep, sp_df, on="publishedDate")

In [26]:
polarity[1]

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,today_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,scaler-7,s&p500_close_price
0,2019-01-02,41.034157,0.664100,0.000000,PFE,0.901746,0.0,0.926184,0.926184,0.926184,0.926184,0.926184,0.926184,,2510.030029
1,2019-01-03,39.886147,0.000000,0.000000,PFE,0.827823,0.0,0.901746,0.926184,0.901746,0.901746,0.901746,0.901746,,2447.889893
2,2019-01-04,40.796963,0.000000,0.000000,PFE,0.886473,1.0,0.827823,0.901746,0.926184,0.827823,0.827823,0.827823,,2531.939941
3,2019-01-07,41.015179,0.000000,0.000000,PFE,0.900524,1.0,0.886473,0.827823,0.901746,0.926184,0.886473,0.886473,,2549.689941
4,2019-01-08,41.204933,0.000000,0.000000,PFE,0.912743,1.0,0.900524,0.886473,0.827823,0.901746,0.926184,0.900524,,2574.409912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,2020-12-24,37.270000,0.117457,0.595100,PFE,0.659361,0.0,0.670308,0.625233,0.666444,0.685762,0.708300,0.696065,,3703.060059
501,2020-12-28,36.820000,0.084740,0.266908,PFE,0.630384,0.0,0.659361,0.670308,0.625233,0.666444,0.685762,0.708300,,3735.360107
502,2020-12-29,37.049999,0.137173,0.337327,PFE,0.645194,1.0,0.630384,0.659361,0.670308,0.625233,0.666444,0.685762,,3727.040039
503,2020-12-30,36.740002,0.305817,0.653600,PFE,0.625233,0.0,0.645194,0.630384,0.659361,0.670308,0.625233,0.666444,,3732.040039


# Predict the trend using MLPClassifier model

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning



In [73]:
@ignore_warnings(category=ConvergenceWarning)
def predict(polarity):
  avg_accuracy = 0
  for each_polarity in polarity:
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator','publishedDate','symbol'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # apply MLPClassifier
    out_date = each_polarity.publishedDate.values
    #rf = RandomForestRegressor(n_estimators=150)
    nn = MLPClassifier(
      hidden_layer_sizes=(90,10),
      random_state=0,
      max_iter=60,
    )
    nn.fit(X_train, y_train)
    #rf.fit(X_train, y_train)
    y_pred = nn.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    avg_accuracy+=accuracy_score(y_test, y_pred)
    #mse = mean_squared_error(y_test, y_pred)
    #corr = np.corrcoef(X_train, y_train, rowvar=False)[-1, :-1]
    #corr = corr.max()
    accuracy = accuracy_score(y_test, y_pred)
    f = open(each_polarity.symbol.iloc[0]+".summary.csv", "w")
    f.write("accuracy\n")
    f.write("{:.2f}\n".format(accuracy))
    f.close()

    f = open(each_polarity.symbol.iloc[0]+".output.csv", "w")
    f.write("date,predicted_indicator\n")
    for i in range(y_pred.shape[0]):
        f.write("{},{}\n".format(out_date[i], y_pred[i]))
    f.close()
  print("AVERAGE ACCURACY: "+str(avg_accuracy/15))

In [74]:
predict(polarity)

0.6347305389221557
0.5269461077844312
0.6586826347305389
0.8023952095808383
0.5988023952095808
0.6227544910179641
0.5209580838323353
0.6407185628742516
0.6227544910179641
0.6167664670658682
0.592814371257485
0.5606060606060606
0.6347305389221557
0.6706586826347305
0.5988023952095808
AVERAGE ACCURACY: 0.6202080687110628
