In [1]:
import datetime
import json

import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# For each ticker (company), store all news into a dataframe

In [2]:
# Load one season's financial news file into a dataframe
def season_dataframe(ticker, season):
    """Read one news file and return its records as a DataFrame.

    The file is read from ``./financial_news/<ticker><season>`` and must
    contain JSON that ``pd.DataFrame`` accepts (e.g. a list of records).
    """
    news_path = './financial_news/' + ticker + str(season)
    with open(news_path, 'r') as news_file:
        return pd.DataFrame(json.load(news_file))

In [3]:
# Create one dataframe per ticker, holding all of that ticker's news.
# (Earlier experiments used larger ticker lists that also contained
#  $NVDA, $LRCX, $TMUS, $CARR and $VRSN.)
ALL_TICKERS = ["$TSLA", "$PFE", "$INTC", "$SPGI", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$GRMN", "$ANET", "$AAL"]
N_SEASONS = 8  # news files are numbered 1..8 for every ticker

# NOTE: despite its name, `df` is a *list* of dataframes, ordered like ALL_TICKERS.
df = []
for ticker in ALL_TICKERS:
    # Read all season files first and concatenate once, instead of growing a
    # dataframe with repeated pd.concat calls.
    season_frames = [season_dataframe(ticker, season) for season in range(1, N_SEASONS + 1)]
    df.append(pd.concat(season_frames, ignore_index=True))

Usage: `df[0]` holds the $TSLA news from 2019-01-01 to 2020-12-31; `df` contains one dataframe per ticker, in the same order as `ALL_TICKERS`.

# Calculate Polarity of each news, using nltk.sentiment.vader package

In [4]:
# Shared VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Helper function: score a text and return only its compound score
def cal_compound(t):
    """Return the ``"compound"`` entry of VADER's polarity scores for ``t``."""
    scores = vader.polarity_scores(t)
    return scores["compound"]

In [5]:
# Add a compound sentiment column for both the title and the text of each news item
for news_df in df:
    for source_col in ('title', 'text'):
        news_df[source_col + '_compound'] = news_df[source_col].apply(cal_compound)

Convert publishedDate to YYYY-MM-DD

In [6]:
# Helper function: keep only the leading date part of a timestamp string
def remove_time(publish_date):
    """Return the first ten characters of ``publish_date`` (the ``YYYY-MM-DD`` part)."""
    date_part = publish_date[:10]
    return date_part

In [7]:
# Reduce every publishedDate to its date part, frame by frame
for news_df in df:
    date_only = news_df['publishedDate'].map(remove_time)
    news_df['publishedDate'] = date_only

Convert weekend to next Monday

In [8]:
# Helper function: shift Saturday/Sunday dates forward to the following Monday
def moveWeekend(publish_date):
    """Return ``publish_date`` unchanged on weekdays, otherwise the next Monday.

    The input is sliced positionally: year at [0:4], month at [5:7], day at
    [8:10]. Shifted dates are returned as ``YYYY-MM-DD`` strings.
    """
    year = int(publish_date[0:4])
    month = int(publish_date[5:7])
    day = int(publish_date[8:10])
    parsed = datetime.date(year, month, day)
    # weekday(): Monday is 0, Saturday is 5, Sunday is 6
    days_until_monday = {5: 2, 6: 1}.get(parsed.weekday())
    if days_until_monday is None:
        return publish_date
    return (parsed + datetime.timedelta(days=days_until_monday)).isoformat()

In [9]:
# Re-date weekend news to the following Monday, frame by frame
for news_df in df:
    shifted_dates = news_df['publishedDate'].map(moveWeekend)
    news_df['publishedDate'] = shifted_dates

In [10]:
# Display the news dataframe of the first ticker in ALL_TICKERS
df[0]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url,title_compound,text_compound
0,TSLA,2020-12-31,Tesla to deliver China-made Model Y SUVs this ...,https://cdn.snapi.dev/images/v1/5/m/m02d202101...,Reuters,Tesla Inc said on Friday it has started sellin...,https://www.reuters.com/article/us-tesla-china...,0.0000,0.1027
1,TSLA,2020-12-31,2020: Several Chinese Stocks Outperformed Thei...,https://cdn.snapi.dev/images/v1/f/j/catalog-ma...,Seeking Alpha,2020: Several Chinese Stocks Outperformed Thei...,https://seekingalpha.com/article/4396892-2020-...,0.0000,0.0000
2,TSLA,2020-12-31,EV Company News For The Month Of December 2020,https://cdn.snapi.dev/images/v1/l/r/sssik22-c5...,Seeking Alpha,Global electric car sales records for November...,https://seekingalpha.com/article/4396884-ev-co...,0.0000,0.8402
3,TSLA,2020-12-31,"Tesla, Volkswagen, Renault See Strong Share In...",https://cdn.snapi.dev/images/v1/v/x/s3xy-14.jpg,Benzinga,The European market continues to see strong ad...,https://www.benzinga.com/news/20/12/18973120/t...,0.6705,0.5106
4,TSLA,2020-12-31,Tech's top seven companies added $3.4 trillion...,https://cdn.snapi.dev/images/v1/s/t/stocks23-1...,CNBC,"Big Tech got much bigger in 2020, and Tesla jo...",https://www.cnbc.com/2020/12/31/techs-top-seve...,0.4939,0.0000
...,...,...,...,...,...,...,...,...,...
5817,TSLA,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,https://cdn.snapi.dev/images/v1/v/i/viwtqd6wkp...,Bloomberg Technology,Elon Musk is cutting Tesla Inc.'s workforce by...,https://www.youtube.com/watch?v=ViwTqD6WKpA,-0.3400,-0.4404
5818,TSLA,2019-01-17,Tesla to cut full time workforce by roughly 7%...,https://cdn.snapi.dev/images/v1/t/8/t8cnbozbfx...,CNBC Television,The Wall Street Journal is reporting that Tesl...,https://www.youtube.com/watch?v=t8CNBOZBFXc,-0.2732,0.0258
5819,TSLA,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,https://cdn.snapi.dev/images/v1/w/g/wgegqwogy8...,CNBC Television,CNBC's Jim Cramer discusses his take on the la...,https://www.youtube.com/watch?v=WGEgQWogY8E,0.0000,0.0000
5820,TSLA,2019-01-03,Tesla stock drops over missed delivery estimat...,https://cdn.snapi.dev/images/v1/b/e/bebskx74-2...,Fox Business,“Bulls & Bears” panel discusses how Tesla shar...,https://www.youtube.com/watch?v=Bebskx74-2Y,-0.5267,-0.1280


# Calculate title and text's daily polarity mean respectively

In [11]:
polarity = []
# Calculate the mean title/text compound score per publishing day, one frame per ticker
for each_df in df:
    # One groupby over both score columns replaces two groupbys plus a merge on the same key.
    each_polarity = each_df.groupby('publishedDate', as_index=False)[['title_compound', 'text_compound']].mean()
    # Broadcast the ticker symbol as a scalar. Assigning the whole per-news
    # 'symbol' column would align on the row index of a differently shaped frame.
    # Assumes each frame holds a single symbol — TODO confirm.
    each_polarity['symbol'] = each_df['symbol'].iloc[0]
    polarity.append(each_polarity)

In [12]:
# Display the daily mean polarity frame of the first ticker
polarity[0]

Unnamed: 0,publishedDate,title_compound,text_compound,symbol
0,2019-01-01,-0.226300,-0.296000,TSLA
1,2019-01-03,-0.526700,-0.128000,TSLA
2,2019-01-07,0.000000,0.000000,TSLA
3,2019-01-17,-0.273200,0.025800,TSLA
4,2019-01-18,-0.340000,-0.440400,TSLA
...,...,...,...,...
429,2020-12-25,0.148000,0.509500,TSLA
430,2020-12-28,0.081800,0.378638,TSLA
431,2020-12-29,0.209004,0.202939,TSLA
432,2020-12-30,0.209600,0.323291,TSLA


# Merge each trading day's closing price with the polarity dataframe, keeping every trading day even when it has no news

In [13]:
# Load TSLA's price file and keep the date (renamed to match the news frames) and the close
data = (
    pd.read_csv("./stock_price/compare_previous_day/"+"TSLA"+".csv")
    .loc[:, ['Date', 'Close']]
    .rename(columns={'Date': 'publishedDate'})
)
data

Unnamed: 0,publishedDate,Close
0,2018-12-31,66.559998
1,2019-01-02,62.023998
2,2019-01-03,60.071999
3,2019-01-04,63.537998
4,2019-01-07,66.991997
...,...,...
501,2020-12-24,661.770020
502,2020-12-28,663.690002
503,2020-12-29,665.989990
504,2020-12-30,694.780029


In [14]:
# Left join keeps every row of the price frame; missing polarity values
# (including 'symbol') are then filled with 0
test = data.merge(polarity[0], how='left', on='publishedDate')
test = test.fillna(0)
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol
0,2018-12-31,66.559998,0.000000,0.000000,0
1,2019-01-02,62.023998,0.000000,0.000000,0
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA
3,2019-01-04,63.537998,0.000000,0.000000,0
4,2019-01-07,66.991997,0.000000,0.000000,TSLA
...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA
502,2020-12-28,663.690002,0.081800,0.378638,TSLA
503,2020-12-29,665.989990,0.209004,0.202939,TSLA
504,2020-12-30,694.780029,0.209600,0.323291,TSLA


In [15]:
# Overwrite the whole symbol column, replacing the 0 placeholders left by fillna(0)
test = test.assign(symbol='TSLA')
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol
0,2018-12-31,66.559998,0.000000,0.000000,TSLA
1,2019-01-02,62.023998,0.000000,0.000000,TSLA
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA
3,2019-01-04,63.537998,0.000000,0.000000,TSLA
4,2019-01-07,66.991997,0.000000,0.000000,TSLA
...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA
502,2020-12-28,663.690002,0.081800,0.378638,TSLA
503,2020-12-29,665.989990,0.209004,0.202939,TSLA
504,2020-12-30,694.780029,0.209600,0.323291,TSLA


In [16]:
# Pick the close-price column that will be MinMax-scaled
# (original rationale: LSTM is sensitive to the scale of the data).
# NOTE(review): the model trained later in this notebook is an MLPClassifier, not an LSTM.
close_test = test.loc[:, 'Close']

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
# Rescale the close prices into the [0, 1] range; the values are reshaped
# into a single-column 2-D array before fitting
scaler = MinMaxScaler(feature_range=(0,1))
close_column = close_test.to_numpy().reshape(-1, 1)
close_test = scaler.fit_transform(close_column)

In [19]:
# Store the scaled close prices next to the raw ones
test = test.assign(scaler=close_test)
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573
...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743


In [20]:
# Attach each day's indicator from the price file to the polarity dataframe
price_csv = "./stock_price/compare_previous_day/"+"TSLA"+".csv"
data = pd.read_csv(price_csv)
# The column is matched to `test` by row index, not by date
test = test.assign(indicator=data['indicator'])
test



Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler,indicator
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0
...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0


In [None]:
# Display the combined price / polarity / indicator frame
test

In [21]:
# Seed the first lag column: row 0 reuses its own scaled close
first_scaled = test.loc[0, 'scaler']
test.loc[0, 'scaler-1'] = first_scaled
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler,indicator,scaler-1
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0,0.045928
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0,
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0,
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0,
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0,
...,...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0,
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0,
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0,
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0,


In [22]:
# Seed all seven lag columns of row 0 with row 0's own scaled close
row0_scaled = test.loc[0, 'scaler']
for lag in range(1, 8):
    test.loc[0, "scaler-" + str(lag)] = row0_scaled
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,scaler-7
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0,,,,,,,
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0,,,,,,,
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0,,,,,,,
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0,,,,,,,
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0,,,,,,,
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0,,,,,,,
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0,,,,,,,


In [23]:
# Fill the lag columns scaler-1 .. scaler-7 for every row after the first.
# Row 0 is not touched here.
#
# The loop variable is called `lag`, not `scaler`, so it does not overwrite
# the fitted MinMaxScaler object of the same name.
N_LAGS = 7
scaled_close = test['scaler']
previous_row = scaled_close.shift(1)
for lag in range(1, N_LAGS + 1):
    name = "scaler-" + str(lag)
    # shift(lag) yields the value `lag` rows earlier. Where fewer than `lag`
    # earlier rows exist, fall back to the previous row's value, which is the
    # same fallback the earlier row-by-row loop used.
    # NOTE(review): with this fallback, early rows can have a higher lag that is
    # more recent than a lower one; confirm this is intended.
    lagged = scaled_close.shift(lag).fillna(previous_row)
    # Relies on the default 0..n-1 row index, as the row-by-row version did.
    test.loc[1:, name] = lagged.loc[1:]
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,scaler-7
0,2018-12-31,66.559998,0.000000,0.000000,TSLA,0.045928,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928
1,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928
2,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0,0.039157,0.045928,0.039157,0.039157,0.039157,0.039157,0.039157
3,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0,0.036243,0.039157,0.045928,0.036243,0.036243,0.036243,0.036243
4,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0,0.041417,0.036243,0.039157,0.045928,0.041417,0.041417,0.041417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246,0.891890
502,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246
503,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703
504,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0,0.940765,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072


In [24]:
# Add S&P 500 index
def reorder_date(date):
    """Move the four characters at [6:10] in front of the first five, joined by '-'.

    Turns a 10-character ``MM-DD-YYYY`` string into ``YYYY-MM-DD``.
    """
    return '-'.join((date[6:10], date[0:5]))

sp_raw = pd.read_csv("./stock_price/original/S&P500.csv")
sp_df = (
    sp_raw
    .assign(publishedDate=sp_raw['Date'].apply(reorder_date))
    [['publishedDate', 'Close']]
    .iloc[::-1]  # reverse the row order of the file
    .rename(columns={'Close': "s&p500_close_price"})
)
# Convert str to float
sp_df['s&p500_close_price'] = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df


Unnamed: 0,publishedDate,s&p500_close_price
504,2019-01-02,2510.030029
503,2019-01-03,2447.889893
502,2019-01-04,2531.939941
501,2019-01-07,2549.689941
500,2019-01-08,2574.409912
...,...,...
4,2020-12-24,3703.060059
3,2020-12-28,3735.360107
2,2020-12-29,3727.040039
1,2020-12-30,3732.040039


In [25]:
# Inner join on the date: only days present in both frames are kept
test = test.merge(sp_df, on="publishedDate")
test

Unnamed: 0,publishedDate,Close,title_compound,text_compound,symbol,scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6,scaler-7,s&p500_close_price
0,2019-01-02,62.023998,0.000000,0.000000,TSLA,0.039157,0.0,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,0.045928,2510.030029
1,2019-01-03,60.071999,-0.526700,-0.128000,TSLA,0.036243,0.0,0.039157,0.045928,0.039157,0.039157,0.039157,0.039157,0.039157,2447.889893
2,2019-01-04,63.537998,0.000000,0.000000,TSLA,0.041417,1.0,0.036243,0.039157,0.045928,0.036243,0.036243,0.036243,0.036243,2531.939941
3,2019-01-07,66.991997,0.000000,0.000000,TSLA,0.046573,1.0,0.041417,0.036243,0.039157,0.045928,0.041417,0.041417,0.041417,2549.689941
4,2019-01-08,67.070000,0.000000,0.000000,TSLA,0.046689,1.0,0.046573,0.041417,0.036243,0.039157,0.045928,0.046573,0.046573,2574.409912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,2020-12-24,661.770020,0.158990,0.276630,TSLA,0.934466,1.0,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246,0.891890,3703.060059
501,2020-12-28,663.690002,0.081800,0.378638,TSLA,0.937332,1.0,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703,0.876246,3735.360107
502,2020-12-29,665.989990,0.209004,0.202939,TSLA,0.940765,1.0,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072,0.925703,3727.040039
503,2020-12-30,694.780029,0.209600,0.323291,TSLA,0.983743,1.0,0.940765,0.937332,0.934466,0.910894,0.902475,0.916686,0.984072,3732.040039


In [31]:

# Train and evaluate an MLPClassifier on the single-ticker frame `test`.
# MLPClassifier comes from the notebook's top import cell, so this cell also
# runs on a fresh kernel.
avg_accuracy = 0
each_polarity = test

# NOTE(review): the features include the same day's 'Close' and 'scaler'
# columns; if 'indicator' is derived from that day's close this leaks the
# label — confirm.
y = each_polarity['indicator']
X = each_polarity.drop(columns=['indicator','publishedDate','symbol'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Fit the standardiser on the training split only, then apply it to both splits.
# A separate name keeps the MinMaxScaler stored in `scaler` intact.
feature_scaler = StandardScaler()
feature_scaler.fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# apply MLPClassifier
nn = MLPClassifier(
    hidden_layer_sizes=(90,10),
    random_state=0,
    max_iter=60,
)
nn.fit(X_train, y_train)
y_pred = nn.predict(X_test)

# Compute the accuracy once and reuse it for the printout and the running total.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
avg_accuracy += accuracy


0.6347305389221557




' accuracy = accuracy_score(y_test, y_pred)\nf = open(each_polarity.symbol.iloc[0]+".summary.csv", "w")\nf.write("accuracy\n")\nf.write("{:.2f}\n".format(accuracy))\nf.close()\n\nf = open(each_polarity.symbol.iloc[0]+".output.csv", "w")\nf.write("date,predicted_indicator\n")\nfor i in range(y_pred.shape[0]):\n    f.write("{},{}\n".format(out_date[i], y_pred[i]))\nf.close() '

# Merge everyday's closing price with polarity dataframe

In [None]:
import functools

# Helper: parse a ticker's price file once and index its closing prices by date
@functools.lru_cache(maxsize=None)
def _closing_prices_by_date(ticker):
    """Return the ``Close`` column of ``ticker``'s price file, indexed by ``Date``.

    Cached per ticker so the CSV is read once instead of once per looked-up
    date. The cache is not refreshed if the file changes during the session.
    If a date occurs more than once, only its first row is kept.
    """
    prices = pd.read_csv("./stock_price/compare_previous_day/"+ticker+".csv")
    return prices.drop_duplicates(subset='Date').set_index('Date')['Close']

# Helper function: Fetch closing price by a specific date and ticker name
def fetchClosingPrice(time, ticker):
    """Return the closing price of ``ticker`` on ``time``, or 0 when the date is absent.

    Parameters
    ----------
    time : value compared against the file's ``Date`` column (e.g. 'YYYY-MM-DD').
    ticker : file stem under ``./stock_price/compare_previous_day/``.
    """
    closes = _closing_prices_by_date(ticker)
    if time not in closes.index:
        return 0
    return closes.loc[time].item()

In [None]:
# Look up the closing price for every polarity row of every ticker
for ticker_polarity in polarity:
    ticker_name = ticker_polarity.symbol.iloc[0]
    ticker_polarity['close_price'] = ticker_polarity['publishedDate'].apply(
        lambda day: fetchClosingPrice(day, ticker_name)
    )

# Merge everyday's indicator with polarity dataframe

In [None]:
import functools

# Helper: parse a ticker's price file once and index its indicator column by date
@functools.lru_cache(maxsize=None)
def _indicators_by_date(ticker):
    """Return the ``indicator`` column of ``ticker``'s price file, indexed by ``Date``.

    Cached per ticker so the CSV is read once instead of once per looked-up
    date. The cache is not refreshed if the file changes during the session.
    If a date occurs more than once, only its first row is kept.
    """
    prices = pd.read_csv("./stock_price/compare_previous_day/"+ticker+".csv")
    return prices.drop_duplicates(subset='Date').set_index('Date')['indicator']

# Helper function: Fetch the indicator by a specific date and ticker name
def fetchIndicator(time, ticker):
    """Return the indicator of ``ticker`` on ``time``, or 0 when the date is absent.

    Parameters
    ----------
    time : value compared against the file's ``Date`` column (e.g. 'YYYY-MM-DD').
    ticker : file stem under ``./stock_price/compare_previous_day/``.
    """
    indicators = _indicators_by_date(ticker)
    if time not in indicators.index:
        return 0
    return indicators.loc[time].item()

In [None]:
# Look up the indicator for every polarity row of every ticker
for ticker_polarity in polarity:
    ticker_name = ticker_polarity.symbol.iloc[0]
    ticker_polarity['indicator'] = ticker_polarity['publishedDate'].apply(
        lambda day: fetchIndicator(day, ticker_name)
    )

In [None]:
# Display the first ticker's daily polarity frame again
polarity[0]

# Add S&P 500 Adj Close into dataframe

In [None]:
# Read the raw S&P 500 price file
sp500_csv = "./stock_price/original/S&P500.csv"
sp_df = pd.read_csv(sp500_csv)
sp_df

In [None]:
def reorder_date(date):
    """Move the four characters at [6:10] in front of the first five, joined by '-'.

    Turns a 10-character ``MM-DD-YYYY`` string into ``YYYY-MM-DD``.
    """
    leading_part = date[6:10]
    trailing_part = date[0:5]
    return '-'.join((leading_part, trailing_part))

In [None]:
# Add the reordered date as 'publishedDate' and keep only it and the close
reordered_dates = sp_df['Date'].map(reorder_date)
sp_df = sp_df.assign(publishedDate=reordered_dates)[['publishedDate', 'Close']]

In [None]:
# Reverse the row order of the S&P 500 frame
sp_df = sp_df[::-1]

In [None]:
# Give the close column a name that cannot clash with a ticker's own close
sp_df = sp_df.rename(columns={'Close': "s&p500_close_price"})

In [None]:
# Convert str to float
sp500_close_numeric = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df = sp_df.assign(**{'s&p500_close_price': sp500_close_numeric})

In [None]:
# Join the S&P 500 close onto every ticker's polarity frame (inner join on the date);
# slice assignment keeps the same list object
polarity[:] = [pd.merge(ticker_polarity, sp_df, on="publishedDate") for ticker_polarity in polarity]


# Training and Testing Step

In [None]:
# Train one RandomForestRegressor per ticker and write two CSV files per ticker:
# <symbol>.summary.csv (MSE and best feature/label correlation) and
# <symbol>.output.csv (held-out dates with their predicted indicator).
for each_polarity in polarity:
    symbol = each_polarity.symbol.iloc[0]
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator','publishedDate','symbol'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

    # Dates of the held-out rows, in the same (shuffled) order as y_test and
    # y_pred. Indexing the full date column by position would pair each
    # prediction with an unrelated day.
    out_date = each_polarity.loc[y_test.index, 'publishedDate'].values

    # Fit the standardiser on the training split only, then apply it to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # apply RandomForestRegressor (seeded so reruns give the same forest)
    rf = RandomForestRegressor(n_estimators=150, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # Last row of the correlation matrix = correlation of the label with each feature
    corr = np.corrcoef(X_train, y_train, rowvar=False)[-1, :-1]
    corr = corr.max()

    # `with` closes the files even if a write fails
    with open(symbol+".summary.csv", "w") as f:
        f.write("MSE,correlation\n")
        f.write("{},{:.2f}\n".format(mse, corr))

    with open(symbol+".output.csv", "w") as f:
        f.write("date,predicted_indicator\n")
        for date, predicted in zip(out_date, y_pred):
            f.write("{},{}\n".format(date, predicted))

In [27]:
from sklearn.neural_network import MLPClassifier

In [28]:
# Train one MLPClassifier per ticker and report the mean test accuracy.
# Each frame in `polarity` must already have the 'indicator' column added by
# the fetchIndicator cell; otherwise this cell raises KeyError: 'indicator'.
avg_accuracy = 0
for each_polarity in polarity:
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator','publishedDate','symbol'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

    # Fit the standardiser on the training split only, then apply it to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # apply MLPClassifier
    nn = MLPClassifier(
        hidden_layer_sizes=(90,10),
        random_state=0,
        max_iter=60,
    )
    nn.fit(X_train, y_train)
    y_pred = nn.predict(X_test)
    avg_accuracy += accuracy_score(y_test, y_pred)

# Average over the tickers actually processed rather than a hard-coded count
print(avg_accuracy/len(polarity))

KeyError: 'indicator'