In [1]:
import pandas as pd
import numpy as np
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error

# For each ticker (company), store all news into a dataframe

In [2]:
# Return one season's financial news dataframe
def season_dataframe(ticker, season):
    with open('./financial_news/'+ticker+str(season), 'r') as ticker_news:
        content  = ticker_news.read()
    records = json.loads(content)
    df = pd.DataFrame(records)
    return df

In [36]:
# Create dataframe for each ticker, storing its news
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$VRSN", "$GRMN", "$ANET", "$AAL"]
#ALL_TICKERS = ["$TSLA", "$NVDA", "$INTC", "$PFE", "$SPGI", "$LRCX", "$TMUS", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$CARR", "$GRMN", "$ANET", "$AAL"]
ALL_TICKERS = ["$TSLA", "$INTC", "$PFE", "$SPGI", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$GRMN", "$AAL", "$ANET"]
ALL_TICKERS = ["$TSLA", "$INTC", "$PFE", "$SPGI", "$ADSK", "$VRTX", "$TWTR", "$EBAY", "$GRMN", "$AAL"]

df = []
for ticker in ALL_TICKERS:
  # Concatenate 8 seasons into one dataframe
  df_temp = pd.DataFrame()
  for i in range(1, 9):
      new_df = season_dataframe(ticker, i)
      df_temp = pd.concat([df_temp, new_df], ignore_index=True)
  df.append(df_temp)

Usage: df[0] will return $TSLA news from 2019-01-01 to 2020-12-31

Convert publishedDate to YYYY-MM-DD

In [37]:
# Helper function: for the date conversion
def remove_time(publish_date):
  return publish_date[0:10]

In [38]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(remove_time)

Convert weekend to next Monday

In [39]:
# Helper function: Convert weekend to next Monday
def moveWeekend(publish_date):
  d = datetime.datetime(int(publish_date[0:4]), int(publish_date[5:7]), int(publish_date[8:10]))
  if (d.weekday() == 5):
    return str(d+datetime.timedelta(days=2))[0:10]
  elif (d.weekday() == 6):
    return str(d+datetime.timedelta(days=1))[0:10]
  else:
    return publish_date

In [40]:
for each_df in df:
  each_df['publishedDate'] = each_df['publishedDate'].apply(moveWeekend)

In [41]:
df[0]

Unnamed: 0,symbol,publishedDate,title,image,site,text,url
0,TSLA,2020-12-31,Tesla to deliver China-made Model Y SUVs this ...,https://cdn.snapi.dev/images/v1/5/m/m02d202101...,Reuters,Tesla Inc said on Friday it has started sellin...,https://www.reuters.com/article/us-tesla-china...
1,TSLA,2020-12-31,2020: Several Chinese Stocks Outperformed Thei...,https://cdn.snapi.dev/images/v1/f/j/catalog-ma...,Seeking Alpha,2020: Several Chinese Stocks Outperformed Thei...,https://seekingalpha.com/article/4396892-2020-...
2,TSLA,2020-12-31,EV Company News For The Month Of December 2020,https://cdn.snapi.dev/images/v1/l/r/sssik22-c5...,Seeking Alpha,Global electric car sales records for November...,https://seekingalpha.com/article/4396884-ev-co...
3,TSLA,2020-12-31,"Tesla, Volkswagen, Renault See Strong Share In...",https://cdn.snapi.dev/images/v1/v/x/s3xy-14.jpg,Benzinga,The European market continues to see strong ad...,https://www.benzinga.com/news/20/12/18973120/t...
4,TSLA,2020-12-31,Tech's top seven companies added $3.4 trillion...,https://cdn.snapi.dev/images/v1/s/t/stocks23-1...,CNBC,"Big Tech got much bigger in 2020, and Tesla jo...",https://www.cnbc.com/2020/12/31/techs-top-seve...
...,...,...,...,...,...,...,...
5817,TSLA,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,https://cdn.snapi.dev/images/v1/v/i/viwtqd6wkp...,Bloomberg Technology,Elon Musk is cutting Tesla Inc.'s workforce by...,https://www.youtube.com/watch?v=ViwTqD6WKpA
5818,TSLA,2019-01-17,Tesla to cut full time workforce by roughly 7%...,https://cdn.snapi.dev/images/v1/t/8/t8cnbozbfx...,CNBC Television,The Wall Street Journal is reporting that Tesl...,https://www.youtube.com/watch?v=t8CNBOZBFXc
5819,TSLA,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,https://cdn.snapi.dev/images/v1/w/g/wgegqwogy8...,CNBC Television,CNBC's Jim Cramer discusses his take on the la...,https://www.youtube.com/watch?v=WGEgQWogY8E
5820,TSLA,2019-01-03,Tesla stock drops over missed delivery estimat...,https://cdn.snapi.dev/images/v1/b/e/bebskx74-2...,Fox Business,“Bulls & Bears” panel discusses how Tesla shar...,https://www.youtube.com/watch?v=Bebskx74-2Y


# Plain text approach bag_of_words

In [42]:
for index, each_df in enumerate(df):
  df[index] = each_df[['symbol', 'publishedDate', 'title', 'text']]


In [43]:
df[0]

Unnamed: 0,symbol,publishedDate,title,text
0,TSLA,2020-12-31,Tesla to deliver China-made Model Y SUVs this ...,Tesla Inc said on Friday it has started sellin...
1,TSLA,2020-12-31,2020: Several Chinese Stocks Outperformed Thei...,2020: Several Chinese Stocks Outperformed Thei...
2,TSLA,2020-12-31,EV Company News For The Month Of December 2020,Global electric car sales records for November...
3,TSLA,2020-12-31,"Tesla, Volkswagen, Renault See Strong Share In...",The European market continues to see strong ad...
4,TSLA,2020-12-31,Tech's top seven companies added $3.4 trillion...,"Big Tech got much bigger in 2020, and Tesla jo..."
...,...,...,...,...
5817,TSLA,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,Elon Musk is cutting Tesla Inc.'s workforce by...
5818,TSLA,2019-01-17,Tesla to cut full time workforce by roughly 7%...,The Wall Street Journal is reporting that Tesl...
5819,TSLA,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,CNBC's Jim Cramer discusses his take on the la...
5820,TSLA,2019-01-03,Tesla stock drops over missed delivery estimat...,“Bulls & Bears” panel discusses how Tesla shar...


In [44]:
df

[     symbol publishedDate                                              title  \
 0      TSLA    2020-12-31  Tesla to deliver China-made Model Y SUVs this ...   
 1      TSLA    2020-12-31  2020: Several Chinese Stocks Outperformed Thei...   
 2      TSLA    2020-12-31     EV Company News For The Month Of December 2020   
 3      TSLA    2020-12-31  Tesla, Volkswagen, Renault See Strong Share In...   
 4      TSLA    2020-12-31  Tech's top seven companies added $3.4 trillion...   
 ...     ...           ...                                                ...   
 5817   TSLA    2019-01-18         Behind Elon Musk's Hiring and Firing Spree   
 5818   TSLA    2019-01-17  Tesla to cut full time workforce by roughly 7%...   
 5819   TSLA    2019-01-07  Cramer: New Tesla factory in Shanghai will wor...   
 5820   TSLA    2019-01-03  Tesla stock drops over missed delivery estimat...   
 5821   TSLA    2019-01-01  Tesla misses Wall Street estimates with 90,700...   
 
                          

In [45]:
polarity = df
polarity[0]

Unnamed: 0,symbol,publishedDate,title,text
0,TSLA,2020-12-31,Tesla to deliver China-made Model Y SUVs this ...,Tesla Inc said on Friday it has started sellin...
1,TSLA,2020-12-31,2020: Several Chinese Stocks Outperformed Thei...,2020: Several Chinese Stocks Outperformed Thei...
2,TSLA,2020-12-31,EV Company News For The Month Of December 2020,Global electric car sales records for November...
3,TSLA,2020-12-31,"Tesla, Volkswagen, Renault See Strong Share In...",The European market continues to see strong ad...
4,TSLA,2020-12-31,Tech's top seven companies added $3.4 trillion...,"Big Tech got much bigger in 2020, and Tesla jo..."
...,...,...,...,...
5817,TSLA,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,Elon Musk is cutting Tesla Inc.'s workforce by...
5818,TSLA,2019-01-17,Tesla to cut full time workforce by roughly 7%...,The Wall Street Journal is reporting that Tesl...
5819,TSLA,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,CNBC's Jim Cramer discusses his take on the la...
5820,TSLA,2019-01-03,Tesla stock drops over missed delivery estimat...,“Bulls & Bears” panel discusses how Tesla shar...


In [46]:
# Merge title, text based on publishedDate
for index, each_polarity in enumerate(polarity):
  symbol = each_polarity['symbol'][0]
  title = each_polarity.groupby(['publishedDate'])['title'].apply(','.join).reset_index()
  text = each_polarity.groupby(['publishedDate'])['text'].apply(','.join).reset_index()
  total = pd.merge(title, text, on="publishedDate")
  polarity[index] = total
  polarity[index]['symbol'] = symbol

In [47]:
polarity[0]

Unnamed: 0,publishedDate,title,text,symbol
0,2019-01-01,"Tesla misses Wall Street estimates with 90,700...",CNBC's Phil LeBeau reports on Tesla's deliveri...,TSLA
1,2019-01-03,Tesla stock drops over missed delivery estimat...,“Bulls & Bears” panel discusses how Tesla shar...,TSLA
2,2019-01-07,Cramer: New Tesla factory in Shanghai will wor...,CNBC's Jim Cramer discusses his take on the la...,TSLA
3,2019-01-17,Tesla to cut full time workforce by roughly 7%...,The Wall Street Journal is reporting that Tesl...,TSLA
4,2019-01-18,Behind Elon Musk's Hiring and Firing Spree,Elon Musk is cutting Tesla Inc.'s workforce by...,TSLA
...,...,...,...,...
429,2020-12-25,TSLA Stock Price: $788 Target By JMP Securitie...,The stock price of Tesla Inc (NASDAQ: TSLA) ha...,TSLA
430,2020-12-28,"Tesla To Enter India Early Next Year, Transpor...",Tesla Inc (NASDAQ: TSLA) is set to debut in In...,TSLA
431,2020-12-29,Tesla Stock Price: Target Increase From $300 T...,Shares of Tesla Inc (NASDAQ: TSLA) have receiv...,TSLA
432,2020-12-30,"Move Over, Warren Buffett: This Is the Star In...","Berkshire's best days seem to be behind it, bu...",TSLA


In [48]:
# Merge everyday's closing price with polarity dataframe
index = 0
for ticker in ALL_TICKERS:
  data = pd.read_csv("./stock_price/compare_previous_day/"+ticker[1:]+".csv")
  data = data[['Date', 'Close']]
  data['publishedDate'] = data['Date']
  data = data[['publishedDate', 'Close']]
  polarity[index] = pd.merge(data, polarity[index], on = 'publishedDate', how = 'left').fillna(0)
  polarity[index]['symbol'] = ALL_TICKERS[index][1:]
  index+=1


In [49]:
polarity[0]

Unnamed: 0,publishedDate,Close,title,text,symbol
0,2018-12-31,66.559998,0,0,TSLA
1,2019-01-02,62.023998,0,0,TSLA
2,2019-01-03,60.071999,Tesla stock drops over missed delivery estimat...,“Bulls & Bears” panel discusses how Tesla shar...,TSLA
3,2019-01-04,63.537998,0,0,TSLA
4,2019-01-07,66.991997,Cramer: New Tesla factory in Shanghai will wor...,CNBC's Jim Cramer discusses his take on the la...,TSLA
...,...,...,...,...,...
501,2020-12-24,661.770020,"Top ETF Stories of 2020 & Picks for 2021,Apple...",We discuss the 2020 trends and outlook for 202...,TSLA
502,2020-12-28,663.690002,"Tesla To Enter India Early Next Year, Transpor...",Tesla Inc (NASDAQ: TSLA) is set to debut in In...,TSLA
503,2020-12-29,665.989990,Tesla Stock Price: Target Increase From $300 T...,Shares of Tesla Inc (NASDAQ: TSLA) have receiv...,TSLA
504,2020-12-30,694.780029,"Move Over, Warren Buffett: This Is the Star In...","Berkshire's best days seem to be behind it, bu...",TSLA


In [50]:
from sklearn.preprocessing import MinMaxScaler

In [51]:
for each_polarity in polarity:
  close = each_polarity['Close']
  scaler = MinMaxScaler(feature_range=(0,1))
  close = scaler.fit_transform(np.array(close).reshape(-1,1))
  each_polarity['today_scaler'] = close

In [52]:
for each_polarity in polarity:
  data = pd.read_csv("./stock_price/compare_previous_day/"+each_polarity.symbol.loc[0]+".csv")
  each_polarity['indicator'] = data['indicator']

In [53]:
polarity[0]

Unnamed: 0,publishedDate,Close,title,text,symbol,today_scaler,indicator
0,2018-12-31,66.559998,0,0,TSLA,0.045928,0.0
1,2019-01-02,62.023998,0,0,TSLA,0.039157,0.0
2,2019-01-03,60.071999,Tesla stock drops over missed delivery estimat...,“Bulls & Bears” panel discusses how Tesla shar...,TSLA,0.036243,0.0
3,2019-01-04,63.537998,0,0,TSLA,0.041417,1.0
4,2019-01-07,66.991997,Cramer: New Tesla factory in Shanghai will wor...,CNBC's Jim Cramer discusses his take on the la...,TSLA,0.046573,1.0
...,...,...,...,...,...,...,...
501,2020-12-24,661.770020,"Top ETF Stories of 2020 & Picks for 2021,Apple...",We discuss the 2020 trends and outlook for 202...,TSLA,0.934466,1.0
502,2020-12-28,663.690002,"Tesla To Enter India Early Next Year, Transpor...",Tesla Inc (NASDAQ: TSLA) is set to debut in In...,TSLA,0.937332,1.0
503,2020-12-29,665.989990,Tesla Stock Price: Target Increase From $300 T...,Shares of Tesla Inc (NASDAQ: TSLA) have receiv...,TSLA,0.940765,1.0
504,2020-12-30,694.780029,"Move Over, Warren Buffett: This Is the Star In...","Berkshire's best days seem to be behind it, bu...",TSLA,0.983743,1.0


# Apply bag_of_words and TFIDF

In [54]:
import re
a = "Cramer: New Tesla factory i"
a = re.sub("[^a-zA-Z]+", " ", a)
print(a)

Cramer New Tesla factory i


In [55]:
# Remove punctuations
def removePunc(st):
  s = str(st)
  s = re.sub("[^a-zA-Z]+", " ", s)
  return s

In [56]:
for each_polarity in polarity:
  each_polarity['title'] = each_polarity['title'].apply(removePunc)
  each_polarity['text'] = each_polarity['text'].apply(removePunc)
  #print(each_polarity['title_p'])

In [77]:
polarity[1]

Unnamed: 0,publishedDate,Close,title,text,symbol,today_scaler,indicator,scaler-1,scaler-2,scaler-3,scaler-4,scaler-5,scaler-6
0,2018-12-31,46.930000,,,INTC,0.138745,0.0,0.138745,0.138745,0.138745,0.138745,0.138745,0.138745
1,2019-01-02,47.080002,,,INTC,0.144742,1.0,0.138745,0.138745,0.138745,0.138745,0.138745,0.138745
2,2019-01-03,44.490002,final trade xrt cbs more,final trade xrt cbs more,INTC,0.041184,0.0,0.144742,0.138745,0.144742,0.144742,0.144742,0.144742
3,2019-01-04,47.220001,,,INTC,0.150340,1.0,0.041184,0.144742,0.138745,0.041184,0.041184,0.041184
4,2019-01-07,47.439999,intel ces event in minutes,intel ces event in minutes,INTC,0.159136,1.0,0.150340,0.041184,0.144742,0.138745,0.150340,0.150340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,2020-12-24,47.070000,,,INTC,0.144342,1.0,0.124350,0.108357,0.115954,0.159936,0.287485,0.306277
502,2020-12-28,47.070000,barron s post christmas picks and pans alibaba...,barron s post christmas picks and pans alibaba...,INTC,0.144342,0.0,0.144342,0.124350,0.108357,0.115954,0.159936,0.287485
503,2020-12-29,49.389999,noted activist investor calls on intel to expl...,noted activist investor calls on intel to expl...,INTC,0.237105,1.0,0.144342,0.144342,0.124350,0.108357,0.115954,0.159936
504,2020-12-30,48.750000,intel intc stock sinks as market gains what yo...,intel intc stock sinks as market gains what yo...,INTC,0.211515,0.0,0.237105,0.144342,0.144342,0.124350,0.108357,0.115954


In [58]:
# Convert to lower cases
for each_polarity in polarity:
  each_polarity['title'] = each_polarity['title'].str.lower()
  each_polarity['text'] = each_polarity['title'].str.lower()

In [76]:
previous_days = 6
for each_polarity in polarity:
  # Add scaler to first row
  for i in range(0, previous_days):
    name="scaler-"+str(i+1)
    each_polarity.loc[0, name] = each_polarity.loc[0, 'today_scaler']
  # Add scaler to the remaining rows
  for i in range(1, len(each_polarity)):
    for s in range(0, previous_days):
      name="scaler-"+str(s+1)
      if (i-(s+1) >= 0):
        each_polarity.loc[i, name] = each_polarity.loc[i-(s+1), 'today_scaler']
      else:
        each_polarity.loc[i, name] = each_polarity.loc[i-1, 'today_scaler']

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [78]:
for each_polarity in polarity:
  each_polarity = each_polarity[['publishedDate', 'symbol', 'indicator', 'Close', 'title', 'text', 'today_scaler', 'scaler-1', 'scaler-2', 'scaler-3', 'scaler-4', 'scaler-5', 'scaler-6']]
  print(each_polarity['symbol'][0])
  test = each_polarity
  train=test[test['publishedDate']<'2020-06-01']
  test=test[test['publishedDate']>'2020-05-31']
  data=train.iloc[:,2:4]
  data.replace("[^a-zA-Z]", " ",regex=True, inplace=True)
  # Join title and text
  headlines = []
  for row in range(0, len(data.index)):
    headlines.append(' '.join(str(x) for x in data.iloc[row, 0:2]))
  #implement TF-IDF
  tfvector=TfidfVectorizer(ngram_range=(2,3))
  train_df=tfvector.fit_transform(headlines)
  # MultinomialNB
  nb=MultinomialNB()
  nb.fit(train_df,train['indicator'])
  # Predict for the Test Dataset
  test_transform= []
  for row in range(0,len(test.index)):
      test_transform.append(' '.join(str(x) for x in test.iloc[row,4:13]))
  test_dataset = tfvector.transform(test_transform)


  predictions = nb.predict(test_dataset)
  matrix=confusion_matrix(test['indicator'],predictions)
  print(matrix)
  score=accuracy_score(test['indicator'],predictions)
  print(score)
  report=classification_report(test['indicator'],predictions)
  print(report)


TSLA
[[ 0 61]
 [ 0 89]]
0.5933333333333334
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        61
         1.0       0.59      1.00      0.74        89

    accuracy                           0.59       150
   macro avg       0.30      0.50      0.37       150
weighted avg       0.35      0.59      0.44       150

INTC


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0 70]
 [ 0 80]]
0.5333333333333333
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        70
         1.0       0.53      1.00      0.70        80

    accuracy                           0.53       150
   macro avg       0.27      0.50      0.35       150
weighted avg       0.28      0.53      0.37       150

PFE
[[ 0 81]
 [ 0 69]]
0.46
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        81
         1.0       0.46      1.00      0.63        69

    accuracy                           0.46       150
   macro avg       0.23      0.50      0.32       150
weighted avg       0.21      0.46      0.29       150

SPGI


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0 75]
 [ 0 75]]
0.5
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        75
         1.0       0.50      1.00      0.67        75

    accuracy                           0.50       150
   macro avg       0.25      0.50      0.33       150
weighted avg       0.25      0.50      0.33       150

ADSK
[[ 0 64]
 [ 0 86]]
0.5733333333333334
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        64
         1.0       0.57      1.00      0.73        86

    accuracy                           0.57       150
   macro avg       0.29      0.50      0.36       150
weighted avg       0.33      0.57      0.42       150

VRTX


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0 73]
 [ 0 77]]
0.5133333333333333
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        73
         1.0       0.51      1.00      0.68        77

    accuracy                           0.51       150
   macro avg       0.26      0.50      0.34       150
weighted avg       0.26      0.51      0.35       150

TWTR
[[ 0 61]
 [ 0 89]]
0.5933333333333334
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        61
         1.0       0.59      1.00      0.74        89

    accuracy                           0.59       150
   macro avg       0.30      0.50      0.37       150
weighted avg       0.35      0.59      0.44       150

EBAY


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[ 0 72]
 [ 0 78]]
0.52
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        72
         1.0       0.52      1.00      0.68        78

    accuracy                           0.52       150
   macro avg       0.26      0.50      0.34       150
weighted avg       0.27      0.52      0.36       150

GRMN
[[ 0 70]
 [ 0 80]]
0.5333333333333333
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        70
         1.0       0.53      1.00      0.70        80

    accuracy                           0.53       150
   macro avg       0.27      0.50      0.35       150
weighted avg       0.28      0.53      0.37       150

AAL
[[79  0]
 [71  0]]
0.5266666666666666
              precision    recall  f1-score   support

         0.0       0.53      1.00      0.69        79
         1.0       0.00      0.00      0.00        71

    accuracy                           0.53       150
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
test = polarity[0]

In [None]:
polarity[0]

In [None]:
train=test[test['publishedDate']<'2020-06-01']
test=test[test['publishedDate']>'2020-05-31']

In [None]:
data=train.iloc[:,2:4]
data.replace("[^a-zA-Z]", " ",regex=True, inplace=True)

In [None]:
# Join title and text
headlines = []
for row in range(0, len(data.index)):
  headlines.append(' '.join(str(x) for x in data.iloc[row, 0:2]))

In [None]:
#headlines

In [None]:
#implement TF-IDF
tfvector=TfidfVectorizer(ngram_range=(2,3))
train_df=tfvector.fit_transform(headlines)

In [None]:
train_df

In [None]:
# MultinomialNB
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()
nb.fit(train_df,train['indicator'])


In [None]:
test

In [None]:
# Predict for the Test Dataset
test_transform= []
for row in range(0,len(test.index)):
    test_transform.append(' '.join(str(x) for x in test.iloc[row,1:4]))
test_dataset = tfvector.transform(test_transform)

In [None]:


predictions = nb.predict(test_dataset)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
matrix=confusion_matrix(test['indicator'],predictions)
print(matrix)
score=accuracy_score(test['indicator'],predictions)
print(score)
report=classification_report(test['indicator'],predictions)
print(report)

In [63]:
print(predictions[2])

1.0


In [None]:
test[['title', 'text']].replace("[^a-zA-Z]", " ",regex=True, inplace=True)


In [None]:
a

In [None]:
polarity[0]

# BELOW WAS PAST CODE

In [None]:
previous_days = 5
for each_polarity in polarity:
  # Add scaler to first row
  for i in range(0, previous_days):
    name="scaler-"+str(i+1)
    each_polarity.loc[0, name] = each_polarity.loc[0, 'today_scaler']
  # Add scaler to the remaining rows
  for i in range(1, len(each_polarity)):
    for s in range(0, previous_days):
      name="scaler-"+str(s+1)
      if (i-(s+1) >= 0):
        each_polarity.loc[i, name] = each_polarity.loc[i-(s+1), 'today_scaler']
      else:
        each_polarity.loc[i, name] = each_polarity.loc[i-1, 'today_scaler']

In [None]:
polarity[0]

# Calculate title and text's daily polarity mean respectively

In [None]:
polarity = []
# Calcuate mean on daily basis
for each_df in df:
  each_title_mean = each_df.groupby('publishedDate', as_index=False)['title_compound'].mean()
  each_text_mean = each_df.groupby('publishedDate', as_index=False)['text_compound'].mean()
  each_polarity = pd.merge(each_title_mean, each_text_mean, on='publishedDate')
  each_polarity['symbol'] = each_df['symbol']
  polarity.append(each_polarity)

In [None]:
#polarity[0]

# Merge everyday's closing price with polarity dataframe, but show everyday's closing price

In [None]:
# Merge everyday's closing price with polarity dataframe
index = 0
for ticker in ALL_TICKERS:
  data = pd.read_csv("./stock_price/compare_previous_day/"+ticker[1:]+".csv")
  data = data[['Date', 'Close']]
  data['publishedDate'] = data['Date']
  data = data[['publishedDate', 'Close']]
  polarity[index] = pd.merge(data, polarity[index], on = 'publishedDate', how = 'left').fillna(0)
  index+=1


In [None]:
# Fill symbol column for each company
for index, each_polarity in enumerate(polarity):
  each_polarity['symbol'] = ALL_TICKERS[index][1:]

In [None]:
#polarity[0]

# Apply MinMax scaler to Close price

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
for each_polarity in polarity:
  close = each_polarity['Close']
  scaler = MinMaxScaler(feature_range=(0,1))
  close = scaler.fit_transform(np.array(close).reshape(-1,1))
  each_polarity['today_scaler'] = close

In [None]:
#polarity[1]

# Merge everyday's indicator with poliarity dataframe

In [None]:
for each_polarity in polarity:
  data = pd.read_csv("./stock_price/compare_previous_day/"+each_polarity.symbol.loc[0]+".csv")
  each_polarity['indicator'] = data['indicator']

In [None]:
#polarity[0]

# Add previous 3 day's scaler to each row

In [None]:
previous_days = 5
for each_polarity in polarity:
  # Add scaler to first row
  for i in range(0, previous_days):
    name="scaler-"+str(i+1)
    each_polarity.loc[0, name] = each_polarity.loc[0, 'today_scaler']
  # Add scaler to the remaining rows
  for i in range(1, len(each_polarity)):
    for s in range(0, previous_days):
      name="scaler-"+str(s+1)
      if (i-(s+1) >= 0):
        each_polarity.loc[i, name] = each_polarity.loc[i-(s+1), 'today_scaler']
      else:
        each_polarity.loc[i, name] = each_polarity.loc[i-1, 'today_scaler']

In [None]:
polarity[0]

# Add S&P 500 Index

In [None]:
sp_df = pd.read_csv("./stock_price/original/S&P500.csv")
def reorder_date(date):
  return date[6:10]+'-'+date[0:5]
sp_df['publishedDate'] = sp_df['Date'].apply(reorder_date)
sp_df = sp_df[['publishedDate', 'Close']]
sp_df = sp_df.iloc[::-1]
sp_df = sp_df.rename(columns= {'Close': "s&p500_close_price"}, inplace=False)
# Convert str to float
sp_df['s&p500_close_price'] = pd.to_numeric(sp_df['s&p500_close_price'], downcast="float")
sp_df


In [None]:
# Merge with polarity dataframe
for index, ep in enumerate(polarity):
  polarity[index] = pd.merge(ep, sp_df, on="publishedDate")

In [None]:
polarity[1]

# Predict the trend using MLPClassifier model

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [None]:
@ignore_warnings(category=ConvergenceWarning)
def predict(polarity):
  avg_accuracy = 0
  for each_polarity in polarity:
    y = each_polarity['indicator']
    X = each_polarity.drop(columns=['indicator','publishedDate','symbol'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # apply MLPClassifier
    out_date = each_polarity.publishedDate.values
    #rf = RandomForestRegressor(n_estimators=150)
    nn = MLPClassifier(
      hidden_layer_sizes=(90,10),
      random_state=0,
      max_iter=60,
    )
    nn.fit(X_train, y_train)
    #rf.fit(X_train, y_train)
    y_pred = nn.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    avg_accuracy+=accuracy_score(y_test, y_pred)
    #mse = mean_squared_error(y_test, y_pred)
    #corr = np.corrcoef(X_train, y_train, rowvar=False)[-1, :-1]
    #corr = corr.max()
    accuracy = accuracy_score(y_test, y_pred)
    f = open(each_polarity.symbol.iloc[0]+".summary.csv", "w")
    f.write("accuracy\n")
    f.write("{:.2f}\n".format(accuracy))
    f.close()

    f = open(each_polarity.symbol.iloc[0]+".output.csv", "w")
    f.write("date,predicted_indicator\n")
    for i in range(y_pred.shape[0]):
        f.write("{},{}\n".format(out_date[i], y_pred[i]))
    f.close()
  print("AVERAGE ACCURACY: "+str(avg_accuracy/11))

In [None]:
predict(polarity)