In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
import xgboost
import lightgbm



In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Textual News Preparation

In [3]:
news_df = pd.read_csv('/content/india-news-headlines.csv')

In [4]:
news_df.shape

(3424067, 3)

In [5]:
news_df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [6]:
news_df.drop('headline_category', axis=1, inplace=True)
news_df = news_df.rename(columns={'publish_date':'Date', 'headline_text':'News'}, inplace=False)

In [7]:
news_df.head()

Unnamed: 0,Date,News
0,20010102,Status quo will not be disturbed at Ayodhya; s...
1,20010102,Fissures in Hurriyat over Pak visit
2,20010102,America's unwanted heading for India?
3,20010102,For bigwigs; it is destination Goa
4,20010102,Extra buses to clear tourist traffic


In [8]:
news_df["Date"] = pd.to_datetime(news_df["Date"],format='%Y%m%d')
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424067 entries, 0 to 3424066
Data columns (total 2 columns):
 #   Column  Dtype         
---  ------  -----         
 0   Date    datetime64[ns]
 1   News    object        
dtypes: datetime64[ns](1), object(1)
memory usage: 52.2+ MB


In [9]:
news_df.head()

Unnamed: 0,Date,News
0,2001-01-02,Status quo will not be disturbed at Ayodhya; s...
1,2001-01-02,Fissures in Hurriyat over Pak visit
2,2001-01-02,America's unwanted heading for India?
3,2001-01-02,For bigwigs; it is destination Goa
4,2001-01-02,Extra buses to clear tourist traffic


In [10]:
# Group the headlines for each day
news_df['News'] = news_df.groupby(['Date']).transform(lambda x : ' '.join(x)) 
news_df = news_df.drop_duplicates() 
news_df.reset_index(inplace=True,drop=True)

In [11]:
#cheak for missing values
news_df.isnull().sum()

Date    0
News    0
dtype: int64

In [12]:
news_df.shape

(7262, 2)

## Stock data Analysis

In [13]:
S_df = pd.read_csv('/content/st.csv')

In [14]:
S_df['Date'] = pd.to_datetime(S_df.Date)

In [15]:
len(S_df)

2763

Remove Unwanted Characters from the News

In [16]:
#removing unwanted characters from the News
news_df.replace("[^a-zA-Z']"," ",regex=True,inplace=True)
news_df["News"].head(5)

0    Status quo will not be disturbed at Ayodhya  s...
1    Powerless north India gropes in the dark Think...
2    The string that pulled Stephen Hawking to Indi...
3    Light combat craft takes India into club class...
4    Light combat craft takes India into club class...
Name: News, dtype: object




## Textual Analysis

In [17]:
#Functions to get the subjectivity and polarity
def getSubjectivity(text):
  return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
  return  TextBlob(text).sentiment.polarity

In [18]:
#Adding subjectivity and polarity columns
news_df['Subjectivity'] = news_df['News'].apply(getSubjectivity)
news_df['Polarity'] = news_df['News'].apply(getPolarity)
news_df

Unnamed: 0,Date,News,Subjectivity,Polarity
0,2001-01-02,Status quo will not be disturbed at Ayodhya s...,0.286859,0.143590
1,2001-01-03,Powerless north India gropes in the dark Think...,0.392857,0.089286
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.445360,0.093039
3,2001-01-05,Light combat craft takes India into club class...,0.480553,0.264024
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485
...,...,...,...,...
7257,2020-12-27,BigInterview Dhritiman Chatterjee Nobody da...,0.392082,0.042978
7258,2020-12-28,Horoscope Today December Check astro...,0.409973,0.071405
7259,2020-12-29,Man recovers charred remains of 'thief' from h...,0.415684,0.060775
7260,2020-12-30,Numerology Readings December Predicti...,0.436863,0.046930


In [19]:
#Adding sentiment score to news
sia = SentimentIntensityAnalyzer()


news_df['Compound'] = [sia.polarity_scores(v)['compound'] for v in news_df['News']]
news_df['Negative'] = [sia.polarity_scores(v)['neg'] for v in news_df['News']]
news_df['Neutral'] = [sia.polarity_scores(v)['neu'] for v in news_df['News']]
news_df['Positive'] = [sia.polarity_scores(v)['pos'] for v in news_df['News']]
news_df

Unnamed: 0,Date,News,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,2001-01-02,Status quo will not be disturbed at Ayodhya s...,0.286859,0.143590,-0.9792,0.121,0.809,0.071
1,2001-01-03,Powerless north India gropes in the dark Think...,0.392857,0.089286,-0.8910,0.156,0.735,0.109
2,2001-01-04,The string that pulled Stephen Hawking to Indi...,0.445360,0.093039,0.7543,0.104,0.792,0.104
3,2001-01-05,Light combat craft takes India into club class...,0.480553,0.264024,0.9365,0.142,0.696,0.161
4,2001-01-06,Light combat craft takes India into club class...,0.439394,0.248485,-0.8316,0.214,0.655,0.131
...,...,...,...,...,...,...,...,...
7257,2020-12-27,BigInterview Dhritiman Chatterjee Nobody da...,0.392082,0.042978,-0.9997,0.129,0.793,0.079
7258,2020-12-28,Horoscope Today December Check astro...,0.409973,0.071405,-0.9998,0.142,0.761,0.097
7259,2020-12-29,Man recovers charred remains of 'thief' from h...,0.415684,0.060775,-0.9999,0.151,0.753,0.096
7260,2020-12-30,Numerology Readings December Predicti...,0.436863,0.046930,-0.9999,0.146,0.770,0.084


Merge the Historical and Textual Data

In [20]:
df_merge = pd.merge(S_df, news_df, how='inner', on='Date')
df_merge

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,News,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,2010-01-04,17473.449219,17582.839844,17378.380859,17558.730469,28200,0,0,Bebo Khan't stop loving Chor Bazaari top song ...,0.462011,0.050055,-0.9994,0.154,0.739,0.107
1,2010-01-05,17555.769531,17729.779297,17555.769531,17686.240234,27000,0,0,Women say no to one night stands MJ's new trac...,0.412185,0.009012,-0.9996,0.144,0.781,0.075
2,2010-01-06,17719.470703,17790.330078,17636.710938,17701.130859,21400,0,0,Are ladies really THIS safe on Nagpur's buses ...,0.433073,0.051819,-0.9995,0.149,0.753,0.098
3,2010-01-07,17701.970703,17733.339844,17566.539062,17615.720703,18000,0,0,Dev Patel meets the Pintos Rahman's going gree...,0.408312,0.041015,-0.9998,0.178,0.716,0.105
4,2010-01-08,17603.869141,17658.119141,17508.960938,17540.289062,17200,0,0,Points table Bicholim fire cell saved lives...,0.378408,0.033051,-0.9999,0.180,0.724,0.096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2696,2020-12-24,46743.488281,47053.398438,46539.019531,46973.539062,13700,0,0,How to set the mood for sex during cold winter...,0.387011,0.053569,-0.9985,0.094,0.836,0.070
2697,2020-12-28,47153.589844,47406.718750,47148.238281,47353.750000,9600,0,0,Horoscope Today December Check astro...,0.409973,0.071405,-0.9998,0.142,0.761,0.097
2698,2020-12-29,47466.621094,47714.550781,47361.898438,47613.078125,12800,0,0,Man recovers charred remains of 'thief' from h...,0.415684,0.060775,-0.9999,0.151,0.753,0.096
2699,2020-12-30,47789.031250,47807.851562,47358.359375,47746.218750,15600,0,0,Numerology Readings December Predicti...,0.436863,0.046930,-0.9999,0.146,0.770,0.084


Create Dataset for Model Training

In [47]:
dfmerge = df_merge[['Close','Subjectivity', 'Polarity', 'Compound', 'Negative', 'Neutral', 'Positive']]
dfmerge1=dfmerge.copy()
dfmerge1.head()

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,17558.730469,0.462011,0.050055,-0.9994,0.154,0.739,0.107
1,17686.240234,0.412185,0.009012,-0.9996,0.144,0.781,0.075
2,17701.130859,0.433073,0.051819,-0.9995,0.149,0.753,0.098
3,17615.720703,0.408312,0.041015,-0.9998,0.178,0.716,0.105
4,17540.289062,0.378408,0.033051,-0.9999,0.18,0.724,0.096


In [48]:
scaler = MinMaxScaler()

dfmerge1 = pd.DataFrame(scaler.fit_transform(dfmerge1))
dfmerge1.columns = dfmerge.columns
dfmerge1.head()

Unnamed: 0,Close,Subjectivity,Polarity,Compound,Negative,Neutral,Positive
0,0.073171,0.768548,0.358207,0.0003,0.492647,0.412698,0.45082
1,0.077086,0.476872,0.11993,0.0002,0.419118,0.634921,0.188525
2,0.077543,0.599145,0.36845,0.00025,0.455882,0.486772,0.377049
3,0.074921,0.454195,0.305724,0.0001,0.669118,0.291005,0.434426
4,0.072605,0.279145,0.259492,5e-05,0.683824,0.333333,0.360656


In [49]:
X=dfmerge1.drop('Close',axis=1)
y=dfmerge1['Close']

In [50]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

Random Forest Regressor

In [51]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
prediction=rf.predict(X_test)

In [52]:
print(prediction[:10])
print(y_test[:10])
print('Mean Squared error: ',mean_squared_error(prediction,y_test))

[0.12910538 0.38383745 0.22822389 0.32270705 0.38222651 0.18358077
 0.37763033 0.61651534 0.4384107  0.33845131]
92      0.041275
2133    0.705509
895     0.105080
1721    0.355601
1300    0.385555
868     0.127384
702     0.109907
2494    0.719807
861     0.113606
1032    0.194583
Name: Close, dtype: float64
Mean Squared error:  0.04984892840704085


DecisionTreeRegressor

In [55]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
predictions = dtr.predict(X_test)

In [56]:
print(predictions[:10])
print(y_test[:10])
print('Mean Squared error: ',mean_squared_error(predictions,y_test))

[0.08125186 0.22136341 0.16856055 0.09628211 0.625017   0.28486887
 0.00970859 0.66191206 0.64912967 0.41399795]
92      0.041275
2133    0.705509
895     0.105080
1721    0.355601
1300    0.385555
868     0.127384
702     0.109907
2494    0.719807
861     0.113606
1032    0.194583
Name: Close, dtype: float64
Mean Squared error:  0.09182795810959156


AdaBoostRegressor

In [57]:
adb = AdaBoostRegressor()
adb.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [58]:
predictions = adb.predict(X_test)
print(mean_squared_error(predictions, y_test))

0.05466456730168858


Light GBM

In [59]:
gbm = lightgbm.LGBMRegressor()
gbm.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [60]:
predictions = gbm.predict(X_test)
print(mean_squared_error(predictions, y_test))

0.04977115504616173


Xgboost

In [61]:
xgb = xgboost.XGBRegressor()
xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [62]:
predictions = xgb.predict(X_test)
print(mean_squared_error(predictions, y_test))

0.04832368478308566


In [63]:
#hyperparameter tuning
params={
    'learning_rate'   : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth'       : [3,4,5,6,8,10,12],
    'min_child_weight': [1,3,5,7],
    'gamma'           : [0.0, 0.1,0.2,0.3,0.4]
    
}

In [64]:
Xgreg=xgboost.XGBRegressor()
Xgreg=RandomizedSearchCV(Xgreg,param_distributions=params,cv=10,verbose=False)
Xgreg.fit(X_train,y_train)



RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=0,
                                          reg_lambda=1, scale_pos_weight=1,
                                          seed=None, silent=None, subsample=1,
                                 

In [65]:
predictions = Xgreg.predict(X_test)
print(mean_squared_error(predictions, y_test))

0.04876030512877993


## Conclusion

I was able to create a hybrid model for stock price/performance prediction using numerical analysis of historical stock prices, and sentimental analysis of news headlines.¶