In [9]:
#IMPORT LIBRARIES

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
nltk.download('gutenberg') #needed to access the raw text of a book
nltk.download('punkt') #needed to tokenize sentences
nltk.download('vader_lexicon') #NLTK's vader tool relies on a sentiment lexicon!

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load numeric data from Sheet1
num_df = pd.read_excel('Proj2Data.xlsx', sheet_name='Numeric Data')
num_df.head(10)

Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price
0,1,44.55,44.83,44.32,44.36,43.82,40.23,42.25,1500700,44.83
1,2,44.89,45.12,44.17,44.37,44.06,40.32,42.27,2391800,45.03
2,3,45.04,45.35,44.84,44.57,44.21,40.42,42.27,1723400,44.97
3,4,45.02,45.15,44.7,44.65,44.4,40.5,42.28,1490500,44.91
4,5,44.75,45.04,44.65,44.76,44.58,40.59,42.28,1349500,45.31
5,6,45.04,45.35,44.53,44.88,44.62,40.69,42.29,1707700,45.66
6,7,45.24,45.83,45.24,45.01,44.69,40.78,42.3,2389800,45.49
7,8,45.85,46.2,45.48,45.18,44.87,40.87,42.31,3256800,45.21
8,9,45.38,45.6,45.17,45.27,44.96,40.97,42.31,1955500,44.93
9,10,45.31,45.44,44.91,45.32,45.04,41.06,42.32,2211100,44.83


In [3]:
# Load text data (tweets) from Sheet2
text_df = pd.read_excel('Proj2Data.xlsx', sheet_name='Tweets')
text_df.head(10)

Unnamed: 0,day,tweet
0,1,"#Dan ($Dan) Doubles Down on Healthy, Eco-Frien..."
1,1,RT @DvdndDiplomats: Bert's X Always Buy stocks...
2,1,$Dan Alert From our Stock News Alerts App
3,1,X NEW Stocks at #FusionIQ with Master Scores >...
4,1,"#AmazonPrime creates a captive audience, so ""b..."
5,1,$Dan on watch if this pulls back I will be in....
6,1,Weekly S&P100 #Stocks Trend $Dan @
7,1,"How #Dan is upgrading its #healthy, #eco-frien..."
8,1,Why Big #Retailers Are Going Solar: $Dan
9,1,Dan Co. $Dan Given Buy Rating at Piper Jaffray


In [None]:
# Tweet Data Cleaning


In [16]:
#USING TEXT BLOB
#define a few collections to hold polarity and subjectivity scores for each sentence
polarities = []
subjectivities = []

#get polarity and subjectivity scores for each sentence
for sentence in text_df.tweet:
  #the str() function is used to ensure that the input value is a string
  blob = TextBlob(str(sentence)) 
  #get polarity and subjectivity scores
  polarity, subjectivity = blob.polarity, blob.subjectivity
  #print('sentence: "{}", polarity: {:.3f}, subjectivity: {:.3f}'.format(sentence, polarity, subjectivity))
  polarities.append(polarity)
  subjectivities.append(subjectivity)

#calculate and display average polarity and subjectivity scores for all sentences
print('average polarity: {:.3f}'.format(np.mean(polarities)))
print('average subjectivity: {:.3f}'.format(np.mean(subjectivities)))


average polarity: 0.059
average subjectivity: 0.269


In [19]:
#USING NLTK
#define a sentiment analyzer object
analyzer = SentimentIntensityAnalyzer()

#apply sentiment analysis to each tweet in the dataframe and calculate average scores
text_df['sentiment_scores'] = text_df['tweet'].apply(lambda tweet: analyzer.polarity_scores(str(tweet)))


#extract individual sentiment scores
text_df['neg_score'] = text_df['sentiment_scores'].apply(lambda score: score['neg'])
text_df['neu_score'] = text_df['sentiment_scores'].apply(lambda score: score['neu'])
text_df['pos_score'] = text_df['sentiment_scores'].apply(lambda score: score['pos'])
text_df['compound_score'] = text_df['sentiment_scores'].apply(lambda score: score['compound'])

#calculate average scores
average_scores = text_df[['neg_score', 'neu_score', 'pos_score', 'compound_score']].mean()

#print average sentiment scores
print('Average sentiment scores - negative: {:.3f}, neutral: {:.3f}, positive: {:.3f}, overall: {:.3f}'.format(
    average_scores['neg_score'], average_scores['neu_score'], average_scores['pos_score'], average_scores['compound_score']))


Average sentiment scores - negative: 0.036, neutral: 0.872, positive: 0.092, overall: 0.125


In [13]:
num_df.info()
print("\n")
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300 entries, 0 to 1299
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   day                     1300 non-null   int64  
 1   open_price              1300 non-null   float64
 2   high_price              1300 non-null   float64
 3   low_price               1300 non-null   float64
 4   moving_average_5_day    1300 non-null   float64
 5   moving_average_10_day   1300 non-null   float64
 6   moving_average_50_day   1300 non-null   float64
 7   moving_average_200_day  1300 non-null   float64
 8   volume                  1300 non-null   int64  
 9   next_day_close_price    1000 non-null   float64
dtypes: float64(8), int64(2)
memory usage: 101.7 KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100767 entries, 0 to 100766
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   day     100767

In [14]:
num_df.describe()

Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price
count,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1000.0
mean,650.5,51.764592,52.318354,51.232192,51.604431,51.469623,50.392154,47.170069,2325961.0,42.26556
std,375.421985,21.392828,21.626325,21.194475,21.167761,20.969752,19.489467,14.551042,1636736.0,9.833662
min,1.0,27.63,27.93,26.71,27.94,28.3,29.56,30.98,643800.0,27.6
25%,325.75,37.81,38.175,37.44,37.87,37.8475,38.6875,37.965,1474500.0,36.5375
50%,650.5,43.115,43.575,42.82,43.03,42.89,42.54,42.355,1940300.0,40.175
75%,975.25,61.7325,62.525,61.075,61.3325,61.285,60.01,54.9825,2696550.0,45.52
max,1300.0,117.51,117.66,116.57,115.49,114.07,107.12,94.5,20786500.0,71.07


In [18]:
#create a close_price column (to train todays next day close is tomorrows close)
num_df["close"] = num_df["next_day_close_price"].shift(+1)
#target to buy if 1 coz it means profit (tomorrows price will be greater than todays)
num_df["Target"] = (num_df["next_day_close_price"] > num_df["close"]).astype(int)

num_df.head(10)

Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price,close,Target
0,1,44.55,44.83,44.32,44.36,43.82,40.23,42.25,1500700,44.83,,0
1,2,44.89,45.12,44.17,44.37,44.06,40.32,42.27,2391800,45.03,44.83,1
2,3,45.04,45.35,44.84,44.57,44.21,40.42,42.27,1723400,44.97,45.03,0
3,4,45.02,45.15,44.7,44.65,44.4,40.5,42.28,1490500,44.91,44.97,0
4,5,44.75,45.04,44.65,44.76,44.58,40.59,42.28,1349500,45.31,44.91,1
5,6,45.04,45.35,44.53,44.88,44.62,40.69,42.29,1707700,45.66,45.31,1
6,7,45.24,45.83,45.24,45.01,44.69,40.78,42.3,2389800,45.49,45.66,0
7,8,45.85,46.2,45.48,45.18,44.87,40.87,42.31,3256800,45.21,45.49,0
8,9,45.38,45.6,45.17,45.27,44.96,40.97,42.31,1955500,44.93,45.21,0
9,10,45.31,45.44,44.91,45.32,45.04,41.06,42.32,2211100,44.83,44.93,0


In [28]:
#Missing Values or Nan's:
num_df.isnull().sum()

day                         0
open_price                  0
high_price                  0
low_price                   0
moving_average_5_day        0
moving_average_10_day       0
moving_average_50_day       0
moving_average_200_day      0
volume                      0
next_day_close_price      300
close                     300
Target                      0
dtype: int64

In [39]:
#create a dataframe containing only those rows for which predictions need to be made
#(predictions for the next day’s closing price)
df_predict = num_df[pd.isnull(num_df.next_day_close_price) == True].copy()
df_predict.head(10)

#remove all incomplete rows from the 'df' dataframe
Ndf = num_df[pd.isnull(num_df.next_day_close_price) == False].copy()
Ndf.close[0] = 44.58
Ndf.head(3)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Ndf.close[0] = 44.58


Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price,close,Target
0,1,44.55,44.83,44.32,44.36,43.82,40.23,42.25,1500700,44.83,44.58,0
1,2,44.89,45.12,44.17,44.37,44.06,40.32,42.27,2391800,45.03,44.83,1
2,3,45.04,45.35,44.84,44.57,44.21,40.42,42.27,1723400,44.97,45.03,0


In [40]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)

train = Ndf
test = df_predict

predictors = ["day", "close", "open_price", "high_price", "low_price", "volume", "moving_average_5_day", "moving_average_50_day", "moving_average_10_day", "moving_average_200_day"]
model.fit(train[predictors], train["next_day_close_price"])


ValueError: ignored

In [21]:
from sklearn.metrics import precision_score

preds = model.predict(test[predictors])
preds = pd.Series(preds, index=test.index)
precision_score(test["Target"], preds)

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [None]:
#save prediction to a CSV file
df_predict[['day', 'next_day_close_price']].to_csv('Jhobalia+Kelly+Jain, Pooja+Hanna+Jayanti.csv', index=False)