In [1]:
!pip install yfinance
!pip install pyspark
!pip install findspark
!pip install dateparser
!pip install vaderSentiment
import pandas as pd
import yfinance as yf
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup as bs
import requests
from pyspark.sql.functions import sum,max,min,mean,count
import datetime as dt
import pyspark
from pyspark.sql import SparkSession
import pyspark.pandas as ps
import findspark

import yaml
from yaml.loader import SafeLoader
from os.path import abspath

warehouse_location = abspath('spark-warehouse')
with open('cfg.yml') as f:
    config = yaml.load(f, Loader = SafeLoader)

findspark.init()
spark = SparkSession.builder \
    .master(config['spark']['spark_master'])\
    .appName('gather')\
    .enableHiveSupport()\
    .config('spark.sql.warehouse.dir', warehouse_location)\
    .config(config['spark']['spark_jars'], config['spark']['spark_jars_path'])\
    .config('spark.cores.max', '2')\
    .config('spark.executor.cores', '2')\
    .getOrCreate()
spark.sparkContext.setLogLevel("WARN")
spark 









RuntimeError: Java gateway process exited before sending its port number

In [None]:
url = config['postgres']['url']
props = {
    'user': config['postgres']['user'],
    'password' : config['postgres']['user'],
    'url': url,
    'driver': config['postgres']['driver']
    
}

In [None]:
#retrieve headlines from financial post
headers = {'User-Agent':
	'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0'}
def gather_headlines(company_name, ticker):
    headlines = []
    dates = []
    for i in range(10, 30000, 10):    # Running for-loop
        info_url = "https://financialpost.com/search/?search_text="+company_name +"&date_range=-3650d&sort=asc&from="+str(i)
        page = requests.get(info_url, headers = headers)
        parser = bs(page.content, "html.parser" )
        date = parser.body.find_all('div', attrs={'class': 'article-card__meta-bottom'})
        for span in date:
            dates.append(span.text.split("   ")[1])
        headline = parser.body.find_all('h3', class_ = 'article-card__headline text-size--extra-large--sm-up')
        for x in headline:
            headlines.append(x.text)
    dates = dates[:len(headlines)]
    file = {'date' : dates, "headline" : headlines}
    file = pd.DataFrame(file)
    print(file.head())
    file['ticker'] = ticker
    return file

#calculate sentiment scores for each headlines and append to dataset
def analyze_sent(df):
    analyze_obj = SentimentIntensityAnalyzer()
    df['sentiment']=df['headline'].apply(lambda headline: analyze_obj.polarity_scores(str(headline))['compound'])
    df.fillna(0, inplace = True)
    return df

def final_sentiment(df):
    return df.withColumn("sent_score", df.mean_sentiment*(df.headline_count**2)).drop('headline', 'headline_count', 'mean_sentiment')

In [None]:
import dateparser
ticker_list = ['MSFT', 'AMZN']
company_list = ['microsoft', 'amazon']


def process_headlines(ticker_list, company_list):
    dfs = []
    for tick, company in zip(ticker_list, company_list):
        data = gather_headlines(company, tick)
        dfs.append(data)
    full_df = pd.concat(dfs)
    dates = []
    for index, row in full_df.iterrows():
        date = dateparser.parse(row['date'], date_formats = ["%d-%m-%y"])
        dates.append(date.date())
    full_df['date'] = dates
    full_df = ps.from_pandas(full_df)
    print(full_df.head())
    full_df = analyze_sent(full_df)
    full_df = full_df.to_spark()
    full_df.show()
    aggregated = full_df.groupBy('date', 'ticker').agg(count('headline').alias('headline_count'), mean('sentiment').alias("mean_sentiment"))
    final_news = final_sentiment(aggregated) 
    final_news.write.format("jdbc")\
        .option("url", "jdbc:postgresql://localhost:5432/financials") \
        .option("driver", "org.postgresql.Driver").option("dbtable", "sentiment") \
        .option("user", "adam").option("password", "green").mode('append').save()

process_headlines(ticker_list, company_list)

             date                                           headline
0   April 9, 2013   The Double Dragon II remake is so bad you sho...
1   April 9, 2013   Microsoft, Nokia file EU antitrust complaint ...
2  April 10, 2013   Canadian students show off their games at Lev...
3  April 10, 2013   Personal computer shipments shrink 14% in wor...
4  April 11, 2013         What you need to know before markets open 
             date                                           headline
0  April 24, 2013   Kindle TV? Amazon said to plan set-top box fo...
1  April 25, 2013         CIPO's Amazon guidelines raise new issues 
2  April 25, 2013   Amazon beats estimates as investments in digi...
3  April 25, 2013   Shoppers Drug Mart ups pharmacy market share ...
4  April 26, 2013   Amazon shares fall in biggest decline since F...


                                                                                

         date                                                                                   headline ticker
0  2013-04-09   The Double Dragon II remake is so bad you shouldn't even waste time reading this review    MSFT
1  2013-04-09                          Microsoft, Nokia file EU antitrust complaint over Google Android    MSFT
2  2013-04-10                                   Canadian students show off their games at Level Up 2013    MSFT
3  2013-04-10                              Personal computer shipments shrink 14% in worst-ever decline    MSFT
4  2013-04-11                                                 What you need to know before markets open    MSFT


                                                                                

+----------+--------------------+------+---------+
|      date|            headline|ticker|sentiment|
+----------+--------------------+------+---------+
|2013-04-09| The Double Drago...|  MSFT|   -0.487|
|2013-04-09| Microsoft, Nokia...|  MSFT|   -0.296|
|2013-04-10| Canadian student...|  MSFT|      0.0|
|2013-04-10| Personal compute...|  MSFT|      0.0|
|2013-04-11| What you need to...|  MSFT|      0.0|
|2013-04-11| Microsoft's Wind...|  MSFT|  -0.7579|
|2013-04-11| Microsoft falls ...|  MSFT|  -0.4767|
|2013-04-11| Electronic Arts ...|  MSFT|      0.0|
|2013-04-12| 4.12.13: BlackBe...|  MSFT|      0.0|
|2013-04-12| Motocross Madnes...|  MSFT|  -0.4939|
|2013-04-12| Who says account...|  MSFT|  -0.3182|
|2013-04-15| 4.15.13: Gold an...|  MSFT|      0.0|
|2013-04-15| Microsoft smartw...|  MSFT|      0.0|
|2013-04-16| Facebook, Apple ...|  MSFT|      0.0|
|2013-04-16| Facebook Home se...|  MSFT|      0.0|
|2013-04-17| Buying defensive...|  MSFT|  -0.1531|
|2013-04-17| 4.17.13: Stickin..

                                                                                

23/03/26 02:45:24 WARN HeartbeatReceiver: Removing executor 0 with no recent heartbeats: 28705741 ms exceeds timeout 120000 ms
23/03/26 02:45:25 ERROR TaskSchedulerImpl: Lost executor 0 on 10.0.2.15: worker lost


In [None]:
def get_financials(ticker, start):
    time_delt = dt.timedelta(days = 150)
    start_day = start - time_delt
    data = yf.download(str(ticker), start_day)
    data['ticker'] = ticker
    data = data.reset_index()
    data = data.rename(columns = {'Date':'date', 'Open':'open', 'High':'high', 'Low':'low', 'Close':'close', 'Adj Close': 'adj_close', 'Volume':'volume'})
    print('success!')
    return data
                       
                       
def EWMA(data, ndays): 
    EMA = pd.Series(data['close'].ewm(span = ndays, min_periods = ndays - 1).mean(), 
                 name = 'EWMA_' + str(ndays)) 
    data = data.join(EMA) 
    return data

def rsi(close, periods = 14):
    
    close_delta = close.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()

    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

def BBANDS(data, window):
    MA = data.close.rolling(window).mean()
    SD = data.close.rolling(window).std()
    data['MiddleBand'] = MA
    data['UpperBand'] = MA + (2 * SD) 
    data['LowerBand'] = MA - (2 * SD)
    return data

def prep_financials(df):
    df = pd.DataFrame(df)
    #df.set_index('date')
    df['target'] = (df['close'].shift(-1))
    df['tenmda'] = df['close'].rolling(10).mean()
    df['twentymda'] = df['close'].rolling(20).mean()
    df['fiftymda'] = df['close'].rolling(50).mean()
    df['hundredmda'] = df['close'].rolling(100).mean()
    df = EWMA(df, 20)
    df = EWMA(df, 50) 
    df = EWMA(df, 100)
    df['rsi'] = rsi(df['close'])
    df = BBANDS(df, 40)
    df.dropna(inplace = True)
    df.reset_index()
    print(df.head())
    return df

In [None]:
def process_finance(ticker_list):
    finance_dfs = []
    for tick in ticker_list:
        data = get_financials(tick, dt.date(2015,1, 1))
        data = prep_financials(data)
        finance_dfs.append(data)
    final_finance = pd.concat(finance_dfs)
    final_finance = spark.createDataFrame(final_finance)
    final_finance.write.format("jdbc")\
        .option("url", "jdbc:postgresql://localhost:5432/financials") \
        .option("driver", "org.postgresql.Driver").option("dbtable", "company_data") \
        .option("user", "adam").option("password", "green").mode('append').save()
    
ticker_list = ['NFLX','AMZN', 'TSLA']
process_finance(ticker_list)

[*********************100%***********************]  1 of 1 completed
success!
          date       open       high        low      close  adj_close  \
99  2014-12-23  48.287144  48.428570  47.472858  48.061428  48.061428   
100 2014-12-24  48.072857  49.070000  47.998569  48.871429  48.871429   
101 2014-12-26  48.844288  49.484287  48.534286  48.578571  48.578571   
102 2014-12-29  47.970001  49.104286  47.652859  48.847141  48.847141   
103 2014-12-30  48.714287  49.139999  48.538570  49.032856  49.032856   

      volume ticker     target     tenmda  twentymda   fiftymda  hundredmda  \
99   8291500   NFLX  48.871429  47.545857  48.635500  52.163457   58.913929   
100  5411000   NFLX  48.578571  47.657000  48.586286  51.857686   58.798786   
101  8847300   NFLX  48.847141  47.734428  48.506928  51.547571   58.680500   
102  8588300   NFLX  49.032856  47.840857  48.473643  51.491086   58.554257   
103  7011200   NFLX  48.801430  48.072142  48.483785  51.451486   58.402200   

       E

                                                                                

In [None]:
spark.stop()

NameError: name 'spark' is not defined