# Text Scraping
***

## 1. Imports and inputs
### 1.1 Import

In [1]:
import datetime
import api as api
import numpy as np
import requests
import tweepy
import pandas as pd
import psycopg2
from bs4 import BeautifulSoup
from stocknews import StockNews
import requests
from bs4 import BeautifulSoup
import uuid
print('imported')

imported


### 1.2 Basic functions
 - Read names of companies

In [2]:
def get_tickers():
    tickers = []
    try:
        connection = get_connection()
        cursor = connection.cursor()
        select_query = """select ticker from companies"""
        cursor.execute(select_query,)
        records = cursor.fetchall()
        for row in records:
            tickers.append(row[0])
        close_connection(connection)
    except (Exception, psycopg2.Error) as error:
        print("Error while getting data", error)
    return tickers

def get_connection():
    connection = psycopg2.connect(user = "postgres",
                                  password = "postgres",
                                  host = "localhost",
                                  port = "5432",
                                  database = "postgres")
    return connection

def close_connection(connection):
    if connection:
        connection.close()
        print("Postgres connection is closed")

***
## 2. Scrape tweets

###  2.1 Load Keys

In [3]:
oauth_keys = [
    ["dfc6Wf3dTENuTlXDV1f9hml6B", "50vjNuiW6V5rZSRWvCyZy9RSY4h134Y9CjHHUtaxKbzqCdnn6D", "1359361622-LwyudniQqFogVrRXwoerQCSugRmU95nYKpWPrfS", "4nSO0bMINyisgrRfZgOD4SbAUOVcgC7BRq1N9j2AdTq5U"],
    ["lZkQXF2uZgljsr84A9ZnToFrS", "1fSeS1NRnSJW5rLV3snOg2NJOAVSzymKBumEbS40Lo4cge1Hwu", "1299631912659623936-ti2P7XgEfZkBdoOwlveFILCTOgShKG", "COffmtis6f3hbTyxbKOrNDfOUAygjoubpex4ytaXXsJzo"],
    ["lUwGGzNTUOiO16pzmxCLFrgCj", "FLH2utmpJ9y0HLQDcdlKKeGl4ZheY6s7osa3QzUGSYJGL1TkWt", "1299630171419533313-Knb2DgpfxEhENpqKkGvaTGXDwOFMhQ", "q0UvNWfxccp9AuCjGuxpBbBKbvSsywZ6Cbsk3hVDzhE1X"]
    ]

auths = []
for consumer_key, consumer_secret, access_key, access_secret in oauth_keys:
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    auths.append(auth)

### 2.2 Write to DB

In [4]:
def write_tweets(df):
    try:
        connection = get_connection()
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO tweets (ticker , text, createdat , tweetid , coordinates, userid , userfollowers) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
        for i in range(len(df.Ticker)):
            record_to_insert = (df['Ticker'][i], df['tweet.text'][i], df['tweet.created_at'][i], str(df['tweet.id_str'][i]), str(df['tweet.coordinates'][i]), str(df['tweet.user.id'][i]), str(df['tweet.user.followers_count'][i]))
            cursor.execute(postgres_insert_query,record_to_insert)
            connection.commit()
        close_connection(connection)
    except (Exception, psycopg2.Error) as error:
        print("Error while getting data ", error)

### 2.3 Scrape tweets

In [5]:
def scrape_tweets_main():
#     for ticker in get_tickers()[::-1]:
    for ticker in get_tickers():
        auth = auths[0]
        api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        max_tweets = 200

        # Creation of query method using parameters
        yesteday = datetime.date.today() + datetime.timedelta(-1)
        last_closing_time = datetime.datetime(yesteday.year, yesteday.month,yesteday.day, 22,0,0)
        text_query = str(ticker) + ' -filter:retweets'
        tweets = tweepy.Cursor(api.search,q=ticker, until = last_closing_time, tweet_mode = 'extended', lang = 'en').items(max_tweets)

        # Pulling information from tweets iterable object
        # Add or remove tweet information in the below list comprehension
        tweets_list = [[tweet.full_text, tweet.created_at, tweet.id_str, tweet.coordinates, tweet.user.id, tweet.user.followers_count] for tweet in tweets]

         #Check if tweet_list is not empty otherwise scrape next ticker
        if(len(tweets_list) != 0):
            # Creation of dataframe from tweets_list
            tweets_df = pd.DataFrame(tweets_list)
            tweets_df.columns = ['tweet.text', 'tweet.created_at', 'tweet.id_str','tweet.coordinates', 'tweet.user.id', 'tweet.user.followers_count']
            tweets_df['Ticker'] = ticker
            write_tweets(tweets_df)

## 3. Scrape Yahoo finance stock news
 - Use package stocknews,
 which extracts news according given symbols and calculates the average sentiment of the summary and title

### 3.1 Scrape Yahoo finance with package

In [6]:
def scrape_news_main():
    stocks = get_tickers()
    sn = StockNews(stocks, wt_key='MY_WORLD_TRADING_DATA_KEY')
    tuple = sn.summarize()
    df =pd.DataFrame(tuple)[0][0]
    write_news(df= df)



def write_news(df):
    try:
        connection = get_connection()
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO news (id, stock , news_dt, check_day , sentiment_summary_avg , sentiment_title_avg, sentiment_summary_med , sentiment_title_med) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"""
        for i in range(len(df.id)):
            record_to_insert = (df['id'][i], df['stock'][i], df['news_dt'][i], df['check_day'][i], df['sentiment_summary_avg'][i], df['sentiment_title_avg'][i], df['sentiment_summary_med'][i], df['sentiment_title_med'][i])
            cursor.execute(postgres_insert_query,record_to_insert)
            connection.commit()
        close_connection(connection)
    except (Exception, psycopg2.Error) as error:
        print("Error while getting data ", error)


### 3.2 Scrape YF with BS4
#### 3.2.1 Page 1

In [7]:
def write_yf_news(df):
    try:
        connection = get_connection()
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO yf_news (id, title , text, date ) VALUES (%s,%s,%s,%s)"""
        for i, val in enumerate(df.check_day):
            id = uuid.uuid1()
            record_to_insert = ( id.int , str(df["title"][i]), str(df["text"][i]), datetime.datetime.today())
            cursor.execute(postgres_insert_query,record_to_insert)
            connection.commit()
        close_connection(connection)
    except (Exception, psycopg2.Error) as error:
        print("Error while getting data ", error)



def scrape_yf():

    url = 'https://finance.yahoo.com/news/'
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html)

    table = soup.find('div', {"id": "Fin-Stream"})
    rows = table.find_all('li')
    data = []
    for row in rows:
        title = row.find_all('a')
        title =  [ele.text.strip() for ele in title]
        cols = row.find_all('p')
        cols = [ele.text.strip() for ele in cols]
        data.append([title, [ele for ele in cols if ele]])

    df = pd.DataFrame(data)
    df['id'] = id = uuid.uuid1()
    df['check_day'] = datetime.datetime.today().day
    df.rename(columns={0: 'title', 1: 'text'}, inplace = True)
    write_yf_news(df)

#### 3.1.2 Page 2

In [8]:
def scrape_yf2():
    url = 'https://news.yahoo.com/'
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html)

    table = soup.find('ul', {"class": "My(0) Ov(h) P(0) Wow(bw)"})
    rows = table.find_all('li')
    data = []
    for row in rows:
        title = row.find_all('a')
        title =  [ele.text.strip() for ele in title]
        cols = row.find_all('p')
        cols = [ele.text.strip() for ele in cols]
        data.append([title, [ele for ele in cols if ele]])

    df = pd.DataFrame(data)
    df['id'] = id = uuid.uuid1()
    df['check_day'] = datetime.datetime.today().day
    df.rename(columns={0: 'title', 1: 'text'}, inplace = True)
    write_yf_news(df)



# RUN DAILY

In [9]:
print('Scrape tweets')
scrape_tweets_main()

print('\n Scrape Yahoo Finance stock news')
scrape_news_main()

scrape_yf()
scrape_yf2()

print('\n Done for today ({})'.format(datetime.datetime.today()))


Scrape tweets
Postgres connection is closed
Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340809010971328513, AAPL) already exists.

Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340809010971328513, MSFT) already exists.

Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340809010971328513, AMZN) already exists.

Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340102095584677888, BABA) already exists.

Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340784127084273664, FB) already exists.

Error while getting data  duplicate key value violates unique constraint "tweets_pk"
DETAIL:  Key (tweetid, ticker)=(1340809180190486535, V) already exis