In [None]:
import json
import requests
from bs4 import BeautifulSoup

class Wrapper:
    """
        Wrapper to return a ticker, based on ISIN input.
        Data from Morningstar.com
    """

    # If True prints some basic feedback from the Wrapper
    debug = False

    def __init__(self, ISIN=''):
        self.ISIN = ISIN.upper()

        if self.ISIN:
            # Opens search URL and returns a Requests-object
            r = self.startConnection()

            # Creates soup and finds the correct result data based on a request-object
            self.makeSoup(r)

    def setISIN(self, ISIN):
        """ Sets the value of ISIN, it's required to set this, either in the __init__ or with this function """
        self.ISIN = ISIN.upper()

    def startConnection(self):
        """ Stars the connection to the search URL and returns a request-object if succeeded """
        PREFIX = 'http://www.morningstar.com/search.html?q='
        search_url = f'{PREFIX}+{self.ISIN}'
        if self.debug:
            print(search_url)
        r = requests.get(search_url)
        if r.status_code == 200:
            return r
        else:
            return None

    def makeSoup(self, request):
        """ Creates a BS object and filters the data we need """
        soup = BeautifulSoup(request.content, 'html.parser')

        for div in soup.find_all('div'):

            # Limit our search to the search results
            if div.get('class') == ['search-list-content']:

                # Check if we  found correct result, containing our ISIN
                if div.get('data-key').strip() == self.ISIN:

                    self.data = div.get('data-initialdata')
                    break

    def getTicker(self):
        """ Returns a ticker based on the filtered data, which is collected in makeSoup() """
        if self.debug:
            print('DATA: ' + self.data)

        myJSON = json.loads(self.data)

        if self.debug:
            print('JSON: ' + str(myJSON))
            print('Result part: ' + str(myJSON['result']))

        new_dict = myJSON['m']

        if self.debug:
            print('NEW DICT: ', end='')
            print(list(new_dict))

        # The ticker is part of a large string, followed by the OS001-tag.
        pos_tag = str(new_dict).find('OS001')
        pos_comma = str(new_dict)[pos_tag:].find(',')

        ticker = str(new_dict)[pos_tag + 9:pos_tag + pos_comma - 1]
        return ticker

## Обработка изначальных данных

*Источник: https://www.ssga.com/dk/en_gb/institutional/etfs/funds/spdr-sp-500-esg-leaders-ucits-etf-acc-sppy-gy*

In [1]:
import pandas as pd
import numpy as np

In [2]:
companies_info = pd.read_excel(r'Data/holdings-daily-emea-en-sppy-gy.xlsx')
companies_info.drop('SEDOL', axis=1, inplace=True)

In [3]:
np.random.seed(47)
companies_info.sample(10)

Unnamed: 0,ISIN,Security Name,Percent of Fund,Trade Country Name,Sector Classification,Industry Classification
5,US02079K3059,Alphabet Inc. Class A,2.803867,United States,Communication Services,Interactive Media & Services
182,US0844231029,W. R. Berkley Corporation,0.076069,United States,Financials,Insurance
35,US78409V1044,S&P Global Inc.,0.69665,United States,Financials,Capital Markets
102,US40434L1052,HP Inc.,0.193227,United States,Information Technology,Technology Hardware Storage & Peripherals
91,US3703341046,General Mills Inc.,0.210131,United States,Consumer Staples,Food Products
0,US0378331005,Apple Inc.,9.229723,United States,Information Technology,Technology Hardware Storage & Peripherals
157,US30212P3038,Expedia Group Inc.,0.10468,United States,Consumer Discretionary,Hotels Restaurants & Leisure
62,US29444U7000,Equinix Inc.,0.325185,United States,Real Estate,Equity Real Estate Investment Trusts (Reits)
10,US92826C8394,Visa Inc. Class A,1.875331,United States,Information Technology,It Services
63,US94106L1098,Waste Management Inc.,0.323418,United States,Industrials,Commercial Services & Supplies


In [4]:
import json
import requests
from bs4 import BeautifulSoup as bs
import time 
from fake_useragent import UserAgent

In [5]:
def get_ticker(ISIN: str) -> str:
    headers = {'User-Agent': UserAgent().chrome}
    link = 'http://www.morningstar.com/search?query={}'.format(ISIN)  
    response = requests.get(link, headers=headers)
    tree = bs(response.content, 'html.parser')
    ticker = tree.find('div', {'class': 'mdc-security-module search-all__hit'}).find('span', {'class': 'mdc-security-module__ticker'}).text
    time.sleep(3)
    return ticker

In [6]:
companies_info['ticker'] = companies_info['ISIN'].apply(get_ticker)

KeyboardInterrupt: 

In [45]:
companies_info.head(10)

Unnamed: 0,ISIN,Security Name,Percent of Fund,Trade Country Name,Sector Classification,Industry Classification,ticker
0,US0378331005,Apple Inc.,9.229723,United States,Information Technology,Technology Hardware Storage & Peripherals,AAPL
1,US5949181045,Microsoft Corporation,2.592074,United States,Information Technology,Software,MSFT
2,US88160R1014,Tesla,4.0,United States,Consumer Discretionary,Automobiles,TSLA
3,US0231351067,Amazon.com Inc.,2.013189,United States,Consumer Discretionary,Internet & Direct Marketing Retail,AMZN
4,US67066G1040,NVIDIA Corporation,2.932872,United States,Information Technology,Semiconductors & Semiconductor Equipment,NVDA
5,US02079K3059,Alphabet Inc. Class A,2.803867,United States,Communication Services,Interactive Media & Services,GOOGL
6,US02079K1079,Alphabet Inc. Class C,2.597737,United States,Communication Services,Interactive Media & Services,GOOG
7,US91324P1021,UnitedHealth Group Incorporated,2.38537,United States,Health Care,Health Care Providers & Services,UNH
8,US46625H1005,JPMorgan Chase & Co.,2.11993,United States,Financials,Banks,JPM
9,US30231G1022,Exxon Mobil Corporation,2.084553,United States,Energy,Oil Gas & Consumable Fuels,XOM


In [43]:
companies_info['ticker'].isna().sum()

0

In [48]:
companies_info.to_csv('Data/companies_info.csv')

## Загрузка данных

In [1]:
import numpy as np
import pandas as pd

In [2]:
companies_info = pd.read_csv(r'Data/companies_info.csv', index_col=0)

In [3]:
companies_info

Unnamed: 0,ISIN,Security Name,Percent of Fund,Trade Country Name,Sector Classification,Industry Classification,ticker
0,US0378331005,Apple Inc.,9.229723,United States,Information Technology,Technology Hardware Storage & Peripherals,AAPL
1,US5949181045,Microsoft Corporation,2.592074,United States,Information Technology,Software,MSFT
2,US88160R1014,Tesla,4.000000,United States,Consumer Discretionary,Automobiles,TSLA
3,US0231351067,Amazon.com Inc.,2.013189,United States,Consumer Discretionary,Internet & Direct Marketing Retail,AMZN
4,US67066G1040,NVIDIA Corporation,2.932872,United States,Information Technology,Semiconductors & Semiconductor Equipment,NVDA
...,...,...,...,...,...,...,...
208,BMG491BT1088,Invesco Ltd.,0.035094,United States,Financials,Capital Markets,IVZ
209,US9290421091,Vornado Realty Trust,0.029913,United States,Real Estate,Equity Real Estate Investment Trusts (Reits),VNO
210,US23918K1088,DaVita Inc.,0.027492,United States,Health Care,Health Care Providers & Services,DVA
211,US6936561009,PVH Corp.,0.022968,United States,Consumer Discretionary,Textiles Apparel & Luxury Goods,PVH


In [5]:
companies_info[companies_info['ticker'] == 'BLK']

Unnamed: 0,ISIN,Security Name,Percent of Fund,Trade Country Name,Sector Classification,Industry Classification,ticker
44,US09247X1019,BlackRock Inc.,0.534604,United States,Financials,Capital Markets,BLK


## Cбор данных

In [18]:
import os

In [19]:
CLIENT_ID = os.environ.get("CLIENT_ID")
SECRET_TOKEN = os.environ.get("SECRET_TOKEN")
PASSWORD = os.environ.get("PASSWORD")
username = 'RinFive'

In [20]:
import requests
from datetime import datetime

In [21]:
def get_access(username, CLIENT_ID, SECRET_TOKEN, PASSWORD):
    auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_TOKEN)
    data = {'grant_type': 'password',
        'username': username,
        'password': PASSWORD}
    headers = {'User-Agent': 'MyParser/0.1'}
    res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)
    TOKEN = res.json()['access_token']
    headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}
    return headers

In [22]:
headers = get_access(username, CLIENT_ID, SECRET_TOKEN, PASSWORD)

In [23]:
requests.get('https://oauth.reddit.com/api/v1/me',
             headers=headers)

<Response [200]>

In [24]:

# УЖЕ И НЕ НУЖНО ПОЛУЧАЕТСЯ
subreddits = ['stocks',
              'StockMarket',
              'Wallstreetbetsnews',
              'Investing']
             # 'sustainableFinance'] #без поисковых слов 
# ===================================================

searchwords = ['eco',
               'ESG',
               'Ethical',
               'Green',
               'ecology',
               'social',
               'governance',
               'sustainable',
               'Socially responsible investing',
               'responsible',
               # ==========
               'greenwashing',
               'unethical']

E = ['polution',
     'emission',
     'carbon']

S = ['sexual',
     'unemployment',
     'rights',
     'unemployment',
     'child labour',]

G = ['corruption',
     'bribery']



In [25]:
params = {}

In [26]:
def scrape_info(response, name, info_df): 
    df = pd.DataFrame()
    for post in response.json()['data']['children']:
        df = df.append({
            # ======== about company ========
            'company' : list(info_df[info_df['ticker'] == name]['Security Name'])[0],
            'share_in_index': float(info_df[info_df['ticker'] == name]['Percent of Fund']),
            'ticker': name,
            # ======== text info ========
            'subreddit': post['data']['subreddit'],
            'title': post['data']['title'],
            'selftext': post['data']['selftext'],
            'upvote_ratio': post['data']['upvote_ratio'],
            'num_comments': post['data']['num_comments'],
            'ups': post['data']['ups'],
            'downs': post['data']['downs'],
            'score': post['data']['score'],
            'created_utc': datetime.fromtimestamp(post['data']['created_utc']).strftime('%Y-%m-%d'),
            'name': post['data']['name']},
        ignore_index=True)
    return df

In [59]:
params['q'] = 'ESG' 
params['limit'] = 2
params['sort'] = 'hot'
params['t'] = 'all'
params['restrict_sr'] = False

In [60]:
subreddits[0]

'stocks'

In [61]:
params

{'q': 'ESG', 'limit': 2, 'sort': 'hot', 't': 'all', 'restrict_sr': False}

In [62]:
response = requests.get('https://oauth.reddit.com/subreddits/search',
                       headers=headers,
                       params=params)
#requests.get(api_url + '/r/{}/top'.format(s), headers=headers, params=payload)

In [63]:
response.json()['data']['children']

[{'kind': 't5',
  'data': {'user_flair_background_color': None,
   'submit_text_html': None,
   'restrict_posting': True,
   'user_is_banned': False,
   'free_form_reports': True,
   'wiki_enabled': None,
   'user_is_muted': False,
   'user_can_flair_in_sr': None,
   'display_name': 'ESG',
   'header_img': None,
   'title': 'Eisen and the Sunshine Gang: RUGC Highlander Team!',
   'original_content_tag_enabled': False,
   'allow_galleries': True,
   'icon_size': None,
   'primary_color': '',
   'active_user_count': None,
   'icon_img': '',
   'display_name_prefixed': 'r/ESG',
   'accounts_active': None,
   'public_traffic': False,
   'subscribers': 208,
   'user_flair_richtext': [],
   'videostream_links_count': 0,
   'name': 't5_2tb8a',
   'quarantine': False,
   'hide_ads': False,
   'prediction_leaderboard_entry_type': 'IN_FEED',
   'emojis_enabled': False,
   'advertiser_category': '',
   'public_description': '',
   'comment_score_hide_mins': 0,
   'allow_predictions': False,
   'u

In [82]:
data = pd.DataFrame()
for company in companies_info['ticker']:
    for searchword in searchwords:
        search = f'{company} {searchword}'
        response = requests.get(f'https://oauth.reddit.com/subreddits/search',
                       headers=headers,
                       params=params)

        data = data.append(scrape_info(response, company, companies_info), ignore_index=True)
        time.sleep(1)
            

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [11]:
response.json()['data']['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'stocks',
  'selftext': "I've been loaded up on this heavy since xmas. It's been frustrating watching it go sideways or down even though they more than doubled their expected earnings last quarter. I'll list my thoughts individually so someone can poke holes in my thinking if they are so inclined.  \n1) Positive: There is a 5 million home deficit right now, which is predicted to take many years to catch up to.  \n2) Positive: Millennials are the largest demographic in the US and they are technically poised to purchase their first or second homes within the next 10 years. Gen Z isn't as large but the older end of the spectrum should be buying their first homes soon.  \n3) Positive: Wildfires and hurricanes seem routine at this point. That should contribute reliable demand in the form to rebuilds and repairs. Maybe I'm wrong about this and the add on is trivial.  \n4) Positive: I know PE isn't everything, but is it nothing?

In [23]:
df = scrape_info(response)

In [15]:
df['text'] = df['title'] + ' ' + df['selftext']
df.drop(['title', 'selftext'], axis=1, inplace=True)

In [16]:
df

Unnamed: 0,subreddit,upvote_ratio,num_comments,ups,downs,score,created_utc,name,text
0,stocks,1.00,1.0,2.0,0.0,2.0,2022-05-31,t3_v1xrfr,NOBL - what are your thoughts? I’m adding some...
1,stocks,1.00,1.0,1.0,0.0,1.0,2022-05-31,t3_v1xr3k,Dell Valuation # Valuation\n\nFree Cash Flow T...
2,stocks,0.67,2.0,1.0,0.0,1.0,2022-05-31,t3_v1x1ec,OPEC Weighs Suspending Russia From Oil-Product...
3,stocks,0.94,8.0,16.0,0.0,16.0,2022-05-31,t3_v1wjar,Wall Street Thinks Any Stock Market Rally Will...
4,stocks,1.00,2.0,4.0,0.0,4.0,2022-05-31,t3_v1w86e,Foreign national buying stocks Is there a way ...
...,...,...,...,...,...,...,...,...,...
95,stocks,0.68,25.0,8.0,0.0,8.0,2022-05-28,t3_uzr0kj,Dealing with a stock broker and what do expect...
96,stocks,0.42,31.0,0.0,0.0,0.0,2022-05-28,t3_uzqa6x,What's your thoughts on holding S&amp;P 500? S...
97,stocks,0.67,0.0,2.0,0.0,2.0,2022-05-28,t3_uzq9f5,r/Stocks Weekly Thread on Meme Stocks Saturday...
98,stocks,0.64,46.0,12.0,0.0,12.0,2022-05-28,t3_uzpw2p,can stocks gain or lose value when the market ...


### Обработка текста 

In [17]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from typing import List

#### Сделаем токенизацию 

In [18]:
tk = TweetTokenizer()

In [19]:
df['text'] = df['text'].apply(tk.tokenize)

In [20]:
df.head()

Unnamed: 0,subreddit,upvote_ratio,num_comments,ups,downs,score,created_utc,name,text
0,stocks,1.0,1.0,2.0,0.0,2.0,2022-05-31,t3_v1xrfr,"[NOBL, -, what, are, your, thoughts, ?, I, ’, ..."
1,stocks,1.0,1.0,1.0,0.0,1.0,2022-05-31,t3_v1xr3k,"[Dell, Valuation, #, Valuation, Free, Cash, Fl..."
2,stocks,0.67,2.0,1.0,0.0,1.0,2022-05-31,t3_v1x1ec,"[OPEC, Weighs, Suspending, Russia, From, Oil-P..."
3,stocks,0.94,8.0,16.0,0.0,16.0,2022-05-31,t3_v1wjar,"[Wall, Street, Thinks, Any, Stock, Market, Ral..."
4,stocks,1.0,2.0,4.0,0.0,4.0,2022-05-31,t3_v1w86e,"[Foreign, national, buying, stocks, Is, there,..."


#### Уберем стоп-слова

In [21]:
stop_words = nltk.corpus.stopwords.words('english')

*Пример стоп слов:*

In [22]:
for word in stop_words[:10]:
    print(f'---{word}')

---i
---me
---my
---myself
---we
---our
---ours
---ourselves
---you
---you're


#### Стэмминг текста

In [45]:
snowball = SnowballStemmer(language='english')

In [46]:
df['text'] = df['text'].apply(lambda tokens: [snowball.stem(token) for token in tokens] )

In [47]:
df.head()

Unnamed: 0,subreddit,upvote_ratio,num_comments,ups,downs,score,created_utc,name,text
0,stocks,0.78,7.0,5.0,0.0,5.0,2022-05-30,t3_v0zuvq,"[7, tech, stock, that, are, most, worthi, of, ..."
1,stocks,0.29,14.0,0.0,0.0,0.0,2022-05-30,t3_v0ye7w,"[for, the, valu, investor, :, i, present, to, ..."
2,stocks,0.67,19.0,3.0,0.0,3.0,2022-05-30,t3_v0xi4p,"[beaten, down, stock, i, have, been, an, index..."
3,stocks,0.56,19.0,2.0,0.0,2.0,2022-05-30,t3_v0x2v9,"[let, discuss, inflat, proof, stock, which, st..."
4,stocks,0.92,27.0,10.0,0.0,10.0,2022-05-30,t3_v0wec2,"[r, /, stock, daili, discuss, monday, -, may, ..."


In [None]:
def my_preprocessor (text: str) -> str:
    return text.lower()

In [37]:
def tokenize_data(text: str) -> List[str]:
    
    tk = TweetTokenizer()
    #preprocessing
    text = text.lower()
    
    stop_words = nltk.corpus.stopwords.words('english')
    pass
    

In [20]:
snowball.stem('bribery')

'briberi'