# Сбор данных

In [5]:
# Здесь парсим историю сделок Нэнси Пелоси с сайта:
# https://www.quiverquant.com/home/

import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Column order of the scraped table; one value per column is collected from every trade page.
COLUMNS = ['Transaction', 'Amount', 'Stock', 'Type', 'Traded', 'Disclosed', 'Description']

# Trade pages are addressed by a running number appended to the URL; pages 1..181 are fetched.
FIRST_PAGE, LAST_PAGE = 1, 181

# Seconds to wait for the server. requests applies no timeout by default, so without
# this a single stalled connection would block the whole notebook indefinitely.
REQUEST_TIMEOUT = 30


def _text_after(anchor, tag, **attrs):
    """Return the stripped text of the first ``tag`` element that follows ``anchor``.

    Returns ``np.nan`` when ``anchor`` is ``None`` or when no matching element
    follows it, so a field missing from a page becomes a missing value rather
    than an error.
    """
    if anchor is None:
        return np.nan
    node = anchor.find_next(tag, **attrs)
    if node is None:
        return np.nan
    return node.text.strip()


def _labelled(soup, label, tag='span', **attrs):
    """Return the value that follows the ``<p>`` element whose text is exactly ``label``."""
    return _text_after(soup.find('p', string=label), tag, **attrs)


# Collect plain Python rows and build the DataFrame once at the end instead of
# appending to it row by row inside the loop.
rows = []
with requests.Session() as session:  # one session reuses the connection across pages
    for p in range(FIRST_PAGE, LAST_PAGE + 1):
        url = f'https://www.quiverquant.com/congresstrading/trade/House-P000197-{p}'
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')

        rows.append([
            _labelled(soup, 'Transaction'),
            _labelled(soup, 'Amount'),
            _labelled(soup, 'Stock', tag='a'),
            # 'Type' has no <p> label: it is read from the <span> that follows
            # the first element with class "comp-name".
            _text_after(soup.find('span', 'comp-name'), 'span'),
            _labelled(soup, 'Traded'),
            _labelled(soup, 'Disclosed'),
            _labelled(soup, 'Description', class_='comp-name detail-desc'),
        ])

# dtype=object keeps every column usable with the .str accessor even when a
# column happens to contain only missing values.
data_tmp = pd.DataFrame(rows, columns=COLUMNS, dtype=object)

# Thousands separators are removed first so that "1,000 SHARES" is read as 1000.
description_plain = data_tmp['Description'].str.replace(",", "")

# Number of shares mentioned in the description (kept as text, NaN when absent).
data_tmp['Shares'] = description_plain.str.findall(r'(\d+) SHARES').str[0]

# Option strike price; the optional fractional part is captured so that a
# strike such as "$37.50" is not truncated to 37.
data_tmp['Strike'] = pd.to_numeric(
    description_plain.str.extract(r'STRIKE PRICE OF \$(\d+(?:\.\d+)?)', expand=False)
)
data_tmp.head(3)

In [6]:
def _parse_trade_date(raw):
    """Turn a date string such as "Sep. 22, 2014" into a ``datetime``.

    Missing values are passed through as ``None``.
    """
    if pd.isna(raw):
        return None
    return datetime.strptime(raw, '%b. %d, %Y')


# Both date columns share the same text format, so they go through the same parser.
for source_col, parsed_col in (('Traded', 'traded_dt'), ('Disclosed', 'disclosed_dt')):
    data_tmp[parsed_col] = data_tmp[source_col].apply(_parse_trade_date)

In [7]:
# Преобразуем объем сделки в удобный формат

def get_range(str, mode='min'):
    """Extract one bound of a trade-amount range as a float.

    Parameters
    ----------
    str : str or missing value
        Amount text such as ``"$50,001 - $100,000"``. The parameter name
        shadows the builtin; it is kept so existing callers keep working.
    mode : str, default 'min'
        ``'min'`` selects the lower bound; any other value selects the
        upper bound.

    Returns
    -------
    float or None
        The requested bound, or ``None`` when the input is missing, contains
        no number, or has no upper bound (e.g. ``"$1,000,001 +"``) while the
        upper bound was requested.
    """
    if pd.isna(str):
        return None

    # Pull out the numbers themselves instead of relying on token positions and
    # on the first character being "$"; this tolerates open-ended amounts,
    # missing currency signs and irregular spacing.
    bounds = re.findall(r'\d[\d,]*(?:\.\d+)?', str)

    position = 0 if mode == 'min' else 1
    if len(bounds) <= position:
        return None
    return float(bounds[position].replace(',', ''))


# Split the textual amount range into two numeric columns (lower and upper bound).
amount_text = data_tmp['Amount']
data_tmp['MinAmount'] = amount_text.apply(get_range, mode='min')
data_tmp['MaxAmount'] = amount_text.apply(get_range, mode='max')

# The raw text column is no longer needed once both bounds are available.
data = data_tmp.drop(columns=['Amount'])
data.head(3)

Unnamed: 0,Transaction,Stock,Type,Traded,Disclosed,Description,Shares,Strike,traded_dt,disclosed_dt,MinAmount,MaxAmount
0,Sale,MGRC,,"Sep. 22, 2014","Nov. 05, 2014",CONTRIBUTION OF SHARES HELD PERSONALLY TO THE ...,,,2014-09-22,2014-11-05,50001.0,100000.0
1,Sale,RHI,,"Sep. 22, 2014","Nov. 05, 2014",CONTRIBUTION OF SHARES HELD PERSONALLY TO THE ...,,,2014-09-22,2014-11-05,100001.0,250000.0
2,Sale,AA,,"Oct. 23, 2014","Nov. 05, 2014",,,,2014-10-23,2014-11-05,15001.0,50000.0


In [8]:
# Save the prepared table. to_csv does not create missing directories, so the
# target folder is created first to keep a fresh checkout runnable end to end.
output_path = 'data/trading_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data.to_csv(output_path)