In [67]:
import requests
import time 

import pandas as pd
from datetime import datetime

import os
import pathlib
import sys
import logging
import io

logging.basicConfig(filename='logs.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


import boto3

import  matplotlib.pyplot as plt

# Инициализируем подключение к ОХ
s3 = boto3.client('s3')
pd.set_option('display.max_rows', 50)

2025-03-10 16:05:55,377 - INFO - Found endpoint for s3 via: config_global.


# Просмотр существующих тикеров 

In [2]:
idx = 0
stock_data = pd.DataFrame()

while True:
    print('Processing chunk...', idx)
    response = requests.get(f'https://iss.moex.com/iss/securities.json?engine=stock&market=shares&start={idx}').json()['securities']
    
    if len(response['data']) == 0:
        break 

    stock_data_new = pd.DataFrame(
        columns=response['columns'],
        data=response['data']
    )

    stock_data = pd.concat([stock_data, stock_data_new])
    idx += 100
    time.sleep(2)

Processing chunk... 0
Processing chunk... 100
Processing chunk... 200
Processing chunk... 300
Processing chunk... 400
Processing chunk... 500
Processing chunk... 600
Processing chunk... 700
Processing chunk... 800
Processing chunk... 900
Processing chunk... 1000
Processing chunk... 1100
Processing chunk... 1200
Processing chunk... 1300
Processing chunk... 1400
Processing chunk... 1500
Processing chunk... 1600
Processing chunk... 1700
Processing chunk... 1800
Processing chunk... 1900
Processing chunk... 2000
Processing chunk... 2100
Processing chunk... 2200
Processing chunk... 2300
Processing chunk... 2400
Processing chunk... 2500


In [None]:
stock_data.to_csv(os.path.join('..', 'data', 'stock_meta_data.csv'))

In [29]:
with open(os.path.join('..', 'data', 'stock_meta_data.csv'), 'rb') as f:
    data = f.read()

s3.put_object(Bucket='portfoliodata', Key='stock_meta_data.csv', Body=data)

{'ResponseMetadata': {'RequestId': '0de45135fba3a6a8',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Sat, 08 Mar 2025 18:28:13 GMT',
   'content-type': 'application/octet-stream',
   'transfer-encoding': 'chunked',
   'connection': 'keep-alive',
   'keep-alive': 'timeout=60',
   'etag': '"fa9b7b7d677c8fb3c263516e8088dda2"',
   'x-amz-request-id': '0de45135fba3a6a8'},
  'RetryAttempts': 0},
 'ETag': '"fa9b7b7d677c8fb3c263516e8088dda2"'}

# Получаем данные котировок

In [3]:
stock_data = pd.read_csv(os.path.join('..', 'data', 'stock_meta_data.csv'))

In [60]:
engine = 'stock'
market = 'shares'
start_date = '2008-01-01'
end_date = '2024-12-31'
interval = '24'

In [61]:
api_url = (
    'https://iss.moex.com/iss/engines/{engine}/'
    'markets/{market}/securities/{ticker}/candles.json'
    '?from={start_date}&till={end_date}&interval={interval}&start={start}'
)

def collect_moex_candles(stock_data, s3):
    """
    Собирает данные о дневных котировках (цены закрытия и пр.) за период 2008-2024
    и сохраняет результат в S3 в формате CSV.

    :param stock_data: DataFrame (или иной объект) со списком тикеров в поле 'secid'
    :param s3: клиент S3 (boto3.client), настроенный под ваше хранилище
    """
    for ticker in stock_data['secid']:
        logger.info('Collecting data for ticker: %s', ticker)
        
        all_rows = []
        start = 0 
        while True:
            url = api_url.format(
                engine=engine,
                market=market,
                ticker=ticker,
                start_date=start_date,
                end_date=end_date,
                interval=interval,
                start=start
            )
            logger.info('Requesting URL: %s', url)
            try:
                resp = requests.get(url, timeout=100000)
                resp.raise_for_status()  
                j = resp.json()
            except requests.exceptions.RequestException as e:
                logger.error("Request failed for %s: %s", ticker, e)
                break
            
            candles_data = j.get('candles', {}).get('data', [])
            columns = j.get('candles', {}).get('columns', [])
            
            if not candles_data:
                logger.info('No more data for %s. Processed until start=%s', ticker, start)
                break
            
            for row in candles_data:
                row_dict = {col: row[i] for i, col in enumerate(columns)}
                all_rows.append(row_dict)
            
            logger.info("Received %d rows for %s at start=%d", len(candles_data), ticker, start)
            start += len(candles_data)
            
            time.sleep(0.3)
        
        frame = pd.DataFrame(all_rows)
        logger.info("Total extracted for %s: %d rows", ticker, len(frame))
        
        csv_data = frame.to_csv(index=False).encode('utf-8')
        try:
            s3.put_object(
                Bucket='portfoliodata',
                Key=f'{ticker}.csv',
                Body=csv_data
            )
            logger.info("Successful upload to S3: %s.csv", ticker)
        except Exception as e:
            logger.error("S3 upload failed for %s: %s", ticker, e)
        
        time.sleep(1)

In [63]:
collect_moex_candles(stock_data, s3)

2025-03-09 22:22:40,358 - INFO - Collecting data for ticker: ABIO
2025-03-09 22:22:40,359 - INFO - Requesting URL: https://iss.moex.com/iss/engines/stock/markets/shares/securities/ABIO/candles.json?from=2008-01-01&till=2024-12-31&interval=24&start=0
2025-03-09 22:22:40,466 - INFO - Received 500 rows for ABIO at start=0
2025-03-09 22:22:40,772 - INFO - Requesting URL: https://iss.moex.com/iss/engines/stock/markets/shares/securities/ABIO/candles.json?from=2008-01-01&till=2024-12-31&interval=24&start=500
2025-03-09 22:22:40,861 - INFO - Received 500 rows for ABIO at start=500
2025-03-09 22:22:41,166 - INFO - Requesting URL: https://iss.moex.com/iss/engines/stock/markets/shares/securities/ABIO/candles.json?from=2008-01-01&till=2024-12-31&interval=24&start=1000
2025-03-09 22:22:41,495 - INFO - Received 500 rows for ABIO at start=1000
2025-03-09 22:22:41,798 - INFO - Requesting URL: https://iss.moex.com/iss/engines/stock/markets/shares/securities/ABIO/candles.json?from=2008-01-01&till=2024-1

# Объединение данных в единую таблицу

In [None]:
body = s3.get_object(Bucket='portfoliodata', Key='stock_meta_data.csv')['Body'].read()


In [53]:
stock_data = pd.read_csv(io.BytesIO(body))
stock_data.head()


Unnamed: 0.1,Unnamed: 0,id,secid,shortname,regnumber,name,isin,is_traded,emitent_id,emitent_title,emitent_inn,emitent_okpo,gosreg,type,group,primary_boardid,marketprice_boardid
0,0,424433251,ABIO,iАРТГЕН ао,1-01-08902-A,"ПАО ""Артген""",RU000A0JNAB6,1,1142.0,"Публичное акционерное общество ""Артген биотех""",7702509000.0,71328785.0,1-01-08902-A,common_share,stock_shares,TQBR,TQBR
1,1,12441,ABRD,АбрауДюрсо,1-02-12500-A,Абрау-Дюрсо ПАО ао,RU000A0JS5T7,1,4787.0,"Публичное акционерное общество ""Абрау – Дюрсо""",7727621000.0,81521198.0,1-02-12500-A,common_share,stock_shares,TQBR,TQBR
2,2,2699,AFKS,Система ао,1-05-01669-A,"АФК ""Система"" ПАО ао",RU000A0DQZE3,1,777.0,"Публичное акционерное общество ""Акционерная фи...",7703105000.0,27987276.0,1-05-01669-A,common_share,stock_shares,TQBR,TQBR
3,3,2700,AFLT,Аэрофлот,1-01-00010-A,Аэрофлот-росс.авиалин(ПАО)ао,RU0009062285,1,1242.0,"Публичное акционерное общество ""Аэрофлот – рос...",7712040000.0,29063984.0,1-01-00010-A,common_share,stock_shares,TQBR,TQBR
4,4,76079,AGRO,AGRO-гдр,,ГДР ROS AGRO PLC ORD SHS,US7496552057,1,7502.0,ROS AGRO PLC,10034870.0,,,depositary_receipt,stock_dr,TQBR,TQBR


In [70]:
BUCKET_NAME = 'portfoliodata'
PREFIX = ''  # Если файлы не в папке, оставьте пустым

EXPECTED_START_STR = '2008-01-09'
EXPECTED_END_STR = '2024-12-30'

# Преобразуем строки в datetime
EXPECTED_START_DT = datetime.strptime(EXPECTED_START_STR, '%Y-%m-%d')
EXPECTED_END_DT = datetime.strptime(EXPECTED_END_STR, '%Y-%m-%d')

MAX_DEVIATION_DAYS = 14

def check_and_clean_data(bucket: str, prefix: str):
    """
    Проходит по всем CSV-файлам в S3, проверяет диапазон дат.
    Допускается расхождение от границ (start/end) в 14 дней.
    Если дата выходит за эту погрешность, файл удаляется из хранилища.
    """
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    
    
    for tiker in stock_data['secid']:
        key = f'{tiker}.csv'
        logger.info(f'Checking file: {key}')
        
        # 1) Скачиваем файл из S3
        try:
            response = s3.get_object(Bucket=bucket, Key=key)
            body = response['Body'].read()
        except Exception as e:
            logger.error(f'Failed to download {key}: {e}')
            continue
        
        # 2) Парсим CSV
        try:
            df = pd.read_csv(io.BytesIO(body))
        except Exception as e:
            logger.error(f'Failed to parse CSV {key}: {e}')
            s3.delete_object(Bucket=bucket, Key=key)
            continue
        
        # Предположим, что колонка с датой называется 'begin'; подставьте вашу
        if 'begin' not in df.columns:
            s3.delete_object(Bucket=bucket, Key=key)
            logger.warning(f'No "begin" column in {key}, skip checking.')
            continue
        
        df['begin'] = pd.to_datetime(df['begin'], errors='coerce')
        valid_dates = df['begin'].dropna()
        if valid_dates.empty:
            logger.info(f'File {key} has no valid dates, skipping.')
            continue
        
        min_date = valid_dates.min()
        max_date = valid_dates.max()
        
        logger.info(f'{key} -> min_date={min_date.date()}, max_date={max_date.date()}')
        
        # 3) Вычисляем, вписываются ли min/max в допустимый коридор ±14 дней
        #    относительно 2008-01-01 и 2024-12-31
        diff_min = abs((min_date - EXPECTED_START_DT).days)
        diff_max = abs((max_date - EXPECTED_END_DT).days)
        
        # Условие: если ЛЮБОЕ расхождение > 14, тогда файл удаляем
        if diff_min > MAX_DEVIATION_DAYS or diff_max > MAX_DEVIATION_DAYS:
            logger.info(f'Deleting {key} due to date range mismatch (>14 days).')
            try:
                s3.delete_object(Bucket=bucket, Key=key)
            except Exception as e:
                logger.error(f'Failed to delete {key}: {e}')
        else:
            logger.info(f'File {key} is within ±{MAX_DEVIATION_DAYS} days — keep it.')

# Пример вызова
check_and_clean_data(BUCKET_NAME, PREFIX)

2025-03-10 16:25:43,963 - INFO - Checking file: ABIO.csv
2025-03-10 16:25:44,051 - ERROR - Failed to download ABIO.csv: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.
2025-03-10 16:25:44,051 - INFO - Checking file: ABRD.csv
2025-03-10 16:25:44,062 - ERROR - Failed to download ABRD.csv: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.
2025-03-10 16:25:44,063 - INFO - Checking file: AFKS.csv
2025-03-10 16:25:44,194 - INFO - AFKS.csv -> min_date=2008-01-09, max_date=2024-12-30
2025-03-10 16:25:44,195 - INFO - File AFKS.csv is within ±14 days — keep it.
2025-03-10 16:25:44,195 - INFO - Checking file: AFLT.csv
2025-03-10 16:25:44,316 - INFO - AFLT.csv -> min_date=2008-01-09, max_date=2024-12-30
2025-03-10 16:25:44,317 - INFO - File AFLT.csv is within ±14 days — keep it.
2025-03-10 16:25:44,317 - INFO - Checking file: AGRO.csv
2025-03-10 16:25:44,330 - ERROR - Failed to download AGRO.c