In [1]:
import sys
from pathlib import Path

project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

import os
os.chdir(project_root)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.utils.config import load_config
from src.data.loader import load_and_filter_dataset

In [6]:
from datasets import load_dataset
from pathlib import Path
import pandas as pd
import zipfile
import io

print("Сваляне на dataset от Hugging Face...")
dataset_name = "pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs"

try:
    print("Опитвам се да заредя само price данните...")
    dataset = load_dataset(
        dataset_name,
        data_files="sp500_daily_ratios_20yrs.zip",
        download_mode="force_redownload"
    )
    print(f"Dataset зареден успешно!")
    print(f"Splits: {list(dataset.keys())}")
    
    split_name = list(dataset.keys())[0]
    df = dataset[split_name].to_pandas()
    
    print(f"Конвертирано в pandas. Размер: {df.shape}")
    print(f"Колони: {list(df.columns)[:15]}...")
    
    output_path = Path("data/raw/sp500_stocks_data.parquet")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    df.to_parquet(output_path, index=False)
    print(f"\nDataset запазен локално в: {output_path}")
    print(f"Размер на файла: {output_path.stat().st_size / 1024 / 1024:.2f} MB")
    
except Exception as e:
    print(f"Грешка при сваляне с data_files: {e}")
    print("\nОпитвам се алтернативен метод - директно от zip файла...")
    try:
        from huggingface_hub import hf_hub_download
        import zipfile
        
        zip_path = hf_hub_download(
            repo_id=dataset_name,
            filename="sp500_daily_ratios_20yrs.zip",
            repo_type="dataset"
        )
        
        print(f"Zip файл свалени в: {zip_path}")
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            csv_files = [f for f in zip_ref.namelist() if f.endswith('.csv')]
            print(f"Намерени CSV файлове: {csv_files[:5]}...")
            
            if csv_files:
                first_csv = csv_files[0]
                with zip_ref.open(first_csv) as f:
                    df = pd.read_csv(f)
                    print(f"Зареден CSV файл: {first_csv}")
                    print(f"Размер: {df.shape}")
                    print(f"Колони: {list(df.columns)[:15]}...")
                    
                    output_path = Path("data/raw/sp500_stocks_data.parquet")
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    df.to_parquet(output_path, index=False)
                    print(f"\nDataset запазен локално в: {output_path}")
    except Exception as e2:
        print(f"Грешка при алтернативен метод: {e2}")
        print("\nМоля, провери интернет връзката и опитай отново.")


Сваляне на dataset от Hugging Face...
Опитвам се да заредя само price данните...


Repo card metadata block was not found. Setting CardData to empty.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sp500_daily_ratios_20yrs.zip:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset зареден успешно!
Splits: ['train']
Конвертирано в pandas. Размер: (1048575, 23)
Колони: ['Ticker', 'Date', 'Open', 'Close', 'Volume', 'quarter', 'year', 'Asset Turnover', 'Current Ratio', 'Days Sales In Receivables', 'Debt/Equity Ratio', 'EBIT Margin', 'EBITDA Margin', 'Gross Margin', 'Inventory Turnover Ratio']...

Dataset запазен локално в: data\raw\sp500_stocks_data.parquet
Размер на файла: 11.41 MB


In [4]:
config = load_config()
print(f"Dataset: {config.data.dataset_name}")
print(f"Tickers: {config.data.tickers}")
print(f"Start date: {config.data.start_date}")
print(f"End date: {config.data.end_date}")

Dataset: pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs
Tickers: ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']
Start date: 2010-01-01
End date: None


In [5]:
print("Зареждане на данни...")
df = load_and_filter_dataset(config=config)
print(f"Заредени {len(df)} реда")

Зареждане на данни...
Зареждане на dataset от Hugging Face: pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs
Това може да отнеме няколко минути при първо зареждане...


Repo card metadata block was not found. Setting CardData to empty.


Generating train split:   0%|          | 0/290728 [00:00<?, ? examples/s]

Грешка при зареждане: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 23 new columns ({'Return On Tangible Equity', 'Inventory Turnover Ratio', 'ROE - Return On Equity', 'Gross Margin', 'Ticker', 'Receiveable Turnover', 'year', 'Open', 'Asset Turnover', 'Pre-Tax Profit Margin', 'Current Ratio', 'Debt/Equity Ratio', 'quarter', 'Long-term Debt / Capital', 'ROA - Return On Assets', 'Close', 'Volume', 'EBIT Margin', 'Net Profit Margin', 'Date', 'Operating Margin', 'EBITDA Margin', 'Days Sales In Receivables'}) and 9 missing columns ({'time', 'neg', '_id', 'pos', 'ticker', 'neu', 'headline', 'compound', 'date'}).

This happened while the csv dataset builder was generating data using

zip://sp500_daily_ratios_20yrs.csv::C:\Users\vyoto\.cache\huggingface\hub\datasets--pmoe7--SP_500_Stocks_Data-ratios_news_price_10_yrs\snapshots\8ae97891ad22af1be38cad1a8a88997bfc8862bd\sp500_daily_ratios_20yrs.zip, [C:\Users\vyoto\.cach

Repo card metadata block was not found. Setting CardData to empty.


Generating train split:   0%|          | 0/290728 [00:00<?, ? examples/s]

Грешка: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 23 new columns ({'Return On Tangible Equity', 'Inventory Turnover Ratio', 'ROE - Return On Equity', 'Gross Margin', 'Ticker', 'Receiveable Turnover', 'year', 'Open', 'Asset Turnover', 'Pre-Tax Profit Margin', 'Current Ratio', 'Debt/Equity Ratio', 'quarter', 'Long-term Debt / Capital', 'ROA - Return On Assets', 'Close', 'Volume', 'EBIT Margin', 'Net Profit Margin', 'Date', 'Operating Margin', 'EBITDA Margin', 'Days Sales In Receivables'}) and 9 missing columns ({'time', 'neg', '_id', 'pos', 'ticker', 'neu', 'headline', 'compound', 'date'}).

This happened while the csv dataset builder was generating data using

zip://sp500_daily_ratios_20yrs.csv::C:\Users\vyoto\.cache\huggingface\hub\datasets--pmoe7--SP_500_Stocks_Data-ratios_news_price_10_yrs\snapshots\8ae97891ad22af1be38cad1a8a88997bfc8862bd\sp500_daily_ratios_20yrs.zip, [C:\Users\vyoto\.cache\huggingface\

DatasetGenerationCastError: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 23 new columns ({'Return On Tangible Equity', 'Inventory Turnover Ratio', 'ROE - Return On Equity', 'Gross Margin', 'Ticker', 'Receiveable Turnover', 'year', 'Open', 'Asset Turnover', 'Pre-Tax Profit Margin', 'Current Ratio', 'Debt/Equity Ratio', 'quarter', 'Long-term Debt / Capital', 'ROA - Return On Assets', 'Close', 'Volume', 'EBIT Margin', 'Net Profit Margin', 'Date', 'Operating Margin', 'EBITDA Margin', 'Days Sales In Receivables'}) and 9 missing columns ({'time', 'neg', '_id', 'pos', 'ticker', 'neu', 'headline', 'compound', 'date'}).

This happened while the csv dataset builder was generating data using

zip://sp500_daily_ratios_20yrs.csv::C:\Users\vyoto\.cache\huggingface\hub\datasets--pmoe7--SP_500_Stocks_Data-ratios_news_price_10_yrs\snapshots\8ae97891ad22af1be38cad1a8a88997bfc8862bd\sp500_daily_ratios_20yrs.zip, [C:\Users\vyoto\.cache\huggingface\hub\datasets--pmoe7--SP_500_Stocks_Data-ratios_news_price_10_yrs\snapshots\8ae97891ad22af1be38cad1a8a88997bfc8862bd\sp500_daily_ratios_20yrs.zip (origin=hf://datasets/pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs@8ae97891ad22af1be38cad1a8a88997bfc8862bd/sp500_daily_ratios_20yrs.zip), C:\Users\vyoto\.cache\huggingface\hub\datasets--pmoe7--SP_500_Stocks_Data-ratios_news_price_10_yrs\snapshots\8ae97891ad22af1be38cad1a8a88997bfc8862bd\sp500_news_290k_articles.csv (origin=hf://datasets/pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs@8ae97891ad22af1be38cad1a8a88997bfc8862bd/sp500_news_290k_articles.csv)]

Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)

In [None]:
print(f"Размери: {df.shape}")
print(f"\nКолони:")
print(df.columns.tolist())

In [None]:
unique_tickers = df['symbol'].unique() if 'symbol' in df.columns else None
if unique_tickers is not None:
    print(f"Уникални тикери ({len(unique_tickers)}):")
    print(sorted(unique_tickers))
else:
    print("Не е намерена колона 'symbol'")

In [None]:
date_col = 'date' if 'date' in df.columns else None
if date_col:
    df[date_col] = pd.to_datetime(df[date_col])
    print(f"Период:")
    print(f"  От: {df[date_col].min()}")
    print(f"  До: {df[date_col].max()}")
    print(f"  Дни: {(df[date_col].max() - df[date_col].min()).days}")

In [None]:
print("Първи 10 реда:")
df.head(10)

In [None]:
print("Базова статистика:")
df.describe()

In [None]:
missing = df.isnull().sum()
if missing.sum() > 0:
    print("Липсващи стойности:")
    print(missing[missing > 0])
else:
    print("Няма липсващи стойности")

In [3]:
import matplotlib.pyplot as plt
import numpy as np
def plot_adc_conversion():
# Генериране на аналогов сигнал
t = np.linspace(0, 1, 1000)
analog_signal = 2.5 + 1.5 * np.sin(2 * np.pi * 5 * t)
# ADC параметри
resolution = 8
v_ref = 5.0
sampling_rate = 50
# Дискретизация и квантуване
sample_times = np.arange(0, 1, 1/sampling_rate)
samples = 2.5 + 1.5 * np.sin(2 * np.pi * 5 * sample_times)
digital_samples = np.round((samples / v_ref) * (2**resolution - 1))
quantized_voltage = (digital_samples / (2**resolution - 1)) * v_ref
# Визуализация
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
ax1.plot(t, analog_signal, &#39;b-&#39;, label=&#39;Аналогов сигнал&#39;, alpha=0.7)
ax1.stem(sample_times, quantized_voltage, &#39;r-&#39;, markerfmt=&#39;ro&#39;,
label=&#39;Дискретизирани проби&#39;)
ax1.legend()
ax1.grid(True)
quantization_error = samples - quantized_voltage
ax2.stem(sample_times, quantization_error, &#39;g-&#39;, markerfmt=&#39;go&#39;)
ax2.set_title(&#39;Квантова грешка&#39;)
ax2.grid(True)
plt.tight_layout()
plt.show()
plot_adc_conversion()

IndentationError: expected an indented block after function definition on line 3 (3043839700.py, line 5)

In [6]:
import ipywidgets as widgets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np
def calculate_adc_parameters(resolution, v_ref, analog_input):
    steps = 2 ** resolution
    lsb = v_ref / steps
    digital_value = int((analog_input / v_ref) * steps)
    quantization_error = lsb / 2
    print(f&quot;=== Резултати за {resolution}-битов АЦП ===&quot;)
    print(f&quot;Брой стъпки: {steps}&quot;)
    print(f&quot;Разделителна способност (LSB): {lsb:} V&quot;)
    print(f&quot;Цифрова стойност за {analog_input}V: {digital_value}&quot;)
    print(f&quot;Максимална квантова грешка: ±{quantization_error:} V&quot;)
     return steps, lsb, digital_value, quantization_error
# Интерактивни контроли
resolution_slider = widgets.IntSlider(value=10, min=8, max=16,
description=&#39;Битове:&#39;)
vref_slider = widgets.FloatSlider(value=3.3, min=1.8, max=5.0,
step=0.1, description=&#39;Vref:&#39;)
input_slider = widgets.FloatSlider(value=1.65, min=0, max=3.3,
step=0.01, description=&#39;Vвход:&#39;)
def on_parameter_change(change):
     calculate_adc_parameters(
        resolution_slider.value,
        vref_slider.value,
        input_slider.value
    )
resolution_slider.observe(on_parameter_change, names=&#39;value&#39;)
vref_slider.observe(on_parameter_change, names=&#39;value&#39;)
input_slider.observe(on_parameter_change, names=&#39;value&#39;)
display(resolution_slider, vref_slider, input_slider)
calculate_adc_parameters(10, 3.3, 1.65)

SyntaxError: invalid character '±' (U+00B1) (3808661557.py, line 14)