# Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("history.csv", nrows=int(1e5))
df = df[~df.Symbol.isin((df[df["Close"] == 0].Symbol).unique())]

In [3]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Symbol
0,2008-01-29,9.50,9.99,8.57,8.75,0.702589,1489000,AACG
1,2008-01-30,8.75,9.15,8.30,8.50,0.682516,219000,AACG
2,2008-01-31,8.49,10.30,8.49,9.55,0.766826,182300,AACG
3,2008-02-01,9.93,9.94,9.50,9.51,0.763614,28200,AACG
4,2008-02-04,9.50,9.71,9.50,9.50,0.762811,8300,AACG
...,...,...,...,...,...,...,...,...
99995,2021-04-15,40.00,41.00,37.50,37.50,37.500000,50092,ABEO
99996,2021-04-16,37.50,38.00,35.75,37.25,37.250000,63172,ABEO
99997,2021-04-19,36.50,37.25,34.00,35.25,35.250000,47496,ABEO
99998,2021-04-20,34.25,37.00,34.00,36.50,36.500000,38796,ABEO


# Обработка данных
1) Нужно разделить данные ПО ВРЕМЕНИ на train-test
2) Каждый sample - это фичи по акции за год и таргет на следующий год

In [27]:
import pandas as pd
import numpy as np


def preprocess_stock_data_with_n_years(df, n_years=3, test_split_year=2022):
    """
    Preprocess stock data to create samples with N years of historical data

    Parameters:
    -----------
    csv_file_path : str
        Path to the CSV file containing stock data
    n_years : int, default=3
        Number of years to include in each sample (N-1 previous years + current year)
    test_split_year : int, default=2022
        Year to split train/test data (data before this year = train, from this year = test)

    Returns:
    --------
    train_df : pandas.DataFrame
        Training dataset with N-year sequences
    test_df : pandas.DataFrame
        Testing dataset with N-year sequences
    feature_columns : list
        List of feature column names for the N-year windows
    """

    # Convert Date to datetime and extract year
    df["Date"] = pd.to_datetime(df["Date"])
    df["Year"] = df["Date"].dt.year

    # Sort by Symbol and Date to ensure chronological order
    df = df.sort_values(["Symbol", "Date"])

    # Function to calculate yearly features for each symbol
    def calculate_yearly_features(group):
        """
        Calculate price change, std price for each year
        """
        # Get first and last trading days of the year
        first_price = group["Adj Close"].iloc[0]
        last_price = group["Adj Close"].iloc[-1]

        # Calculate price change for the year (as percentage)
        price_change = (last_price - first_price) / first_price

        # Calculate standard deviation of prices during the year (normalized)
        std_price = group["Adj Close"].std() / first_price

        return pd.Series(
            {
                "price_change": price_change,
                "std_price": std_price,
                "first_price": first_price,
                "last_price": last_price,
            }
        )

    # Group by Symbol and Year and apply calculations
    yearly_data = (
        df.groupby(["Symbol", "Year"]).apply(calculate_yearly_features).reset_index()
    )

    # Create target variables: price_change_of_future_year and price_std_of_future_year
    yearly_data["price_change_of_future_year"] = yearly_data.groupby("Symbol")[
        "price_change"
    ].shift(-1)
    yearly_data["price_std_of_future_year"] = yearly_data.groupby("Symbol")[
        "std_price"
    ].shift(-1)

    # Drop rows where we don't have future year data (last year for each symbol)
    yearly_data = yearly_data.dropna(
        subset=["price_change_of_future_year", "price_std_of_future_year"]
    )

    # Sort by Symbol and Year
    yearly_data = yearly_data.sort_values(["Symbol", "Year"])

    # Create samples with N years of historical data
    samples = []

    # Group by Symbol to process each symbol separately
    for symbol, symbol_group in yearly_data.groupby("Symbol"):
        # Make sure the data is sorted by year
        symbol_group = symbol_group.sort_values("Year")

        # Create rolling windows of N years
        for i in range(len(symbol_group) - n_years):
            # Get the window of N years (current + N-1 previous years)
            window = symbol_group.iloc[i : i + n_years]

            # Get the target year (the year after the current window)
            target_row = symbol_group.iloc[i + n_years]

            # Create a sample dictionary
            sample = {
                "Symbol": symbol,
                "current_year": window["Year"].iloc[-1],  # Last year in the window
                "target_year": target_row["Year"],
                "price_change_of_future_year": target_row[
                    "price_change_of_future_year"
                ],
                "price_std_of_future_year": target_row["price_std_of_future_year"],
            }

            # Add features for each year in the window
            for j, (_, year_row) in enumerate(window.iterrows()):
                year_offset = -(n_years - 1 - j)  # -2, -1, 0 for n_years=3
                sample[f"price_change_year_{year_offset}"] = year_row["price_change"]
                sample[f"std_price_year_{year_offset}"] = year_row["std_price"]

            samples.append(sample)

    # Create DataFrame from samples
    n_year_df = pd.DataFrame(samples)

    # Create feature columns list (excluding metadata and target columns)
    feature_columns = [
        col
        for col in n_year_df.columns
        if col
        not in [
            "Symbol",
            "current_year",
            "target_year",
            "price_change_of_future_year",
            "price_std_of_future_year",
        ]
    ]

    # Chronological train-test split based on current_year
    train_mask = n_year_df["current_year"] < test_split_year
    test_mask = n_year_df["current_year"] >= test_split_year

    train_df = n_year_df[train_mask].copy()
    test_df = n_year_df[test_mask].copy()

    # Display shapes and sample data
    print(f"Original yearly data shape: {yearly_data.shape}")
    print(f"N-year sequences dataset shape: {n_year_df.shape}")
    print(f"Train dataset shape: {train_df.shape}")
    print(f"Test dataset shape: {test_df.shape}")
    print(f"\nSample of N-year sequences dataset (N={n_years}):")
    print(n_year_df.head(5))

    print(f"\nFeature columns for N={n_years} years:")
    print(feature_columns)

    print("\nData preprocessing complete!")
    print(f"Available symbols: {n_year_df['Symbol'].nunique()}")
    print(
        f"Current year range: {n_year_df['current_year'].min()} - {n_year_df['current_year'].max()}"
    )
    print(
        f"Target year range: {n_year_df['target_year'].min()} - {n_year_df['target_year'].max()}"
    )

    return train_df, test_df, feature_columns


N_YEARS = 3  # Number of years to include (2 previous + current)
TEST_SPLIT_YEAR = 2022

# Run preprocessing
train_df, test_df, feature_columns = preprocess_stock_data_with_n_years(
    df=df, n_years=N_YEARS, test_split_year=TEST_SPLIT_YEAR
)

# # Save processed data
# train_df.to_csv(f'train_data_n{N_YEARS}.csv', index=False)
# test_df.to_csv(f'test_data_n{N_YEARS}.csv', index=False)

Original yearly data shape: (397, 8)
N-year sequences dataset shape: (337, 11)
Train dataset shape: (321, 11)
Test dataset shape: (16, 11)

Sample of N-year sequences dataset (N=3):
  Symbol  current_year  target_year  price_change_of_future_year  \
0     AA          1980         1981                     0.276305   
1     AA          1981         1982                     0.566627   
2     AA          1982         1983                    -0.140307   
3     AA          1983         1984                     0.091712   
4     AA          1984         1985                    -0.089242   

   price_std_of_future_year  price_change_year_-2  std_price_year_-2  \
0                  0.114729              0.086581           0.079315   
1                  0.189272              0.226800           0.058640   
2                  0.081844              0.135682           0.112251   
3                  0.063682             -0.109682           0.115252   
4                  0.079152              0.276305

In [16]:
# del df

# Тестируем baseline
Изменение цены в этом году отражает динамику следующего года

In [29]:
from sklearn.metrics import r2_score

y_pred = train_df["price_change_year_0"].values
y_true = train_df["price_change_of_future_year"].values

mean_perc_error = r2_score(y_true=y_true, y_pred=y_pred)
mean_perc_error

-0.9243029606306463

In [30]:
y_pred = train_df["std_price_year_0"].values
y_true = train_df["price_std_of_future_year"].values

mean_perc_error = r2_score(y_true=y_true, y_pred=y_pred)
mean_perc_error

-0.698888688766272

# Улучшение №0: Линейная регрессия

In [21]:
y_true[:3]

array([ 0.22680007,  0.13568169, -0.10968176])