# Stock exchange prediction

## Data preparation

The cell below removes any previously extracted `data/ETFs` and `data/Stocks` folders and then unzips the `data/data.zip` archive into the `data` folder.

In [None]:
import shutil
import zipfile
import os

# Folders left over from an earlier extraction are deleted first, so the
# archive is always unpacked into a clean location.
dirs_to_remove = ['data/ETFs', 'data/Stocks']
for d in dirs_to_remove:
    if not os.path.exists(d):
        # Nothing was extracted here before; nothing to clean up.
        continue
    shutil.rmtree(d)

# Unpack every member of the archive into the data folder.
with zipfile.ZipFile('data/data.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')

## Loading data

In [None]:
import bisect
import glob
import os
from itertools import accumulate

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

class StockDataset(Dataset):
    """Row-level dataset over every ``*.txt`` price file found in a folder.

    Each file is read as a comma-separated table. Files that cannot be read,
    or that lack a ``Date`` or ``Close`` column, are skipped and recorded in
    ``skipped_files``. The remaining tables are sorted by ``Date`` and
    logically concatenated in file order, so a single integer index addresses
    one row of one file.

    An item is a ``(features, target)`` pair of float32 tensors: ``target`` is
    the row's ``Close`` value and ``features`` holds every other column except
    ``Date`` (values that are not numeric become NaN).
    """

    def __init__(self, folder):
        """Load all usable ``*.txt`` files from ``folder`` into memory."""
        # glob returns files in filesystem order; sorting makes the global row
        # order (and therefore any index-based split) reproducible.
        self.files = sorted(glob.glob(os.path.join(folder, '*.txt')))
        self.data = []
        # Files that were dropped, kept so the skip is observable rather
        # than silent.
        self.skipped_files = []
        for file in self.files:
            try:
                df = pd.read_csv(file, sep=',')
            except Exception:
                # Best-effort loading: an unreadable (e.g. empty) file must
                # not abort the whole dataset.
                self.skipped_files.append(file)
                continue
            if 'Close' not in df.columns or 'Date' not in df.columns:
                self.skipped_files.append(file)
                continue
            self.data.append(df.sort_values('Date'))
        self.lengths = [len(df) for df in self.data]
        # Running totals: cumulative_lengths[i] is the number of rows in
        # files 0..i, i.e. the first global index that lies past file i.
        self.cumulative_lengths = list(accumulate(self.lengths))

    def __len__(self):
        """Return the total number of rows across all loaded files."""
        return self.cumulative_lengths[-1] if self.cumulative_lengths else 0

    def __getitem__(self, idx):
        """Return the ``(features, target)`` tensors for global row ``idx``.

        Negative indices count from the end, as for a Python sequence.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError("Index out of range")

        # First file whose running total exceeds idx; bisect_right also steps
        # over zero-row files, which repeat the previous running total.
        file_pos = bisect.bisect_right(self.cumulative_lengths, idx)
        rows_before = self.cumulative_lengths[file_pos - 1] if file_pos else 0
        row = self.data[file_pos].iloc[idx - rows_before]

        # TODO(Jakub Drzewiecki): Unsure if we want to drop Date column
        features = row.drop(['Date', 'Close'])
        # The row is a mixed-type Series, so coerce to numbers explicitly.
        features = pd.to_numeric(features, errors='coerce').astype('float32')
        features_tensor = torch.tensor(features.values, dtype=torch.float32)
        target_tensor = torch.tensor(row['Close'], dtype=torch.float32)
        return features_tensor, target_tensor

def split_dataset(dataset, train_ratio=0.8):
    """Cut ``dataset`` into a leading train subset and a trailing test subset.

    The split is positional and unshuffled: the first
    ``int(len(dataset) * train_ratio)`` indices go to the train subset and
    the remaining indices go to the test subset.

    Returns:
        A ``(train_subset, test_subset)`` tuple of ``torch.utils.data.Subset``.
    """
    n_total = len(dataset)
    cut = int(n_total * train_ratio)
    all_indices = list(range(n_total))
    make_subset = torch.utils.data.Subset
    head = make_subset(dataset, all_indices[:cut])
    tail = make_subset(dataset, all_indices[cut:])
    return head, tail

# One dataset per asset class, each read from its own data folder.
etf_dataset, stock_dataset = (
    StockDataset(folder) for folder in ('data/ETFs', 'data/Stocks')
)

# Positional train/test split using split_dataset's default ratio.
etf_train, etf_test = split_dataset(dataset=etf_dataset)
stock_train, stock_test = split_dataset(dataset=stock_dataset)

# Training loaders shuffle their samples; test loaders keep dataset order.
etf_loader = DataLoader(dataset=etf_train, batch_size=32, shuffle=True)
etf_test_loader = DataLoader(dataset=etf_test, batch_size=32, shuffle=False)
stock_loader = DataLoader(dataset=stock_train, batch_size=32, shuffle=True)
stock_test_loader = DataLoader(dataset=stock_test, batch_size=32, shuffle=False)