In [1]:
import json

In [2]:
class Config:
    ratios_path = '/home/baskar/Desktop/StockPricePred/data/FMP'
    prices_path = '/home/baskar/Desktop/StockPricePred/data/NASDAQ_100'

config = Config()

In [3]:
import json
import pandas as pd
from pathlib import Path
import numpy as np
import os

class RatiosProcessor:
    def __init__(self, ratios_path:str):
        self.ratios_path = Path(ratios_path)
    
    def load_ratios(self):
        all_data = []

        for filename in os.listdir(self.ratios_path):
            if filename.endswith('.json'):
                file_path = os.path.join(self.ratios_path, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        data = json.load(file)

                        if isinstance(data, dict):
                            all_data.append(data)
                        elif isinstance(data, list):
                            all_data.extend(data)
                        else:
                            print(f"Skipping {filename}: Unsupported JSON format")
                except Exception as e:
                    print(f"Error reading {filename}: {e}")

        return pd.DataFrame(all_data)
    
    def fill_missing_values(self):
        ratios_df = self.load_ratios()
        ratios_df['date'] = pd.to_datetime(ratios_df['date'])
        ratios_df = ratios_df.sort_values(by=['symbol', 'date']).reset_index(drop=True)

        numeric_cols = ratios_df.select_dtypes(include='number').columns
        ratios_df_filled = ratios_df.copy()

        for symbol in ratios_df['symbol'].unique():
            symbol_mask = ratios_df['symbol'] == symbol #note
            symbol_df = ratios_df[symbol_mask].sort_values(by='date').reset_index()

            for col in numeric_cols:
                for i in range(len(symbol_df)):
                    val = symbol_df.loc[i, col] #note
                    if pd.isna(val):
                        original_idx = symbol_df.loc[i, 'index']
                        prev_val = np.nan
                        next_val = np.nan

                        #look for previous value
                        for j in range(i-1, -1 -1):
                            prev_val = symbol_df.loc[j, col]
                            if not pd.isna(prev_val):
                                ratios_df_filled.at[original_idx, col] = prev_val
                                break
                            else:
                                for k in range(i+1, len(symbol_df)):
                                    next_val = symbol_df.loc[k, col]
                                    if not pd.isna(next_val):
                                        ratios_df_filled.at[original_idx, col] = next_val
                                        break
        return ratios_df_filled

    def get_missing_quarters(self):
        ratios_df = self.fill_missing_values()
        ratios_df['calendarYear'] = ratios_df['calendarYear'].astype(str)
        ratios_df['period'] = ratios_df['period'].str.upper()

        all_quarters = ['Q1', 'Q2', 'Q3', 'Q4']
        quarter_months = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}

        rows_to_add = []
        grouped = ratios_df.groupby(['symbol', 'calendarYear'])

        for (symbol, year), group in grouped:
            present_quarters = set(group['period'].tolist())

            for i, quarter in enumerate(all_quarters):
                if quarter not in present_quarters:
                    new_row = None

                    # Try to back-fill from earlier quarters
                    for prev_q in all_quarters[:i][::-1]:
                        prev_rows = group[group['period'] == prev_q]
                        if not prev_rows.empty:
                            new_row = prev_rows.iloc[0].copy()
                            break

                    # If not found, try to forward-fill from next quarters
                    if new_row is None:
                        for next_q in all_quarters[i+1:]:
                            next_rows = group[group['period'] == next_q]
                            if not next_rows.empty:
                                new_row = next_rows.iloc[0].copy()
                                break

                    if new_row is not None:
                        new_row = new_row.copy()
                        new_row['period'] = quarter
                        new_row['date'] = pd.to_datetime(f"{year}-{quarter_months[quarter]}")
                        rows_to_add.append(new_row)

        # Add filled rows and sort
        ratios_df_filled = pd.concat([ratios_df, pd.DataFrame(rows_to_add)], ignore_index=True)
        ratios_df_filled = ratios_df_filled.sort_values(by=['symbol', 'calendarYear', 'period']).reset_index(drop=True)

        return ratios_df_filled


rp = RatiosProcessor(config.ratios_path)
ratios_df = rp.get_missing_quarters()

In [4]:
import os
import json
import pandas as pd
from pathlib import Path
from typing import Dict

class PriceDataProcessor:
    def __init__(self, data_path: str):
        
        self.data_path = Path(data_path)
        self.symbol_dataframes: Dict[str, pd.DataFrame] = {}

    def load_price_data(self) -> Dict[str, pd.DataFrame]:
        if not self.data_path.exists() or not self.data_path.is_dir():
            raise FileNotFoundError(f"Invalid directory path: {self.data_path}")

        for file in self.data_path.glob("*.json"):
            try:
                symbol = file.stem.split('_')[0]  # Get symbol from filename
                with open(file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                if isinstance(data, list):
                    df = pd.DataFrame(data)
                    df['date'] = pd.to_datetime(df['date'])
                    df = df.sort_values(by='date').reset_index(drop=True)
                    
                    if symbol in self.symbol_dataframes:
                        self.symbol_dataframes[symbol] = pd.concat(
                            [self.symbol_dataframes[symbol], df],
                            ignore_index=True
                        ).sort_values(by='date').reset_index(drop=True)
                    else:
                        self.symbol_dataframes[symbol] = df
                else:
                    print(f"Skipping {file.name}: Expected a list of records.")
            except Exception as e:
                print(f"Error processing {file.name}: {e}")

        return self.symbol_dataframes


In [11]:
processor = PriceDataProcessor(data_path=config.prices_path)
symbol_dfs = processor.load_price_data()

# Access DataFrame for a specific symbol
aapl_df = symbol_dfs.get("AAPL")
#symbol_dfs.head()
aapl_df.head()


Error processing INTC_10Y_DAILY.json: 'date'
Error processing ISRG_10Y_DAILY.json: 'date'


Unnamed: 0,date,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
0,1980-12-12 00:00:00+00:00,28.75,28.87,28.75,28.75,2093900,0.098852,0.099265,0.098852,0.098852,469034069,0.0,1.0
1,1980-12-15 00:00:00+00:00,27.25,27.38,27.25,27.38,785200,0.093695,0.094142,0.093695,0.094142,175884975,0.0,1.0
2,1980-12-16 00:00:00+00:00,25.25,25.37,25.25,25.37,472000,0.086818,0.087231,0.086818,0.087231,105728105,0.0,1.0
3,1980-12-17 00:00:00+00:00,25.87,26.0,25.87,25.87,385900,0.08895,0.089397,0.08895,0.08895,86441686,0.0,1.0
4,1980-12-18 00:00:00+00:00,26.63,26.75,26.63,26.63,327900,0.091563,0.091976,0.091563,0.091563,73449673,0.0,1.0


In [12]:
import numpy as np
import pandas as pd

class DataGen:
    def __init__(self, ratios_df: pd.DataFrame, price_df: pd.DataFrame):
        self.ratios_df = ratios_df.copy()
        self.price_df = price_df.copy()
        self._prepare_data()

    def _prepare_data(self):
        self.ratios_df['date'] = pd.to_datetime(self.ratios_df['date']).dt.tz_localize(None)
        self.price_df['date'] = pd.to_datetime(self.price_df['date']).dt.tz_localize(None)
        
        self.ratios_df = self.ratios_df.sort_values(by='date').reset_index(drop=True)
        self.price_df = self.price_df.sort_values(by='date').reset_index(drop=True)

    def label_single_symbol(self, symbol: str, start: int = 0, end: int = None) -> dict:
        symbol_df = self.ratios_df[self.ratios_df['symbol'] == symbol].reset_index(drop=True)
        if symbol_df.empty:
            raise ValueError(f"No data found for symbol: {symbol}")

        total_periods = len(symbol_df)
        end = total_periods - 1 if end is None else end

        if start < 0 or end >= total_periods or start > end:
            raise ValueError(f"Invalid range: start={start}, end={end}, available={total_periods}")

        output = {}

        for i in range(start, end + 1):
            row = symbol_df.iloc[i]
            date = row['date']

            # Get closest current and future price
            current_row = self.price_df[self.price_df['date'] >= date].head(1)
            if current_row.empty:
                continue

            future_date = date + pd.Timedelta(days=30)
            future_row = self.price_df[self.price_df['date'] >= future_date].head(1)
            if future_row.empty:
                continue

            current_price = current_row['adjClose'].values[0]
            future_price = future_row['adjClose'].values[0]

            change_pct = (future_price - current_price) / current_price * 100
            label = 'BUY' if change_pct > 5 else 'SELL'

            # Extract numeric features only
            ratio_values = row.drop(labels=['symbol', 'date'], errors='ignore')
            ratio_values = [v for v in ratio_values.values if np.issubdtype(type(v), np.number) and not pd.isna(v)]

            output[i] = [ratio_values, label]

        output['symbol'] = symbol
        return output


In [13]:
# 1. Process and get filled ratios
ratios_processor = RatiosProcessor(config.ratios_path)
ratios_df_filled = ratios_processor.get_missing_quarters()

# 2. Load stock prices
price_processor = PriceDataProcessor(config.prices_path)
price_data = price_processor.load_price_data()

# 3. Choose a symbol
symbol = "AAPL"
price_df_symbol = price_data.get(symbol)
if price_df_symbol is not None:
    data_gen = DataGen(ratios_df=ratios_df_filled, price_df=price_df_symbol)
    result = data_gen.label_single_symbol(symbol=symbol, start=0, end=10)
    result
else:
    print(f"No price data available for {symbol}")


Error processing INTC_10Y_DAILY.json: 'date'
Error processing ISRG_10Y_DAILY.json: 'date'


In [14]:
print(result)

{0: [[np.float64(2.7830060934326335), np.float64(2.217670954637779), np.float64(1.140825998645904), np.float64(48.37197949719307), np.float64(0.0), np.float64(48.37197949719307), np.float64(0.0), np.float64(48.37197949719307), np.float64(1.0), np.float64(-3.3226751281425435), np.float64(-6.645350256285087), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(-2.1243757802746566), np.float64(0.0), np.float64(2.0), np.float64(-3.3226751281425435), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.7006357856494096), np.float64(1.8605812897366032), np.float64(0.0), np.float64(0.0), np.float64(4.532079646017699), np.float64(0.43762016663106174), np.float64(0.024324402355930246), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.7695378252497729), np.float64(1.7695378252497729), np.float64(2.3776679834024894), np.float64

In [15]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

class XGBoost:
    def __init__(self, output_dict: dict):
        self.output_dict = output_dict
        self.X, self.y = self._convert_output_to_xy()
        self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        self.is_trained = False

    def _convert_output_to_xy(self):
        X, y = [], []
        for k, v in self.output_dict.items():
            if k == 'symbol':
                continue
            features, label = v
            X.append(features)
            y.append(1 if label == 'BUY' else 0)
        return np.array(X), np.array(y)

    def train(self, test_size=0.3, random_state=42):
        if len(self.X) < 2:
            raise ValueError("Not enough samples to train. You need at least 2.")

        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y,
            test_size=test_size,
            random_state=random_state,
            stratify=self.y if len(set(self.y)) > 1 else None
        )

        self.model.fit(X_train, y_train)
        self.is_trained = True

        y_pred = self.model.predict(X_test)

        print("Model Trained.")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

    def predict(self, features: list):
        prediction = self.model.predict([features])[0]
        return "BUY" if prediction == 1 else "SELL"


In [16]:
# Step 1: Pass your labeled dictionary from DataGen
xgb_model = XGBoost(result)

# Step 2: Train the classifier
xgb_model.train()

# Step 3: Predict on a new quarter's ratios
some_ratios = result[0][0]  # just an example from existing
xgb_model.predict(some_ratios)  # returns "BUY" or "SELL"


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (11,) + inhomogeneous part.