In [4]:
# quant_scoring.py
# Enhanced ML pipeline: scores CSE companies by trading data + macroeconomic indicators

import os
import glob
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Configuration ---
DATA_DIR = "SPSL20/"          # folder with per-company CSVs
MACRO_FILES = {
    'gdp_growth': 'GDP growth.csv',
    'gdp_per_capita': 'GDP per Capita.csv',
    'inflation': 'Inflation.csv'
}
FEATURE_COLS = [
    'mean_return', 'volatility', 'sharpe_ratio', 'max_drawdown',
    'momentum_20d', 'volume_change', 'turnover_change'
]

# --- Utility ---
def clean_numeric_series(s: pd.Series) -> pd.Series:
    return s.astype(str).str.replace(',', '').astype(float)

# --- Company Data Loader ---
def load_and_process(file_path: str) -> pd.Series:
    df = pd.read_csv(file_path, parse_dates=['Trade Date'], dayfirst=True, dtype=str)
    for col in ['Open (Rs.)','High (Rs.)','Low (Rs.)','Close (Rs.)',
                'TradeVolume','ShareVolume','Turnover (Rs.)']:
        df[col] = clean_numeric_series(df[col])
    df.sort_values('Trade Date', inplace=True)
    df['return'] = df['Close (Rs.)'].pct_change()

    # Calculate metrics
    mean_return = df['return'].mean()
    volatility = df['return'].std()
    sharpe_ratio = (mean_return/volatility)*np.sqrt(252) if volatility else 0
    cum = (1+df['return']).cumprod()
    draw = cum / cum.cummax() - 1
    max_dd = draw.min()
    momentum = (df['Close (Rs.)'].iloc[-1] / df['Close (Rs.)'].iloc[-21] - 1) if len(df)>=21 else np.nan

    if len(df)>=40:
        vol20 = df['ShareVolume'].iloc[-20:].mean()
        vol40 = df['ShareVolume'].iloc[-40:-20].mean()
        toc20 = df['Turnover (Rs.)'].iloc[-20:].mean()
        toc40 = df['Turnover (Rs.)'].iloc[-40:-20].mean()
        vol_change = (vol20/vol40 - 1) if vol40 else np.nan
        turn_change = (toc20/toc40 - 1) if toc40 else np.nan
    else:
        vol_change = turn_change = np.nan

    return pd.Series([
        mean_return, volatility, sharpe_ratio, max_dd,
        momentum, vol_change, turn_change
    ], index=FEATURE_COLS)

# --- Macroeconomic Loader ---
def load_macro_series(path: str) -> pd.Series:
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Macro file not found: {path}")
    df = pd.read_csv(path, header=None)
    years = df.iloc[0].astype(int).tolist()
    values = df.iloc[1].astype(float).tolist()
    return pd.Series(data=values, index=years)

# --- Build Features ---
def build_feature_matrix() -> pd.DataFrame:
    # Company features
    files = glob.glob(os.path.join(DATA_DIR, '*.csv')) + glob.glob(os.path.join(DATA_DIR, '*.CSV'))
    if not files:
        raise ValueError(f"No CSV files found in data directory '{DATA_DIR}'")

    feats, tickers = [], []
    for f in files:
        try:
            feats.append(load_and_process(f))
            tickers.append(os.path.splitext(os.path.basename(f))[0])
        except Exception as e:
            print(f"Skipping {f}: {e}")

    company_df = pd.DataFrame(feats, index=tickers)
    print(f"Loaded company features: {company_df.shape}")

    # Macro features
    macro_data = {}
    for name, path in MACRO_FILES.items():
        try:
            series = load_macro_series(path)
        except Exception as e:
            raise
        latest = series.index.max()
        macro_data[f"{name}_latest"] = series[latest]
        macro_data[f"{name}_yoy"] = (series[latest]/series.get(latest-1, np.nan) - 1)

    # Combine
    macro_df = pd.DataFrame([macro_data] * len(company_df), index=company_df.index)
    full_df = pd.concat([company_df, macro_df], axis=1)
    print(f"Combined feature matrix: {full_df.shape}")
    return full_df

# --- Scoring ---
def compute_scores(feature_df: pd.DataFrame) -> pd.DataFrame:
    print(f"Scoring features of shape: {feature_df.shape}")
    if feature_df.empty or feature_df.shape[1] == 0:
        raise ValueError(f"Empty feature DataFrame {feature_df.shape}, check data inputs.")
    # Impute & scale
    feature_df = feature_df.fillna(feature_df.mean())
    scaler = StandardScaler()
    scaled = scaler.fit_transform(feature_df)

    # PCA -> score
    pca = PCA(n_components=1)
    raw = pca.fit_transform(scaled).flatten()
    score_pct = pd.Series(raw, index=feature_df.index).rank(pct=True)
    return pd.DataFrame({'score': score_pct})

# --- Main ---
def main():
    features = build_feature_matrix()
    scores = compute_scores(features)
    scores.to_csv('SPSL20 company_scores_with_macro.csv')
    print(scores.head())

if __name__ == '__main__':
    main()


Loaded company features: (44, 7)
Combined feature matrix: (44, 13)
Scoring features of shape: (44, 13)
                 score
trades_CCS    0.977273
trades_COMBN  0.295455
trades_COMBX  0.340909
trades_DFCC   0.534091
trades_DIAL   0.034091
