In [5]:
import pandas as pd
def get_dataframes(path='long_term_dataset.csv'):
    """Load the dataset CSV and split it into one DataFrame per symbol.

    Parameters
    ----------
    path : str or path-like, default 'long_term_dataset.csv'
        CSV file to read. It must contain a 'Symbol' column, which is
        used as the grouping key.

    Returns
    -------
    dict
        Maps each distinct value of the 'Symbol' column to the sub-frame
        holding that symbol's rows. Each sub-frame keeps the row labels it
        had in the full frame (the index is not reset).
    """
    df = pd.read_csv(path)
    # Iterating a groupby yields (key, sub-frame) pairs.
    symbol_dfs = {symbol: group for symbol, group in df.groupby('Symbol')}
    return symbol_dfs

In [7]:
# Load the per-symbol frames, then report how many rows each one holds.
symbol_dfs = get_dataframes()
for symbol in symbol_dfs:
    symbol_df = symbol_dfs[symbol]
    print(f"Symbol: {symbol}, Rows: {len(symbol_df)}")


Symbol: 360ONE.NS, Rows: 869
Symbol: AAVAS.NS, Rows: 1100
Symbol: ABBOTINDIA.NS, Rows: 1967
Symbol: ABCAPITAL.NS, Rows: 1372
Symbol: ABSLAMC.NS, Rows: 358
Symbol: ACC.NS, Rows: 1967
Symbol: ADANIENT.NS, Rows: 1967
Symbol: ADANIGREEN.NS, Rows: 1175
Symbol: ADANIPOWER.NS, Rows: 1967
Symbol: AFFLE.NS, Rows: 895
Symbol: AJANTPHARM.NS, Rows: 1967
Symbol: ALKEM.NS, Rows: 1789
Symbol: ALOKINDS.NS, Rows: 1967
Symbol: AMBER.NS, Rows: 1269
Symbol: AMBUJACEM.NS, Rows: 1967
Symbol: ANANDRATHI.NS, Rows: 315
Symbol: ANGELONE.NS, Rows: 611
Symbol: APLLTD.NS, Rows: 1967
Symbol: APOLLOHOSP.NS, Rows: 1967
Symbol: APOLLOTYRE.NS, Rows: 1967
Symbol: APTUS.NS, Rows: 391
Symbol: ASAHIINDIA.NS, Rows: 1967
Symbol: ASIANPAINT.NS, Rows: 1967
Symbol: ASTERDM.NS, Rows: 1251
Symbol: ASTRAZEN.NS, Rows: 1967
Symbol: ATGL.NS, Rows: 1081
Symbol: AUBANK.NS, Rows: 1408
Symbol: AUROPHARMA.NS, Rows: 1967
Symbol: AXISBANK.NS, Rows: 1967
Symbol: BAJAJ-AUTO.NS, Rows: 1967
Symbol: BAJAJFINSV.NS, Rows: 1967
Symbol: BAJAJHLDNG.N

AttributeError: 'dict' object has no attribute 'len'

In [20]:
# Inspect one symbol's frame: column names, non-null counts and dtypes.
symbol_dfs['360ONE.NS'].info()


<class 'pandas.core.frame.DataFrame'>
Index: 869 entries, 0 to 868
Data columns (total 97 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       869 non-null    object 
 1   Symbol                     869 non-null    object 
 2   Open                       869 non-null    float64
 3   Close_Today                869 non-null    float64
 4   Close_After_Year           869 non-null    float64
 5   Target_Return              869 non-null    float64
 6   High                       869 non-null    float64
 7   Low                        869 non-null    float64
 8   Volume                     869 non-null    int64  
 9   Dividends                  869 non-null    float64
 10  Stock Splits               869 non-null    float64
 11  volume_adi                 869 non-null    float64
 12  volume_obv                 869 non-null    int64  
 13  volume_cmf                 869 non-null    float64
 14 

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def split_data(df, test_size=0.15, val_size=0.15, target_col='Target_Return'):
    """Split a frame into train / validation / test parts without shuffling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    test_size : float, default 0.15
        Fraction of all rows assigned to the test set.
    val_size : float, default 0.15
        Fraction of all rows assigned to the validation set.
    target_col : str, default 'Target_Return'
        Name of the column used as the prediction target.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test, feature_cols)``
        where ``feature_cols`` is the Index of columns used as features.

    Notes
    -----
    Both splits use ``shuffle=False``, so row order is preserved: the test
    rows come after the validation rows, which come after the training rows.
    """
    # Columns that are never used as model inputs.
    excluded = ['Date', 'Symbol', 'Target_Return', 'Close_After_Year', 'Close_Today']
    # The target itself must also be excluded; otherwise a non-default
    # target_col would leak straight into the feature matrix.
    feature_cols = df.columns.difference(excluded + [target_col])
    X = df[feature_cols]
    y = df[target_col]

    # First split: carve the test set off the end of the data.
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    # val_size is a fraction of the full data, but the second split operates
    # on what is left after removing the test rows, so rescale it.
    val_ratio = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_ratio, shuffle=False)

    return X_train, X_val, X_test, y_train, y_val, y_test, feature_cols

def select_best_features(X_train, X_val, y_train, y_val, feature_cols, max_features=20,
                         n_estimators=100, random_state=42):
    """Rank features with a random forest and score the top ones on validation data.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Training and validation feature frames (indexed by column label below).
    y_train, y_val : array-like
        Training and validation targets.
    feature_cols : sequence of str
        Kept for backward compatibility. Importances are labelled with
        ``X_train.columns`` (the columns the model is actually fitted on), so
        the ordering of this argument can no longer mislabel them.
    max_features : int, default 20
        Maximum number of top-ranked features to keep.
    n_estimators : int, default 100
        Number of trees in each forest.
    random_state : int, default 42
        Seed passed to both forests.

    Returns
    -------
    tuple
        ``(top_features, val_loss)``: the list of selected column names, most
        important first, and the validation mean squared error of a forest
        trained on only those columns.
    """
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train, y_train)

    # Label importances with the fitted frame's own columns so that names and
    # values stay aligned regardless of how the caller ordered feature_cols.
    feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
    top_features = feature_importances.sort_values(ascending=False).head(max_features).index.tolist()

    # Refit on the reduced feature set and measure its validation error.
    rf_top = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_top.fit(X_train[top_features], y_train)
    y_pred = rf_top.predict(X_val[top_features])
    val_loss = mean_squared_error(y_val, y_pred)

    return top_features, val_loss

def process_all_symbols(symbol_dfs, max_features=20):
    """Run the split + feature-selection pipeline for every symbol.

    Parameters
    ----------
    symbol_dfs : dict
        Maps a symbol to its DataFrame.
    max_features : int, default 20
        Forwarded to ``select_best_features``.

    Returns
    -------
    dict
        Maps each successfully processed symbol to a dict with keys
        ``'top_features'`` and ``'val_loss'``. A symbol whose processing
        raises is reported via ``print`` and left out of the result.
    """
    results = dict()

    for symbol in symbol_dfs:
        df = symbol_dfs[symbol]
        try:
            parts = split_data(df)
            # The test portions are not needed for feature selection.
            X_train, X_val, _, y_train, y_val, _, feature_cols = parts
            top_feats, loss = select_best_features(
                X_train,
                X_val,
                y_train,
                y_val,
                feature_cols,
                max_features=max_features,
            )
            results[symbol] = dict(top_features=top_feats, val_loss=loss)
            print(f"✅ {symbol}: Top {len(top_feats)} features, Validation MSE = {loss:.5f}")
        except Exception as e:
            # Best effort: report the failure and continue with the next symbol.
            print(f"❌ Error processing {symbol}: {e}")

    return results


In [23]:
import json

# Reload the per-symbol frames and run feature selection for every symbol,
# keeping up to 25 features each.
symbol_dfs = get_dataframes()
selection_results = process_all_symbols(symbol_dfs, max_features=25)

# Write the per-symbol selection results to disk as indented JSON.
with open("selected_features.json", "w") as f:
    json.dump(obj=selection_results, fp=f, indent=4)


✅ 360ONE.NS: Top 25 features, Validation MSE = 0.29933
✅ AAVAS.NS: Top 25 features, Validation MSE = 0.10463
✅ ABBOTINDIA.NS: Top 25 features, Validation MSE = 0.05632
✅ ABCAPITAL.NS: Top 25 features, Validation MSE = 0.18189
✅ ABSLAMC.NS: Top 25 features, Validation MSE = 0.01431
✅ ACC.NS: Top 25 features, Validation MSE = 0.04069
✅ ADANIENT.NS: Top 25 features, Validation MSE = 2.16738
✅ ADANIGREEN.NS: Top 25 features, Validation MSE = 1.97732
✅ ADANIPOWER.NS: Top 25 features, Validation MSE = 2.38758
✅ AFFLE.NS: Top 25 features, Validation MSE = 0.05693
✅ AJANTPHARM.NS: Top 25 features, Validation MSE = 0.18707
✅ ALKEM.NS: Top 25 features, Validation MSE = 0.15899
✅ ALOKINDS.NS: Top 25 features, Validation MSE = 0.18408
✅ AMBER.NS: Top 25 features, Validation MSE = 1.38475
✅ AMBUJACEM.NS: Top 25 features, Validation MSE = 0.56507
✅ ANANDRATHI.NS: Top 25 features, Validation MSE = 0.27023
✅ ANGELONE.NS: Top 25 features, Validation MSE = 0.98454
✅ APLLTD.NS: Top 25 features, Validatio