In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Colab-only helper module; `files` is reused by the export cell at the end of
# the notebook, so this cell must run before that one.
from google.colab import files
# Prompt for a file upload in the browser. The analysis below expects a file
# named 'synthetic_nifty50.csv' to end up in the working directory.
# NOTE(review): this blocks "Run All" until a file is chosen — confirm intended.
uploaded = files.upload()

Saving synthetic_nifty50.csv to synthetic_nifty50.csv


In [None]:
# Read the uploaded price history into the notebook's working frame.
df = pd.read_csv(filepath_or_buffer='synthetic_nifty50.csv')

# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Unnamed: 0,Date,Stock,Open,High,Low,Close,Volume
0,2018-01-01,STOCK_1,413.050297,414.995717,405.316197,410.008963,8659973
1,2018-01-02,STOCK_1,408.454368,416.730164,408.799659,408.803855,3874853
2,2018-01-03,STOCK_1,408.444774,419.520963,404.238471,411.904878,2061189
3,2018-01-04,STOCK_1,421.530235,424.997934,417.329222,420.579207,6114596
4,2018-01-05,STOCK_1,416.487741,426.378928,417.310594,419.802902,3548327


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
Date,0
Stock,0
Open,0
High,0
Low,0
Close,0
Volume,0


In [None]:
# Print the frame's schema: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62500 entries, 0 to 62499
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    62500 non-null  object 
 1   Stock   62500 non-null  object 
 2   Open    62500 non-null  float64
 3   High    62500 non-null  float64
 4   Low     62500 non-null  float64
 5   Close   62500 non-null  float64
 6   Volume  62500 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 3.3+ MB


In [None]:
# Parse the date strings into datetime64 so later sorting is chronological
# rather than textual.
df["Date"] = pd.to_datetime(df["Date"])

# Encode tickers as compact integer codes. Codes are assigned in the sorted
# order of the category labels (for strings that is lexicographic, so a label
# such as "STOCK_10" would sort before "STOCK_2"); they therefore cannot be
# decoded by eye. Keep the code -> ticker lookup so results can be mapped back.
# NOTE: run this cell once per fresh load; re-running it on already-encoded
# codes rebuilds the lookup from the integers and loses the ticker names.
stock_categorical = df["Stock"].astype("category")
stock_code_to_label = dict(enumerate(stock_categorical.cat.categories))
df["Stock"] = stock_categorical.cat.codes
df.head()

Unnamed: 0,Date,Stock,Open,High,Low,Close,Volume
0,2018-01-01,0,413.050297,414.995717,405.316197,410.008963,8659973
1,2018-01-02,0,408.454368,416.730164,408.799659,408.803855,3874853
2,2018-01-03,0,408.444774,419.520963,404.238471,411.904878,2061189
3,2018-01-04,0,421.530235,424.997934,417.329222,420.579207,6114596
4,2018-01-05,0,416.487741,426.378928,417.310594,419.802902,3548327


In [None]:
# Order rows by stock code and then by date, so that within each stock the
# previous row is the previous trading session.
df = df.sort_values(by=["Stock", "Date"])

# Daily log return per stock: ln(Close_t / Close_{t-1}).
# The prior close comes from a grouped shift, so it never crosses from one
# stock into the next.
previous_close = df.groupby("Stock")["Close"].shift(1)
df["LogReturn"] = np.log(df["Close"] / previous_close)

# Preview the result.
print(df.head(10))

# The first row of every stock has no prior close, so its return is NaN.
missing_returns = df["LogReturn"].isna().sum()
print("\nNaNs in LogReturn:", missing_returns)

        Date  Stock        Open        High         Low       Close   Volume  \
0 2018-01-01      0  413.050297  414.995717  405.316197  410.008963  8659973   
1 2018-01-02      0  408.454368  416.730164  408.799659  408.803855  3874853   
2 2018-01-03      0  408.444774  419.520963  404.238471  411.904878  2061189   
3 2018-01-04      0  421.530235  424.997934  417.329222  420.579207  6114596   
4 2018-01-05      0  416.487741  426.378928  417.310594  419.802902  3548327   
5 2018-01-08      0  413.090963  423.019570  413.974316  415.074707  2789464   
6 2018-01-09      0  409.623874  414.860675  402.110994  407.189098  2788565   
7 2018-01-10      0  406.345226  409.442279  403.830865  405.539264  1406688   
8 2018-01-11      0  407.710671  415.186128  402.742975  409.815442  4095319   
9 2018-01-12      0  407.472124  406.378448  398.350100  405.548390  7473201   

   LogReturn  
0        NaN  
1  -0.002944  
2   0.007557  
3   0.020840  
4  -0.001848  
5  -0.011327  
6  -0.019181  

In [None]:
# Discard rows whose log return is undefined (the first observation per stock).
has_return = df["LogReturn"].notna()
df = df.loc[has_return]


In [None]:
# Rolling volatility and momentum per stock.
# Look-back lengths, measured in rows (observations) within each stock.
window_sizes = [10, 20, 60]

# The per-stock grouping does not depend on the window length, so build it
# once instead of regrouping the frame twice on every loop iteration.
returns_by_stock = df.groupby("Stock")["LogReturn"]

for w in window_sizes:
    # Rolling volatility: standard deviation of the last w log returns.
    # The first w-1 rows of each stock are NaN (window not yet full).
    df[f"Vol_{w}"] = returns_by_stock.transform(lambda x: x.rolling(w).std())

    # Rolling momentum: mean of the last w log returns.
    df[f"Mom_{w}"] = returns_by_stock.transform(lambda x: x.rolling(w).mean())

print("✅ Rolling features created!")
print(df.head(10))


✅ Rolling features created!
         Date  Stock        Open        High         Low       Close   Volume  \
1  2018-01-02      0  408.454368  416.730164  408.799659  408.803855  3874853   
2  2018-01-03      0  408.444774  419.520963  404.238471  411.904878  2061189   
3  2018-01-04      0  421.530235  424.997934  417.329222  420.579207  6114596   
4  2018-01-05      0  416.487741  426.378928  417.310594  419.802902  3548327   
5  2018-01-08      0  413.090963  423.019570  413.974316  415.074707  2789464   
6  2018-01-09      0  409.623874  414.860675  402.110994  407.189098  2788565   
7  2018-01-10      0  406.345226  409.442279  403.830865  405.539264  1406688   
8  2018-01-11      0  407.710671  415.186128  402.742975  409.815442  4095319   
9  2018-01-12      0  407.472124  406.378448  398.350100  405.548390  7473201   
10 2018-01-15      0  402.188266  407.849109  400.716082  402.786973  9024053   

    LogReturn    Vol_10    Mom_10  Vol_20  Mom_20  Vol_60  Mom_60  
1   -0.00294

In [None]:
# Keep only rows where every rolling feature exists.
# The column list is derived from window_sizes (same naming as the cell that
# builds the features) rather than hard-coding the 60-row window, so changing
# or extending window_sizes cannot silently leave NaN features behind.
rolling_feature_cols = [
    f"{prefix}_{w}" for w in window_sizes for prefix in ("Vol", "Mom")
]
df = df.dropna(subset=rolling_feature_cols)

print("✅ Cleaned DataFrame shape:", df.shape)
print(df.head())


✅ Cleaned DataFrame shape: (59500, 14)
         Date  Stock        Open        High         Low       Close   Volume  \
60 2018-03-26      0  449.361557  457.370477  445.154061  450.027100  1133255   
61 2018-03-27      0  456.823877  462.912100  450.920930  454.172575  2267331   
62 2018-03-28      0  448.856032  457.469435  445.422329  452.418693  9297241   
63 2018-03-29      0  448.006484  453.820609  443.166714  451.739078  3970433   
64 2018-03-30      0  456.364975  454.629745  448.341851  453.659884  6428751   

    LogReturn    Vol_10    Mom_10    Vol_20    Mom_20    Vol_60    Mom_60  
60   0.002653  0.004583  0.003581  0.005344  0.002938  0.009249  0.001552  
61   0.009169  0.004330  0.004754  0.005517  0.003290  0.009281  0.001754  
62  -0.003869  0.003830  0.002997  0.005737  0.003014  0.009277  0.001564  
63  -0.001503  0.004074  0.002488  0.005722  0.003032  0.008932  0.001191  
64   0.004243  0.004094  0.002539  0.005535  0.002777  0.008932  0.001293  


In [None]:
# Write the feature table to disk without the index column, then trigger a
# browser download of that file via the Colab `files` helper imported in the
# upload cell.
export_name = 'Stock_pre.csv'
df.to_csv(export_name, index=False)
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>