In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import json
import seaborn as sns

pd.set_option('display.max_columns', None)

In [2]:
##################################################
nq = pd.read_csv('../../rawdata/NQ/NQ_1m_24_Jan_25_Goose.csv')
##################################################
es = pd.read_csv('../../rawdata/ES/ES_1m_24_Jan_25_Goose.csv')
##################################################
ym = pd.read_csv('../../rawdata/YM/YM_1m_24_Jan_25_Goose.csv')
##################################################
rty = pd.read_csv('../../rawdata/RTY/RTY_1m_24_Jan_25_Goose.csv')

markets = {"NQ" : nq, "ES" : es, "YM" : ym, "RTY" : rty}


In [3]:
ym.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347403 entries, 0 to 347402
Data columns (total 39 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   DateTime                        347403 non-null  object 
 1   Open                            347403 non-null  int64  
 2   High                            347403 non-null  int64  
 3   Low                             347403 non-null  int64  
 4   Close                           347403 non-null  int64  
 5   Ticks(from bar)                 347403 non-null  int64  
 6   Volume(from bar)                347403 non-null  int64  
 7   Goose Zones_MA Line             347203 non-null  float64
 8   Goose Zones_Upper Keltner       347203 non-null  float64
 9   Goose Zones_Lower Keltner       347203 non-null  float64
 10  Goose Zones_Short Trigger       347204 non-null  float64
 11  Goose Zones_Short Stop          347204 non-null  float64
 12  Goose Zones_Long

In [4]:
# Loop through all market dfs to apply the same cleaning procedures
for mark, df in markets.items():
    # Get rid of columns after goose indicator cols, since they are not needed
    df = df.iloc[:,:17]
    # Make the columns lowercase
    df.columns = df.columns.str.lower()
    # Trim to ensure no odd spacing
    df.columns = df.columns.str.strip()
    # Get rid of sub string "Goose Zones_"
    df.columns = df.columns.str.replace("goose zones_", "")    
    # Replace space with _
    df.columns = df.columns.str.replace(" ", "_")
    # Get rid of substring "(from bar)" since there are no more duplicates of that column
    df.columns = df.columns.str.replace("(from_bar)", "")

    # Specify format
    datetime_format = "%m/%d/%Y %I:%M:%S %p %z"

    # Convert the column to pandas datetime
    df["datetime"] = pd.to_datetime(df["datetime"], format=datetime_format)

    # Convert to EST
    df['datetime'] = df['datetime'].dt.tz_convert("US/Eastern")

    # limit dfs to non-null rows

    df = df[(df['goose_momentum_goosemomentum_2'].notna())]

    markets[mark] = df



In [5]:
markets["NQ"].info()

<class 'pandas.core.frame.DataFrame'>
Index: 352954 entries, 222 to 353175
Data columns (total 17 columns):
 #   Column                          Non-Null Count   Dtype                     
---  ------                          --------------   -----                     
 0   datetime                        352954 non-null  datetime64[ns, US/Eastern]
 1   open                            352954 non-null  float64                   
 2   high                            352954 non-null  float64                   
 3   low                             352954 non-null  float64                   
 4   close                           352954 non-null  float64                   
 5   ticks                           352954 non-null  int64                     
 6   volume                          352954 non-null  int64                     
 7   ma_line                         352954 non-null  float64                   
 8   upper_keltner                   352954 non-null  float64                   
 

In [6]:
markets["NQ"].head()

Unnamed: 0,datetime,open,high,low,close,ticks,volume,ma_line,upper_keltner,lower_keltner,short_trigger,short_stop,long_trigger,long_stop,arrow_marker,goose_momentum_goosemomentum,goose_momentum_goosemomentum_2
222,2024-01-25 21:00:00-05:00,17519.25,17520.75,17519.0,17520.75,42,44,17512.247485,17542.062779,17482.432191,17692.25,17750.0,17576.75,17519.0,,-87.546984,-88.426719
223,2024-01-25 21:01:00-05:00,17520.75,17524.5,17519.75,17522.25,187,202,17512.347012,17542.314642,17482.379383,17692.25,17750.0,17576.75,17519.0,,-86.385969,-87.406344
224,2024-01-25 21:02:00-05:00,17522.5,17524.75,17522.25,17523.5,95,106,17512.457987,17542.36375,17482.552225,17692.25,17750.0,17576.75,17519.0,,-85.091205,-86.248774
225,2024-01-25 21:03:00-05:00,17523.5,17525.5,17523.5,17524.75,71,86,17512.580296,17542.377543,17482.783048,17692.25,17750.0,17576.75,17519.0,,-83.651964,-84.950369
226,2024-01-25 21:04:00-05:00,17525.0,17526.25,17525.0,17526.0,79,79,17512.713825,17542.356374,17483.071277,17692.25,17750.0,17576.75,17519.0,,-82.074021,-83.512195


In [7]:
# Ok now i need to group by sessions

# Group sessions by date, basically test for globex, assign date, test for globex (12am - 8:20EST) assign to same date, then test for RTH and then do the same assign to previous date 
# for session label. Then rinse repeat, we have session dates as the key for our functions. 

def assign_session(row):
    candle_dt = row['datetime']

    hour = candle_dt.hour
    minute = candle_dt.minute
    date = candle_dt.date()
    prev_date = date - timedelta(days=1)

    if hour >= 18:
        row['sessionDate'] = date
        row['sessionType'] = "GBX"
    elif ((hour >= 0) and (hour < 9 or (hour == 9 and minute < 30))):
        row['sessionDate'] = prev_date
        row['sessionType'] = "GBX"
    elif ((hour == 9 and minute >= 30) or ((hour > 9) and (hour < 18))):
        row['sessionDate'] = prev_date
        row['sessionType'] = "RTH"
    else:
        print("aint do shit dumbass")
        print(str(hour) + " " + str(minute))

    return row


def assign_session_goose(row):
    candle_dt = row['datetime']

    hour = candle_dt.hour
    minute = candle_dt.minute
    date = candle_dt.date()
    prev_date = date - timedelta(days=1)

    if hour >= 18:
        row['sessionDate'] = date
        row['sessionType'] = "GBX"
    elif ((hour >= 0) and (hour < 8 or (hour == 8 and minute < 20))):
        row['sessionDate'] = prev_date
        row['sessionType'] = "GBX"
    elif ((hour == 8 and minute >= 20) or ((hour > 8) and (hour < 18))):
        row['sessionDate'] = prev_date
        row['sessionType'] = "RTH"
    else:
        print("aint do shit dumbass")
        print(str(hour) + " " + str(minute))

    return row

def assign_session_goose_two(row):
    candle_dt = row['datetime']

    hour = candle_dt.hour
    minute = candle_dt.minute
    date = candle_dt.date()
    prev_date = date - timedelta(days=1)

    if hour >= 18:
        row['sessionDate'] = date
        row['sessionType'] = "GBX"
    elif ((hour >= 0) and (hour < 8)):
        row['sessionDate'] = prev_date
        row['sessionType'] = "GBX"
    elif ((hour >= 8) and (hour < 18)):
        row['sessionDate'] = prev_date
        row['sessionType'] = "RTH"
    else:
        print("aint do shit dumbass")
        print(str(hour) + " " + str(minute))

    return row

def assign_session_goose_three(row):
    candle_dt = row['datetime']

    hour = candle_dt.hour
    minute = candle_dt.minute
    date = candle_dt.date()
    prev_date = date - timedelta(days=1)

    if hour >= 19:
        row['sessionDate'] = date
        row['sessionType'] = "GBX"
    elif ((hour >= 0) and (hour < 8)):
        row['sessionDate'] = prev_date
        row['sessionType'] = "GBX"
    elif ((hour >= 8) and (hour < 18)):
        row['sessionDate'] = prev_date
        row['sessionType'] = "RTH"
    else:
        print("aint do shit dumbass")
        print(str(hour) + " " + str(minute))

    return row
        

# Loop through all market dfs
        
for mark, df in markets.items():
    # Instantiate empty columns "sessionType" and "sessionDate"
    df['sessionDate'] = None
    df['sessionType'] = None

    # Apply function defined above to df
    df = df.apply(assign_session_goose_three, axis=1)

    markets[mark] = df

markets['NQ'].head()
markets['NQ'].to_csv('NQ_breaks.csv', index=False)


        

aint do shit dumbass
18 0
aint do shit dumbass
18 1
aint do shit dumbass
18 2
aint do shit dumbass
18 3
aint do shit dumbass
18 4
aint do shit dumbass
18 5
aint do shit dumbass
18 6
aint do shit dumbass
18 7
aint do shit dumbass
18 8
aint do shit dumbass
18 9
aint do shit dumbass
18 10
aint do shit dumbass
18 11
aint do shit dumbass
18 12
aint do shit dumbass
18 13
aint do shit dumbass
18 14
aint do shit dumbass
18 15
aint do shit dumbass
18 16
aint do shit dumbass
18 17
aint do shit dumbass
18 18
aint do shit dumbass
18 19
aint do shit dumbass
18 20
aint do shit dumbass
18 21
aint do shit dumbass
18 22
aint do shit dumbass
18 23
aint do shit dumbass
18 24
aint do shit dumbass
18 25
aint do shit dumbass
18 26
aint do shit dumbass
18 27
aint do shit dumbass
18 28
aint do shit dumbass
18 29
aint do shit dumbass
18 30
aint do shit dumbass
18 31
aint do shit dumbass
18 32
aint do shit dumbass
18 33
aint do shit dumbass
18 34
aint do shit dumbass
18 35
aint do shit dumbass
18 36
aint do shi

In [8]:
# Ok now i can for this immediate task just look at gbx sessions, group by session dates
# How do I handle the only one per hour break? maybe per session keep a set that stores the hours it broke in, and everytime we detect a break we check that set, if its not we count 
# It as a break and add to the set, if it is part of the set already we just ignore it 

# How to store break data: dictionary for each market, each market has a dataframe where we add the row to that dataframe where it crossed 

markets_break_dirty = {"NQ" : pd.DataFrame(columns=markets["NQ"].columns), "ES" : pd.DataFrame(columns=markets["NQ"].columns), "YM" : pd.DataFrame(columns=markets["NQ"].columns), "RTY" : pd.DataFrame(columns=markets["NQ"].columns)}

def record_break(row, mark):
    
    # Only add to breaks if it hasn't been recorded for that hour
    if (row['long_stop'] > row['low']) or (row['short_stop'] < row['high']):
        # Record the break in the market's DataFrame
        new_row_df = pd.DataFrame([row])
        markets_break_dirty[mark] = pd.concat([markets_break_dirty[mark], new_row_df], ignore_index=True)

# Loop through all market DataFrames
for mark, df in markets.items():
    # Only select GBX sessions
    df = df[df['sessionType'] == 'GBX']
    
    # Set to track hours where breaks have been recorded
    #breaks_set = set()

    # Apply the function, passing the set to track breaks per session
    df.apply(record_break, axis=1, args=(mark,))


  markets_break_dirty[mark] = pd.concat([markets_break_dirty[mark], new_row_df], ignore_index=True)
  markets_break_dirty[mark] = pd.concat([markets_break_dirty[mark], new_row_df], ignore_index=True)
  markets_break_dirty[mark] = pd.concat([markets_break_dirty[mark], new_row_df], ignore_index=True)
  markets_break_dirty[mark] = pd.concat([markets_break_dirty[mark], new_row_df], ignore_index=True)


In [9]:
for mark, df in markets_break_dirty.items():
    df = df.groupby('sessionDate')
    print(f"{mark} had {len(df)} sessions of breaks")

NQ had 104 sessions of breaks
ES had 111 sessions of breaks
YM had 87 sessions of breaks
RTY had 102 sessions of breaks


In [10]:
for mark, df in markets.items():
    df = df.groupby('sessionDate')
    print(f"{mark} had {len(df)} sessions")

NQ had 258 sessions
ES had 258 sessions
YM had 258 sessions
RTY had 258 sessions
