![QuantConnect Logo](https://cdn.quantconnect.com/web/i/icon.png)
<hr>

In [20]:
from QuantConnect import *
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm

# Initialize QuantBook; `qb` is the handle used below for the contract-list
# lookup and for every history request.
qb = QuantBook()

# Download window: from 1 Jan 2023 up to the moment this cell is executed.
# Both values are naive datetimes (datetime.now() carries no tzinfo).
start_date = datetime(2023, 1, 1)
end_date = datetime.now()

# Add the E-mini S&P 500 future (minute resolution, CME market).
# `emini.Symbol` is what download_chunk passes to the chain provider to
# enumerate the individual contracts.
emini = qb.AddFuture(Futures.Indices.SP500EMini, Resolution.Minute, Market.CME)

def download_chunk(start, end):
    """Download minute bars for every contract listed at ``start``.

    The contract list is requested from the future chain provider as of
    ``start``; each contract's TradeBar history between ``start`` and ``end``
    is then fetched and reduced to the 'open', 'close' and 'volume' columns.

    Returns the concatenation of all non-empty per-contract frames, or an
    empty DataFrame (after printing a notice) when no contract returned data.
    """
    contract_symbols = qb.FutureChainProvider.GetFutureContractList(emini.Symbol, start)

    frames = []
    for contract in contract_symbols:
        bars = qb.History(TradeBar, contract, start, end, Resolution.Minute)
        if bars.empty:
            continue
        frames.append(bars.loc[:, ['open', 'close', 'volume']])

    if frames:
        return pd.concat(frames)

    print(f"No data returned for period {start} to {end}")
    return pd.DataFrame()

def basic_quality_check(df):
    """Print a short data-quality report for a frame of bars.

    Three things are counted: null cells per column, rows whose 'open' or
    'close' is zero or negative, and rows whose 'volume' is negative. If any
    count is positive the counts are printed; otherwise a single "all clear"
    line is printed. An empty frame only produces a notice. Returns None.
    """
    if df.empty:
        print("No data to check.")
        return

    null_counts = df.isnull().sum()
    bad_price_rows = (df['open'] <= 0) | (df['close'] <= 0)
    bad_price_count = bad_price_rows.sum()
    bad_volume_count = (df['volume'] < 0).sum()

    problem_counts = (null_counts.sum(), bad_price_count, bad_volume_count)
    if not any(count > 0 for count in problem_counts):
        print("No data quality issues found.")
        return

    print("Data quality issues found:")
    print(f"Missing values: {null_counts}")
    print(f"Invalid prices: {bad_price_count}")
    print(f"Invalid volumes: {bad_volume_count}")

# Collect the per-chunk frames in a list and concatenate once at the end.
# Re-concatenating the growing frame on every iteration copies all previously
# downloaded rows each time and also feeds empty frames into pd.concat.
chunk_frames = []

# Calculate number of chunks
chunk_delta = timedelta(days=180)  # 6 months
num_chunks = int((end_date - start_date) / chunk_delta) + 1

# Download data in chunks
for i in tqdm(range(num_chunks), desc="Downloading data"):
    chunk_start = start_date + i * chunk_delta
    # The last chunk is clipped so it never runs past end_date.
    chunk_end = min(chunk_start + chunk_delta, end_date)

    print(f"Downloading chunk {i+1}/{num_chunks}: {chunk_start} to {chunk_end}")

    chunk_data = download_chunk(chunk_start, chunk_end)
    if not chunk_data.empty:
        chunk_frames.append(chunk_data)

    print(f"Chunk {i+1} downloaded. Performing quality check:")
    basic_quality_check(chunk_data)

# Final processing: one concat, then order rows by the index.
all_data = pd.concat(chunk_frames).sort_index() if chunk_frames else pd.DataFrame()

print("Data acquisition complete.")
print(f"Data shape: {all_data.shape}")
if all_data.empty:
    # index[0] / index[-1] would raise IndexError on an empty frame.
    print("No data was downloaded; skipping the summary.")
else:
    print(f"Date range: {all_data.index[0]} to {all_data.index[-1]}")
    print(f"Column names: {all_data.columns.tolist()}")
    print(f"Index names: {all_data.index.names}")
    print("\nFirst few rows:")
    print(all_data.head())
    print("\nLast few rows:")
    print(all_data.tail())

# Analyze trading hours
def analyze_trading_hours(df):
    """Count how many bars fall on each (hour, minute) of the day.

    Parameters
    ----------
    df : pandas.DataFrame
        Bars indexed by a MultiIndex that has a datetime level named 'time'.
        The frame is only read; no columns are added to it.

    Returns
    -------
    pandas.DataFrame
        Columns 'hour', 'minute' and 'count', sorted by hour then minute and
        restricted to slots that contain at least one bar. An empty DataFrame
        is returned when ``df`` is empty.

    The earliest and latest populated slots are also printed as the apparent
    start and end of trading.
    """
    if df.empty:
        print("No data to analyze trading hours.")
        return pd.DataFrame()

    # Work on a throwaway frame built from the index instead of writing
    # 'hour'/'minute' columns into the caller's DataFrame.
    times = df.index.get_level_values('time')
    slots = pd.DataFrame({'hour': times.hour, 'minute': times.minute})

    trading_hours = slots.groupby(['hour', 'minute']).size().reset_index(name='count')
    trading_hours = trading_hours[trading_hours['count'] > 0]

    if not trading_hours.empty:
        start_time = trading_hours.iloc[0]
        end_time = trading_hours.iloc[-1]
        print(f"Trading seems to start at {start_time['hour']:02d}:{start_time['minute']:02d}")
        print(f"Trading seems to end at {end_time['hour']:02d}:{end_time['minute']:02d}")
    else:
        print("Unable to determine trading hours.")

    return trading_hours

# Tabulate bar counts per (hour, minute) over the whole download and show them.
trading_hours = analyze_trading_hours(all_data)
print(trading_hours)

# Print unique symbols: the distinct values of the 'symbol' index level,
# i.e. every contract that contributed rows to all_data.
unique_symbols = all_data.index.get_level_values('symbol').unique()
print("\nUnique symbols:")
print(unique_symbols)

In [21]:
# Quick structural inspection of the downloaded frame: index level names,
# column labels, the first rows, and the distinct values of the 'symbol' level.
print(all_data.index.names)
print(all_data.columns)
print(all_data.head())
print("\nUnique symbols:")
print(all_data.index.get_level_values('symbol').unique())

In [22]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

def rollGaps(df):
    """Return the cumulative roll gaps between consecutive contracts.

    The frame is sorted by its index and the contracts are taken in the order
    their symbols first appear. For every adjacent pair of contracts the roll
    timestamp is the last timestamp present in both; the gap is the later
    contract's 'open' minus the earlier contract's 'close' at that timestamp.
    Pairs that share no timestamp contribute nothing.

    Returns a Series of the running sum of those gaps, indexed by roll
    timestamp (one entry per pair that overlapped).
    """
    ordered = df.sort_index()
    symbols = ordered.index.get_level_values('symbol').unique()

    roll_times = []
    roll_gaps = []

    for front, back in zip(symbols[:-1], symbols[1:]):
        front_rows = ordered.xs(front, level='symbol', drop_level=False)
        back_rows = ordered.xs(back, level='symbol', drop_level=False)

        front_times = front_rows.index.get_level_values('time')
        back_times = back_rows.index.get_level_values('time')
        shared_times = front_times.intersection(back_times)
        if len(shared_times) == 0:
            continue

        # Roll on the last timestamp both contracts have a bar for.
        roll_time = shared_times[-1]
        front_close = front_rows.loc[front_times == roll_time, 'close'].iloc[-1]
        back_open = back_rows.loc[back_times == roll_time, 'open'].iloc[0]

        roll_times.append(roll_time)
        roll_gaps.append(back_open - front_close)

    return pd.Series(np.cumsum(roll_gaps), index=roll_times)

def create_continuous_contract(df):
    """Stitch per-contract bars into one gap-adjusted continuous series.

    Parameters
    ----------
    df : pandas.DataFrame
        Bars with 'open', 'close' and 'volume' columns, indexed by a
        MultiIndex with levels 'expiry', 'symbol' and 'time'.

    Returns
    -------
    pandas.DataFrame
        One row per timestamp, indexed by 'date', with columns 'open',
        'close' and 'volume'. Prices have the cumulative roll gap in force at
        that timestamp subtracted; volume is left untouched. An empty frame
        with those columns is returned when there is nothing to stitch.

    The series starts on the first contract (in sorted index order) and moves
    to the next contract at each roll timestamp reported by ``rollGaps``.
    Timestamps for which the active contract has no bar are reported on
    stdout and skipped.
    """
    empty_result = pd.DataFrame(
        columns=['open', 'close', 'volume'], index=pd.Index([], name='date')
    )
    if df.empty:
        return empty_result

    df = df.sort_index()
    gaps = rollGaps(df)

    # rollGaps returns *cumulative* gaps, so each entry already is the total
    # adjustment in force after that roll; it must replace the running
    # adjustment, not be added to it. Several contract pairs can share a roll
    # timestamp, so also record how many rolls happen at each one.
    rolls = {}
    for roll_date, cumulative_gap in gaps.items():
        rolls_so_far = rolls[roll_date][0] if roll_date in rolls else 0
        rolls[roll_date] = (rolls_so_far + 1, cumulative_gap)

    contracts = df.index.get_level_values('symbol').unique()
    position = 0  # index into `contracts` of the currently active contract
    adjustment = 0
    continuous_data = []

    # unique() keeps first-appearance order, which follows the (expiry,
    # symbol, time) sort and is not guaranteed to be chronological across
    # contracts; the roll logic needs strictly increasing time.
    times = df.index.get_level_values('time').unique().sort_values()

    for date in tqdm(times, desc="Creating continuous contract"):
        if date in rolls:
            roll_count, adjustment = rolls[date]
            position += roll_count
        current_contract = contracts[position]

        try:
            day_data = df.xs(
                (slice(None), current_contract, date),
                level=('expiry', 'symbol', 'time'),
            ).iloc[0]
        except IndexError:
            # The active contract has no bar at this timestamp.
            print(f"No data found for date: {date}, contract: {current_contract}")
            continue

        continuous_data.append({
            'date': date,
            'open': day_data['open'] - adjustment,
            'close': day_data['close'] - adjustment,
            'volume': day_data['volume']
        })

    if not continuous_data:
        # set_index('date') would raise KeyError on a column-less frame.
        return empty_result

    return pd.DataFrame(continuous_data).set_index('date')

# Create continuous contract from every downloaded bar in all_data.
print("Creating continuous contract...")
continuous_contract = create_continuous_contract(all_data)

# Calculate returns: fractional change of 'close' from the previous row
# (the first row is NaN).
# NOTE(review): 'close' here has roll gaps subtracted, so these are changes of
# the adjusted level rather than of the traded price; confirm that is intended.
continuous_contract['returns'] = continuous_contract['close'].pct_change()

# Display results
print("\nContinuous contract - first few rows:")
print(continuous_contract.head())
print("\nContinuous contract - last few rows:")
print(continuous_contract.tail())


In [24]:

# Summary of the stitched series: its shape and the first/last index values.
# index[0] / index[-1] assume continuous_contract has at least one row.
print("\nContinuous contract creation complete.")
print(f"Continuous contract shape: {continuous_contract.shape}")
print(f"Date range: {continuous_contract.index[0]} to {continuous_contract.index[-1]}")


In [27]:
import matplotlib.pyplot as plt
import pandas as pd

# Convert index to datetime if it's not already, so the x-axis is a time axis.
continuous_contract.index = pd.to_datetime(continuous_contract.index)

# Sort the dataframe by date so the line is drawn left to right in time order.
continuous_contract = continuous_contract.sort_index()

# Convert 'close' to numeric, coercing errors to NaN
# (values that cannot be parsed become NaN instead of raising).
continuous_contract['close'] = pd.to_numeric(continuous_contract['close'], errors='coerce')

# Drop rows whose 'close' is NaN; other columns are not considered.
continuous_contract = continuous_contract.dropna(subset=['close'])

# Single line chart of the adjusted close over the whole index.
plt.figure(figsize=(15, 10))
plt.plot(continuous_contract.index, continuous_contract['close'])
plt.title('E-mini S&P 500 Continuous Contract')
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()


In [28]:
# Convert the 'time' index level to datetime if it's not already.
# set_levels() replaces the level's *unique* values (the ones the index codes
# point into), so it must be fed index.levels[...], not the per-row values
# from get_level_values(), which repeat and have the wrong length.
time_level_position = all_data.index.names.index('time')
all_data.index = all_data.index.set_levels(
    pd.to_datetime(all_data.index.levels[time_level_position]), level='time'
)

# Get the start date and end date for the first three days
row_times = all_data.index.get_level_values('time')
start_date = row_times.min()
end_date = start_date + pd.Timedelta(days=3)

# Filter the data for the first three days. Index objects have no .between()
# method (that is a Series method), so build the inclusive mask explicitly.
first_three_days = all_data.loc[(row_times >= start_date) & (row_times <= end_date)]

# Per-group summary applied to each (symbol, calendar date) group below.
def check_price_changes(group):
    """Summarise one group of bars.

    Returns a Series with:
    - 'open_changed': True when 'open' takes more than one distinct value,
    - 'close_changed': True when 'close' takes more than one distinct value,
    - 'volume_sum': the total of 'volume' over the group.
    """
    summary = {}
    for column in ('open', 'close'):
        summary[f'{column}_changed'] = group[column].nunique() > 1
    summary['volume_sum'] = group['volume'].sum()
    return pd.Series(summary)

# One summary row per (contract symbol, calendar date): the 'symbol' index
# level is combined with the date part of each row's 'time' value as group keys.
price_change_summary = first_three_days.groupby(['symbol', first_three_days.index.get_level_values('time').date]).apply(check_price_changes)

print("Summary of price changes and volume for each contract and day:")
print(price_change_summary)

print("\nDetailed view of the first 50 rows:")
print(first_three_days.head(50))

# For every contract in the window, list the distinct open and close prices;
# a single repeated value would indicate the contract's price never moved.
print("\nUnique values in 'open' and 'close' columns for each contract:")
for symbol in first_three_days.index.get_level_values('symbol').unique():
    contract_data = first_three_days.xs(symbol, level='symbol')
    print(f"\nContract: {symbol}")
    print(f"Unique 'open' values: {contract_data['open'].unique()}")
    print(f"Unique 'close' values: {contract_data['close'].unique()}")

# Row count of the window, and how many of those rows have volume above zero.
print("\nTotal number of rows:", len(first_three_days))
print("Number of rows with non-zero volume:", (first_three_days['volume'] > 0).sum())