In [3]:
import pandas as pd
import numpy as np

In [27]:
import pandas as pd

# Pipeline locations, listed in the order the data flows through them.

# Phase-1 output: the cleaned November event log that this notebook reads.
input_path = r"D:\Sparkathon\Data\phase1_Nov_cleaned_ecommerce_data.csv"

# Scratch CSV that processed chunks are appended to, so the full dataset
# never has to be held in memory at once.
temp_chunk_csv = r"D:\Sparkathon\Data\temp_chunked_nov_data.csv"

# Phase-2 output: one row of engineered features per session.
output_path = r"D:\Sparkathon\Data\phase2_Nov_session_features.csv"


In [29]:
# Truncate (or create) the temp file up front so that the appends below never
# mix with rows left over from a previous run.
with open(temp_chunk_csv, 'w'):
    pass

# Stream the cleaned event log in 200,000-row pieces; malformed lines are skipped.
chunks = pd.read_csv(
    input_path,
    chunksize=200_000,
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
)

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}...")

    # Low-cardinality text columns are held as categoricals while in memory.
    event_type = chunk['event_type'].astype('category')

    # Existing columns are replaced in place; the three 0/1 interaction flags
    # are appended after them in the order given here.
    chunk = chunk.assign(
        event_type=event_type,
        brand=chunk['brand'].astype('category'),
        main_category=chunk['main_category'].astype('category'),
        is_view=(event_type == 'view').astype('int8'),
        is_cart=(event_type == 'cart').astype('int8'),
        is_remove=(event_type == 'remove_from_cart').astype('int8'),
    )

    # Append to the temp CSV; only the very first chunk writes the header row.
    chunk.to_csv(temp_chunk_csv, mode='a', index=False, header=(i == 0))


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Processing chunk 43...
Processing chunk 44...

In [33]:
import pandas as pd
from tqdm import tqdm

# Build one row of features per user_session from the chunked event log.
#
# A session's events can be split across two or more 200,000-row chunks, so
# per-chunk results are only *partial*. Each chunk is therefore reduced to
# statistics that can be merged exactly (sums, counts, min, max, first), and
# those partials are re-aggregated once after the loop. Distinct counts and the
# purchase label are tracked across chunks for the same reason.

input_file = r"D:\Sparkathon\Data\temp_chunked_nov_data.csv"
output_path = r"D:\Sparkathon\Data\phase2_Nov_session_features.csv"

# Source column -> name of the per-session distinct-count feature it produces.
DISTINCT_COUNT_FEATURES = {
    'product_id': 'num_unique_products',
    'category_id': 'num_unique_categories',
    'brand': 'num_brands',
}

# Column order of the saved CSV.
OUTPUT_COLUMNS = [
    'user_session', 'user_id', 'session_start', 'session_end',
    'num_events', 'num_views', 'num_carts', 'num_remove_from_cart',
    'num_unique_products', 'num_unique_categories',
    'avg_price', 'max_price', 'min_price', 'num_brands',
    'session_duration', 'hour_of_day', 'purchase_label',
]


def _summarise_chunk(chunk):
    """Reduce one chunk of events to mergeable per-session partial aggregates.

    The mean price is carried as ``price_sum`` / ``price_count`` because a mean
    of per-chunk means would be wrong for sessions that span several chunks.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Event rows with at least ``user_session``, ``user_id``, ``event_time``,
        ``event_type`` and ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_session`` present in the chunk.
    """
    event_type = chunk['event_type']
    flagged = chunk.assign(
        is_view=(event_type == 'view').astype('int8'),
        is_cart=(event_type == 'cart').astype('int8'),
        is_remove=(event_type == 'remove_from_cart').astype('int8'),
    )
    return flagged.groupby('user_session').agg(
        user_id=('user_id', 'first'),
        session_start=('event_time', 'min'),
        session_end=('event_time', 'max'),
        num_events=('event_type', 'count'),
        num_views=('is_view', 'sum'),
        num_carts=('is_cart', 'sum'),
        num_remove_from_cart=('is_remove', 'sum'),
        price_sum=('price', 'sum'),
        price_count=('price', 'count'),
        max_price=('price', 'max'),
        min_price=('price', 'min'),
    ).reset_index()


def _combine_partials(partials):
    """Merge per-chunk partial aggregates into exactly one row per session.

    Parameters
    ----------
    partials : list of pandas.DataFrame
        Frames produced by ``_summarise_chunk``, in chunk order.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_session`` with ``avg_price`` recomputed from the
        accumulated price sum and count (NaN when a session has no prices).
    """
    stacked = pd.concat(partials, ignore_index=True)
    sessions = stacked.groupby('user_session', sort=False).agg(
        user_id=('user_id', 'first'),
        session_start=('session_start', 'min'),
        session_end=('session_end', 'max'),
        num_events=('num_events', 'sum'),
        num_views=('num_views', 'sum'),
        num_carts=('num_carts', 'sum'),
        num_remove_from_cart=('num_remove_from_cart', 'sum'),
        price_sum=('price_sum', 'sum'),
        price_count=('price_count', 'sum'),
        max_price=('max_price', 'max'),
        min_price=('min_price', 'min'),
    ).reset_index()
    sessions['avg_price'] = sessions['price_sum'] / sessions['price_count']
    return sessions.drop(columns=['price_sum', 'price_count'])


def _count_distinct(pair_frames):
    """Count distinct non-null values per session from (session, value) pairs.

    Parameters
    ----------
    pair_frames : list of pandas.DataFrame
        Two-column frames (``user_session`` plus one value column), each already
        free of nulls and de-duplicated within its own chunk.

    Returns
    -------
    pandas.Series
        Distinct-value count indexed by ``user_session``.
    """
    # De-duplicate again globally: the same pair can occur in several chunks.
    pairs = pd.concat(pair_frames, ignore_index=True).drop_duplicates()
    return pairs.groupby('user_session', sort=False).size()


# If event_time is not parsed yet, it is parsed while reading each chunk.
chunks = pd.read_csv(input_file, chunksize=200_000, engine='python', parse_dates=['event_time'])

session_features_list = []
distinct_pairs = {column: [] for column in DISTINCT_COUNT_FEATURES}
purchase_sessions = set()

for chunk in tqdm(chunks, desc="Processing chunks"):
    session_features_list.append(_summarise_chunk(chunk))

    # Keep only the distinct (session, value) pairs; this costs extra memory but
    # is what makes the cross-chunk distinct counts exact.
    for column, frames in distinct_pairs.items():
        frames.append(chunk[['user_session', column]].dropna().drop_duplicates())

    # A session is positive if a purchase appears in ANY chunk, so remember the
    # purchasing sessions instead of overwriting a label chunk by chunk.
    purchased = chunk.loc[chunk['event_type'] == 'purchase', 'user_session']
    purchase_sessions.update(purchased.dropna().unique())

# Combine all chunked session features
print("Concatenating all session features...")
session_features = _combine_partials(session_features_list)

for column, feature_name in DISTINCT_COUNT_FEATURES.items():
    distinct_counts = _count_distinct(distinct_pairs[column])
    # Sessions whose values were all null have no pairs at all -> count 0.
    session_features[feature_name] = (
        session_features['user_session'].map(distinct_counts).fillna(0).astype('int64')
    )

# Feature engineering for time
session_features['session_start'] = pd.to_datetime(session_features['session_start'])
session_features['session_end'] = pd.to_datetime(session_features['session_end'])
session_features['session_duration'] = (
    session_features['session_end'] - session_features['session_start']
).dt.total_seconds()
session_features['hour_of_day'] = session_features['session_start'].dt.hour
session_features = session_features[OUTPUT_COLUMNS[:-1]]

# Attach the purchase label (1 if the session contains a purchase event)
print("Merging with labels...")
final_df = session_features.assign(
    purchase_label=session_features['user_session'].isin(purchase_sessions).astype('int64')
)[OUTPUT_COLUMNS]

# Save final output
final_df.to_csv(output_path, index=False)
print(f"Final session-level features for Nov saved to: {output_path}")


Processing chunks: 338it [32:37,  5.79s/it]


Concatenating all session features...
Merging with labels...
Final session-level features for Nov saved to: D:\Sparkathon\Data\phase2_Nov_session_features.csv
