In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import duckdb
%matplotlib inline
import plotly.express as px 

# Optional (kept disabled): switch to the project folder before running the notebook.
# os.chdir(r"C:\Users\siddu\Desktop\Decision Science Track\Revision")
# Show the current working directory; the relative output paths used later in this
# notebook (Excel report, parquet files) are resolved against it.
print(os.getcwd())

In [None]:
# Input file locations.
# The folder can be overridden through the REVISION_DATA_DIR environment variable so the
# notebook can run on another machine; when the variable is not set, the original local
# folder is used, so existing behavior is unchanged.
data_dir = os.environ.get(
    "REVISION_DATA_DIR",
    r"C:\Users\siddu\Desktop\Decision Science Track\Revision",
)
train_data_fp = os.path.join(data_dir, "train_data_cols_sorted.parquet")
test_data_fp = os.path.join(data_dir, "test_data_cols_sorted.parquet")

# Load files into dataframes
train_data_df = pd.read_parquet(train_data_fp)
test_data_df = pd.read_parquet(test_data_fp)

In [None]:
# Report (rows, columns) of the train frame, then of the test frame, one per line.
print(train_data_df.shape, test_data_df.shape, sep="\n")

# Check for Missing Values

In [None]:
# Print pandas' structural summary of the training frame (index, dtypes, memory usage).
train_data_df.info()

In [None]:
import pandas as pd

# Identify the columns of the training frame in which every value is missing.
all_nan_cols = train_data_df.columns[train_data_df.isna().all()]

# Number of such columns.
num_all_nan_cols = len(all_nan_cols)

# Report: either the count followed by one column name per line, or a "none found" note.
if num_all_nan_cols == 0:
    print("No columns were found with only NaN values.")
else:
    print(f"Found {num_all_nan_cols} columns with only NaN values.")
    print("These columns are:")
    print(*all_nan_cols, sep="\n")

In [None]:
# Drop the columns that are entirely missing in the training data from both frames.
# errors='ignore' (as used by the later drop cells in this notebook) keeps the cell safe
# to re-run after the columns are already gone, and tolerates a column that exists in
# train but not in test.
train_data_df = train_data_df.drop(columns=all_nan_cols, errors='ignore')
test_data_df = test_data_df.drop(columns=all_nan_cols, errors='ignore')

In [None]:
import pandas as pd

def find_single_value_cols(df, df_name):
    """
    Report and return the columns of ``df`` holding exactly one distinct non-null value.

    Missing values are ignored when counting distinct values, so a column made of a
    single value plus NaNs qualifies, while an all-NaN column does not.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the printed header.

    Returns:
        list: Names of the matching columns, in the order they appear in ``df``.
    """
    print(f"\n--- Checking {df_name} ---")

    # Series.nunique() skips nulls by default, so "== 1" means one real value (+ any NaNs).
    constant_cols = [name for name in df.columns if df[name].nunique() == 1]

    if constant_cols:
        print(f"Found {len(constant_cols)} columns with only one unique value:")
        print(constant_cols)

        # Show the single value carried by each column, for confirmation.
        print("\nUnique values in these columns:")
        for name in constant_cols:
            sole_value = df[name].dropna().unique()[0]
            print(f"  {name:<10} | {sole_value}")
    else:
        print("No columns found with only one unique value.")

    return constant_cols

# --- Assuming train_data_df and test_data_df are loaded ---

# Find single-value columns in the training data.
# The returned list is reused by a later cell to drop these columns from both frames.
train_single_val = find_single_value_cols(train_data_df, 'train_data_df')

# The same scan on the test data is currently disabled.
#test_single_val = find_single_value_cols(test_data_df, 'test_data_df')

In [None]:
# Frequency of each distinct value of f20 in the training data.
train_data_df[['f20']].value_counts()

In [None]:
# Drop the columns that carry a single distinct value in the training data from both frames.
# errors='ignore' (as used by the later drop cells in this notebook) keeps the cell safe
# to re-run after the columns are already gone, and tolerates a column that exists in
# train but not in test.
train_data_df = train_data_df.drop(columns=train_single_val, errors='ignore')
test_data_df = test_data_df.drop(columns=train_single_val, errors='ignore')

In [None]:
# List the single-value column names found in the training data.
print(train_single_val)

In [None]:
import pandas as pd
import numpy as np

# --- Configuration ---
# train_data_df must already be loaded by the cells above.
output_excel_fp = 'high_missing_cols_report.xlsx'
# Columns whose fraction of missing values exceeds this threshold are flagged.
HIGH_MISSING_THRESHOLD = 0.95
# ---------------------

print("Analyzing 'train_data_df'...")

# 1. Calculate Missing %
# This part is deliberately NOT wrapped in try/except: later cells need
# `high_missing_cols`, so a failure here must surface immediately instead of showing up
# later as a confusing NameError.
print("Calculating missing value percentages...")
total_rows = len(train_data_df)
missing_percent = (train_data_df.isnull().sum() / total_rows)

# 2. Filter columns to find those above the threshold
high_missing_cols = missing_percent[missing_percent > HIGH_MISSING_THRESHOLD].index

if len(high_missing_cols) == 0:
    print(f"No columns found with more than {HIGH_MISSING_THRESHOLD:.0%} missing values.")
else:
    print(f"Found {len(high_missing_cols)} columns with > {HIGH_MISSING_THRESHOLD:.0%} missing values.")

    if 'y' in train_data_df.columns:
        # The report is informational only, so it stays best-effort: a failure while
        # computing correlations or writing the Excel file is reported but does not stop
        # the notebook (high_missing_cols is already defined at this point).
        try:
            # 3. Prepare 'y' for correlation
            y_numeric = train_data_df['y'].astype(float)
            report_data = []

            # 4. Calculate correlations
            print("Calculating correlations with 'y'...")
            for col in high_missing_cols:
                col_type = train_data_df[col].dtype
                correlation = np.nan

                if isinstance(col_type, pd.CategoricalDtype):
                    # Use the integer category codes; code -1 marks a missing value.
                    col_numeric = train_data_df[col].cat.codes.replace(-1, np.nan)
                elif pd.api.types.is_numeric_dtype(col_type):
                    col_numeric = train_data_df[col].astype(float)
                else:
                    # Neither categorical nor numeric: no correlation is computed.
                    col_numeric = None

                if col_numeric is not None:
                    correlation = y_numeric.corr(col_numeric)

                # 5. Add data to the report list
                report_data.append({
                    'column_name': col,
                    'missing_percentage': missing_percent[col] * 100,
                    'correlation_with_y': correlation if col_numeric is not None else 'N/A (non-numeric)'
                })

            # 6. Create and export the Excel report
            report_df = pd.DataFrame(report_data)
            report_df = report_df.sort_values(by='missing_percentage', ascending=False)

            print(f"Exporting report to {output_excel_fp}...")
            report_df.to_excel(output_excel_fp, index=False)

            print("\n--- Report Summary ---")
            print(report_df.to_string())
            print(f"\n✅ Successfully created and exported '{output_excel_fp}'.")
        except Exception as e:
            print(f"An error occurred while building the report: {e}")
    else:
        print("Error: 'y' column not found in the training data.")

In [None]:
# Display the names of the columns flagged as having more than 95% missing values.
high_missing_cols

In [None]:
# Drop the columns with more than 95% missing values (measured on train) from both frames.
# errors='ignore' (as used by the later drop cells in this notebook) keeps the cell safe
# to re-run after the columns are already gone, and tolerates a column that exists in
# train but not in test.
train_data_df = train_data_df.drop(columns=high_missing_cols, errors='ignore')
test_data_df = test_data_df.drop(columns=high_missing_cols, errors='ignore')

# Exploratory Data Analysis (EDA)

In [None]:
# Count the training rows in each class of the target column y.

train_data_df[['y']].value_counts()   # Author's observation: highly imbalanced dataset.

In [None]:
# Plot class distribution of y
# x (features) and y (target) are kept as notebook-level names, as in the original cell.
x=train_data_df.drop(["y"],axis=1)
y=train_data_df["y"]

# value_counts() orders classes by frequency, not by label. Sorting by label makes the
# fixed tick labels below line up with the right bars regardless of which class is larger.
count_class = y.value_counts().sort_index()
plt.bar(count_class.index, count_class.values)
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution')
plt.xticks(count_class.index, ['Class 0', 'Class 1'])
plt.show()

In [None]:
# Hypothesis: Check the Account Creation Indicator with y
# (per the author, f50 is the account-creation indicator). First step: distribution of f50.

train_data_df[['f50']].value_counts()

In [None]:
import pandas as pd

# Rule check: rows with f50 == 'N' are expected to have y == 0.
# Author's note: some people click on offers even if they don't have an Amex account.

print("Checking condition: IF f50 == 'N' THEN y == 0")

# 1. Keep only the rows where f50 is 'N'
f50_is_N = train_data_df[train_data_df['f50'] == 'N']

# 2. Distribution of 'y' within that subset
y_counts_when_f50_is_N = f50_is_N['y'].value_counts()

print("\nValue counts of 'y' when 'f50' is 'N':")
print(y_counts_when_f50_is_N)

# 3. Count the violating rows. The positive label may be stored as the number 1 or as
# the string '1'. The verdict is based on the COUNT, not on the mere presence of the key:
# value_counts() on a categorical column lists every category, including those with zero
# rows, which would otherwise produce a false "FAILED ... 0 rows".
count = int(sum(
    y_counts_when_f50_is_N.get(label, 0)
    for label in (1, '1')
    if label in y_counts_when_f50_is_N.index
))
if count > 0:
    print(f"\n❌ FAILED: Found {count} rows where f50 is 'N' but y is 1.")
else:
    print("\n✅ PASSED: All rows where f50 is 'N' have y as 0 (or missing).")

In [None]:
# Frequency of each distinct value of f52 in the training data.
train_data_df[['f52']].value_counts()

In [None]:
import pandas as pd

# Rule check: rows with f52 == 'N' are expected to have y == 0.
# Author's note: people click even if they are not active members.

print("Checking condition: IF f52 == 'N' THEN y == 0")

# 1. Keep only the rows where f52 is 'N'
f52_is_N = train_data_df[train_data_df['f52'] == 'N']

# 2. Distribution of 'y' within that subset
y_counts_when_f52_is_N = f52_is_N['y'].value_counts()

print("\nValue counts of 'y' when 'f52' is 'N':")
print(y_counts_when_f52_is_N)

# 3. Count the violating rows. The positive label may be stored as the number 1 or as
# the string '1'. The verdict is based on the COUNT, not on the mere presence of the key:
# value_counts() on a categorical column lists every category, including those with zero
# rows, which would otherwise produce a false "FAILED ... 0 rows".
count = int(sum(
    y_counts_when_f52_is_N.get(label, 0)
    for label in (1, '1')
    if label in y_counts_when_f52_is_N.index
))
if count > 0:
    print(f"\n❌ FAILED: Found {count} rows where f52 is 'N' but y is 1.")
else:
    print("\n✅ PASSED: All rows where f52 is 'N' have y as 0 (or missing).")

In [None]:
# NOTE: despite the original heading ("check if all non-account holders are not active"),
# this cell repeats the f50 == 'N' -> y == 0 rule check. The f50-versus-f52 consistency
# check is performed in the following cell.
# Author's note: some people click on offers even if they don't have an Amex account.

print("Checking condition: IF f50 == 'N' THEN y == 0")

# 1. Keep only the rows where f50 is 'N'
f50_is_N = train_data_df[train_data_df['f50'] == 'N']

# 2. Distribution of 'y' within that subset
y_counts_when_f50_is_N = f50_is_N['y'].value_counts()

print("\nValue counts of 'y' when 'f50' is 'N':")
print(y_counts_when_f50_is_N)

# 3. Count the violating rows. The positive label may be stored as the number 1 or as
# the string '1'. The verdict is based on the COUNT, not on the mere presence of the key:
# value_counts() on a categorical column lists every category, including those with zero
# rows, which would otherwise produce a false "FAILED ... 0 rows".
count = int(sum(
    y_counts_when_f50_is_N.get(label, 0)
    for label in (1, '1')
    if label in y_counts_when_f50_is_N.index
))
if count > 0:
    print(f"\n❌ FAILED: Found {count} rows where f50 is 'N' but y is 1.")
else:
    print("\n✅ PASSED: All rows where f50 is 'N' have y as 0 (or missing).")

In [None]:
# Consistency checks between f50 and f52 on the training data.
# Both verdicts are based on row COUNTS rather than on the presence of a key:
# value_counts() on a categorical column lists every category, including those with zero
# rows, which would otherwise produce false failures.

# --- Check 1: f52 is never 'Y' whenever f50 is 'N' ---
print("Checking condition: IF f50 == 'N' THEN f52 != 'Y'")

# 1. Filter for rows where f50 is 'N'
f50_is_N = train_data_df[train_data_df['f50'] == 'N']

# 2. Distribution of 'f52' within that subset
f52_counts_when_f50_is_N = f50_is_N['f52'].value_counts()

print("\nValue counts of 'f52' when 'f50' is 'N':")
print(f52_counts_when_f50_is_N)

# 3. Report the result (the violating value is 'Y')
count = f52_counts_when_f50_is_N.get('Y', 0)
if count > 0:
    print(f"\n❌ FAILED: Found {count} rows where f50 is 'N' but f52 is 'Y'.")
else:
    print("\n✅ PASSED: All rows where f50 is 'N' have f52 as not 'Y'.")


print("\n" + "="*50 + "\n") # Separator


# --- Check 2: f50 is 'Y' whenever f52 is 'Y' ---
print("Checking condition: IF f52 == 'Y' THEN f50 == 'Y'")

# 1. Filter for rows where f52 is 'Y'
f52_is_Y = train_data_df[train_data_df['f52'] == 'Y']

# 2. Distribution of 'f50' within that subset (missing f50 values are not counted)
f50_counts_when_f52_is_Y = f52_is_Y['f50'].value_counts()

print("\nValue counts of 'f50' when 'f52' is 'Y':")
print(f50_counts_when_f52_is_Y)

# 3. Report the result. Only values that actually occur are considered.
observed_f50 = f50_counts_when_f52_is_Y[f50_counts_when_f52_is_Y > 0]
count = observed_f50.get('N', 0)
if count > 0:
    # 'N' is the most likely violating value, so it is reported with its row count.
    print(f"\n❌ FAILED: Found {count} rows where f52 is 'Y' but f50 is 'N'.")
elif any(label != 'Y' for label in observed_f50.index):
    # Some other value than 'Y' (and than 'N') occurs.
    print(f"\n❌ FAILED: Found rows where f52 is 'Y' but f50 is not 'Y'.")
else:
    print("\n✅ PASSED: All rows where f52 is 'Y' also have f50 as 'Y'.")

In [None]:
# Plot clicks and impressions in last 1 day (f320 on the x-axis, f315 on the y-axis)

plt.figure(figsize=(18,8))
# Plot a random subset to keep the figure responsive: at most 100,000 rows (the whole
# frame if it is smaller, instead of raising), with a fixed seed so the figure is
# reproducible across runs.
temp_df = train_data_df.sample(n=min(100000, len(train_data_df)), random_state=42)
sns.scatterplot(x=temp_df.f320, y= temp_df.f315, alpha=0.8)
plt.title('Clicks vs Impression in last 1 day')
plt.show()

In [None]:
# Plot clicks and impressions in last 1 day, with the axes swapped relative to the
# previous figure (f315 on the x-axis, f320 on the y-axis)

plt.figure(figsize=(18,8))
# Plot a random subset to keep the figure responsive: at most 100,000 rows (the whole
# frame if it is smaller, instead of raising), with a fixed seed so the figure is
# reproducible across runs.
temp_df = train_data_df.sample(n=min(100000, len(train_data_df)), random_state=42)
sns.scatterplot(x=temp_df.f315, y= temp_df.f320, alpha=0.8)
plt.title('Clicks vs Impression')
plt.show()

In [None]:
# Extract day, month, day of week, from date
# NOTE(review): this inspects the dtype of id5, whereas the date features later in this
# notebook are derived from id4 — confirm which column was meant here.

train_data_df['id5'].dtype

In [None]:
# Peek at the first rows of f349 and f350.
train_data_df[['f349', 'f350']].head()

In [None]:
# Peek at the first rows of f231, f237, f252 and f269.
train_data_df[['f231','f237','f252','f269']].head()

In [None]:
# Peek at the first rows of f78 and f81 (both are dropped by a later cell).
train_data_df[['f78','f81']].head()

In [None]:
# Peek at the first rows of the full training frame.
train_data_df.head()

# Strategy: Remove anything (any feature) which is more than 30 days old

In [None]:
# Features selected for removal under the strategy stated in the heading above.
cols_to_drop = [
    'f38', 'f68', 'f69', 'f71', 'f72', 'f73', 'f74', 'f75', 'f82', 'f83',
    'f87', 'f90', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100',
    'f101', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111',
    'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f121', 'f163', 'f164',
    'f165', 'f166', 'f167', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175',
    'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f186', 'f198',
    'f361', 'f362', 'f363',
]

# Remove them from the training frame and — just as important — from the official test
# frame; names that are not present in a frame are skipped.
train_data_df, test_data_df = (
    frame.drop(columns=cols_to_drop, errors='ignore')
    for frame in (train_data_df, test_data_df)
)

# Shapes after the drop: train first, then test.
print(train_data_df.shape, test_data_df.shape, sep="\n")

In [None]:
# Columns the author associates with the "ratio of pages viewed" variable.
cols_to_drop = ['f78', 'f81', 'f85', 'f89']

# Drop from train, and also from the official test set; absent names are skipped.
train_data_df = train_data_df.drop(cols_to_drop, axis='columns', errors='ignore')
test_data_df = test_data_df.drop(cols_to_drop, axis='columns', errors='ignore')

# Shapes after the drop: train first, then test.
print(train_data_df.shape, test_data_df.shape, sep="\n")

In [None]:
# NOTE: a notebook cell only displays its last expression, so this head() result is
# computed but not shown.
train_data_df[['id4']].head()
# dtype of id4, the column the date/time features below are derived from.
train_data_df['id4'].dtype

# Feature Engineering

In [None]:
# --- Verification Step ---
# Checks whether 'id4' carries a time of day: a timestamp has a time component when it
# differs from its "normalized" (midnight) version.
# Missing timestamps are dropped before comparing, because NaT != NaT evaluates to True
# and would otherwise be reported as a time component.
print("--- Verifying 'id4' in train_data_df ---")
train_id4 = train_data_df['id4'].dropna()
has_time_components_train = (train_id4 != train_id4.dt.normalize()).any()

if has_time_components_train:
    print("✅ PASSED: 'id4' in train_data_df contains time components (hours, minutes, seconds).")
    print("\nHour distribution (Train):")
    print(train_data_df['id4'].dt.hour.value_counts().sort_index())
else:
    print("⚠️ FAILED: 'id4' in train_data_df only contains dates (all times are 00:00:00).")

print("\n--- Verifying 'id4' in test_data_df ---")
test_id4 = test_data_df['id4'].dropna()
has_time_components_test = (test_id4 != test_id4.dt.normalize()).any()

if has_time_components_test:
    print("✅ PASSED: 'id4' in test_data_df contains time components (hours, minutes, seconds).")
    print("\nHour distribution (Test):")
    # The per-hour table for the test set is currently disabled; only the heading prints.
    #print(test_data_df['id4'].dt.hour.value_counts().sort_index())
else:
    print("⚠️ FAILED: 'id4' in test_data_df only contains dates (all times are 00:00:00).")

In [None]:
# --- Feature Extraction Step ---
# Derives calendar and time-of-day columns from the 'id4' timestamp, identically for the
# train and test frames.

def _add_id4_time_features(df):
    """Add the date/time feature columns derived from ``df['id4']`` to ``df`` in place.

    Columns written: DayofMonth, Month, Year, DayofWeek, is_weekend (1 when
    DayofWeek >= 5, else 0), WeekofYear (ISO calendar week), DayName and Hour.

    Raises:
        KeyError: if ``df`` has no 'id4' column.
        AttributeError: if 'id4' is not datetime-like (no ``.dt`` accessor).
    """
    stamps = df['id4']
    df['DayofMonth'] = stamps.dt.day
    df['Month'] = stamps.dt.month
    df['Year'] = stamps.dt.year
    df['DayofWeek'] = stamps.dt.dayofweek
    df['is_weekend'] = (stamps.dt.dayofweek >= 5).astype(int)
    df['WeekofYear'] = stamps.dt.isocalendar().week
    df['DayName'] = stamps.dt.day_name()
    df['Hour'] = stamps.dt.hour


print("\n" + "="*50 + "\n")
print("--- Extracting all features from 'id4' ---")

try:
    # --- Train data ---
    print("Extracting features from train_data_df...")
    _add_id4_time_features(train_data_df)

    # --- Test data ---
    print("\nExtracting features from test_data_df...")
    _add_id4_time_features(test_data_df)

    print("\n✅ Feature extraction complete.")
    print("Example of new columns in train_data_df:")
    # Only columns that are actually created above are previewed. The original preview
    # also requested a 'Minute' column that is never created, which raised a KeyError
    # and made this cell end with an error message on every run.
    print(train_data_df[['id4', 'DayName', 'Hour', 'is_weekend']].head())

except AttributeError as e:
    print(f"\n❌ ERROR: 'id4' column is not in datetime format.")
    print(f"Please convert it first using:")
    print("train_data_df['id4'] = pd.to_datetime(train_data_df['id4'])")
    print("test_data_df['id4'] = pd.to_datetime(test_data_df['id4'])")
except KeyError as e:
    print(f"\n❌ ERROR: A column was not found. {e}")

In [None]:
# Number of training rows per hour of day (hour taken directly from id4), ordered by hour.
hour_counts = train_data_df['id4'].dt.hour.value_counts().sort_index()

# The same tally restricted to the rows where y == 1.
hour_y1_counts = (
    train_data_df.loc[train_data_df['y'] == 1, 'id4']
    .dt.hour
    .value_counts()
    .sort_index()
)

# Side-by-side table; an hour missing from either tally is shown as 0.
hour_summary = pd.DataFrame({'total_count': hour_counts, 'y1_count': hour_y1_counts})
hour_summary = hour_summary.fillna(0).astype(int)

print(hour_summary)



In [None]:
# Row count per value of the target y (reference totals for the per-hour tables).
train_data_df['y'].value_counts()

In [None]:
# Rows per hour (from the extracted 'Hour' column), ordered by hour.
hour_counts = train_data_df['Hour'].value_counts().sort_index()

# Rows per hour among the observations with y == 1.
hour_y1_counts = train_data_df.loc[train_data_df['y'] == 1, 'Hour'].value_counts().sort_index()

# One table with both tallies; hours missing from either side count as 0.
hour_summary = pd.DataFrame({
    'No. of obs in that hour': hour_counts,
    'Clicks': hour_y1_counts,
})
hour_summary = hour_summary.fillna(0).astype(int)

# Total number of y == 1 rows, the denominator of the contribution percentage.
total_y1 = hour_summary['Clicks'].sum()

# Click rate within each hour, and each hour's share of all clicks (in percent).
hour_summary['Ratio of clicks per obs'] = hour_summary['Clicks'].div(hour_summary['No. of obs in that hour'])
hour_summary['% of clicks out of total clicks'] = hour_summary['Clicks'].div(total_y1).mul(100)

print(hour_summary)

# Export the table, keeping the hour index as the first column.
hour_summary.to_excel('Clicks and hour.xlsx')



In [None]:
import numpy as np

# 1. Hour-of-day groups chosen by the author from the per-hour click analysis.
late_night_hours = [0, 1, 2, 3]
non_late_night_hours = [6, 8, 12, 17, 21, 22, 4, 7, 9, 10, 14, 16, 18, 19, 20, 5, 11, 13, 15, 23]

# 2. Mapping from an hour value to its bucket label.
def assign_hour_bucket(hour):
    """Return the bucket label for ``hour``.

    'Late_Night' when ``hour`` is in ``late_night_hours``, 'Non_Late_Night' when it is in
    ``non_late_night_hours``, and 'Other_hrs' for anything else (for example a missing
    value or an hour outside 0-23).
    """
    if hour in late_night_hours:
        return 'Late_Night'
    if hour in non_late_night_hours:
        return 'Non_Late_Night'
    # Fallback: the two lists together cover 0-23, so this is only reached for other input.
    return 'Other_hrs'

# 3. Label every row of both frames with its hour bucket.
print("Applying buckets to train and test data...")
train_data_df['Hour_Bucket'] = train_data_df['Hour'].apply(assign_hour_bucket)
test_data_df['Hour_Bucket'] = test_data_df['Hour'].apply(assign_hour_bucket)

print("Done.")

# 4. Sanity checks on the training data: bucket sizes, then the share of each y value
# within each bucket (rows sum to 1).
print("\nNew 'Hour_Bucket' value counts in training data:")
print(train_data_df['Hour_Bucket'].value_counts())

print("\nCrosstab of Hour_Bucket vs. Clicks (y):")
print(pd.crosstab(index=train_data_df['Hour_Bucket'], columns=train_data_df['y'], normalize='index'))

In [None]:
# Preview the date/time feature columns created above on the training frame.
train_data_df[['DayofMonth', 'Month', 'Year', 'DayofWeek', 'is_weekend', 'WeekofYear', 'DayName', 'Hour', 'Hour_Bucket']].head()

In [None]:
# Preview the same date/time feature columns on the test frame.
test_data_df[['DayofMonth', 'Month', 'Year', 'DayofWeek', 'is_weekend', 'WeekofYear', 'DayName', 'Hour', 'Hour_Bucket']].head()

In [None]:
# Encode the hour bucket as one explicit 0/1 indicator column.
# (An earlier variant used pd.get_dummies(..., columns=['Hour_Bucket'], drop_first=True,
# dtype=int) on both frames; it was replaced by the explicit flag below.)

# 1. 'Late_Night' is 1 when the bucket is 'Late_Night' and 0 otherwise.
train_data_df['Late_Night'] = train_data_df['Hour_Bucket'].eq('Late_Night').astype(int)
test_data_df['Late_Night'] = test_data_df['Hour_Bucket'].eq('Late_Night').astype(int)

# 2. The text bucket column is no longer needed once the flag exists.
train_data_df = train_data_df.drop(columns='Hour_Bucket')
test_data_df = test_data_df.drop(columns='Hour_Bucket')

# Quick look at the result and at the balance of the new flag.
print(train_data_df.head())
print(train_data_df['Late_Night'].value_counts())

In [None]:
# Verify that train data is not on weekend: compare, train then test, the distributions
# of week of year, weekend flag and month.
print(
    train_data_df[['WeekofYear']].value_counts(),
    test_data_df[['WeekofYear']].value_counts(),
    "",
    train_data_df[['is_weekend']].value_counts(),
    test_data_df[['is_weekend']].value_counts(),
    "",
    train_data_df[['Month']].value_counts(),
    test_data_df[['Month']].value_counts(),
    sep="\n",
)

In [None]:
# Distributions of year, day of week and day name — train first, then test, for each.
print(
    train_data_df[['Year']].value_counts(),
    test_data_df[['Year']].value_counts(),
    "",
    train_data_df[['DayofWeek']].value_counts(),
    test_data_df[['DayofWeek']].value_counts(),
    "",
    train_data_df[['DayName']].value_counts(),
    test_data_df[['DayName']].value_counts(),
    sep="\n",
)

In [None]:
# Day-name distribution once more: training frame first, then test frame.
print(
    train_data_df[['DayName']].value_counts(),
    test_data_df[['DayName']].value_counts(),
    sep="\n",
)

In [None]:
# Shapes before the drop (train, then test).
print(train_data_df.shape, test_data_df.shape, sep="\n")

# Date helper columns that are no longer needed. 'DayofWeek' is deliberately kept: it is
# used at the end of the notebook to build the train / validation split.
date_cols_to_drop = ['DayofMonth', 'Month', 'Year', 'is_weekend', 'WeekofYear','DayName']
train_data_df, test_data_df = (
    frame.drop(columns=date_cols_to_drop, errors='ignore')
    for frame in (train_data_df, test_data_df)
)

# Shapes after the drop (train, then test).
print(train_data_df.shape, test_data_df.shape, sep="\n")

In [None]:
# Remove f36 from both frames (author's note: columns which are 90+ days older).
# The name is skipped silently in a frame that does not contain it.

date_cols_to_drop = ['f36']
train_data_df = train_data_df.drop(date_cols_to_drop, axis='columns', errors='ignore')
test_data_df = test_data_df.drop(date_cols_to_drop, axis='columns', errors='ignore')
print(train_data_df.shape, test_data_df.shape, sep="\n")

In [None]:
# Peek at the training frame after the column removals above.
train_data_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line after each report.
train_data_df.info()
test_data_df.info()

In [None]:
# Persist the current state of both frames (train_data_df / test_data_df must be in memory).

# Output locations, relative to the working directory.
train_output_fp = 'train_df_datesort.parquet'
test_output_fp = 'test_df_datesort.parquet'

# index=False: the pandas index is not written to the file.
print(f"Saving training data to {train_output_fp}...")
train_data_df.to_parquet(path=train_output_fp, index=False)

print(f"Saving test data to {test_output_fp}...")
test_data_df.to_parquet(path=test_output_fp, index=False)


print("Save complete.")

# Final part: train/validation split and export

In [None]:
# Build the train / validation split from train_data_df using 'DayofWeek', then remove
# that helper column everywhere.
# NOTE: this cell consumes 'DayofWeek' from train_data_df, so it cannot be re-run on its
# own; re-run the notebook from the feature-extraction step if the split must be redone.

# 1. Create the boolean masks for the split
# 'DayofWeek' 2 (Wed) and 3 (Thu) -> training rows, 4 (Fri) -> validation rows
train_indices = train_data_df['DayofWeek'].isin([2, 3])
val_indices = train_data_df['DayofWeek'] == 4

# 2. Create new train_df and val_df (with 'y' included)
# .copy() makes them independent frames and avoids a SettingWithCopyWarning later
train_df = train_data_df.loc[train_indices].copy()
val_df = train_data_df.loc[val_indices].copy()

print(f"New Training Set (train_df) size: {len(train_df)}")
print(f"Validation Set (val_df) size: {len(val_df)}")


# 3. Define the list of date-related columns to drop
date_cols_to_drop = ['DayofWeek']

# 4. Drop these columns from all sets
print("\nDropping date-related columns from all sets...")

# errors='ignore' skips the name in a frame that does not contain it
train_df = train_df.drop(columns=date_cols_to_drop, errors='ignore')
val_df = val_df.drop(columns=date_cols_to_drop, errors='ignore')

# CRITICAL: Also drop from the official test set and from the combined train+val frame.
test_data_df = test_data_df.drop(columns=date_cols_to_drop, errors='ignore')
# Fix: the result used to be assigned to a misspelled name (`train__data_df`), so
# train_data_df silently kept 'DayofWeek' and was later saved with that extra column.
train_data_df = train_data_df.drop(columns=date_cols_to_drop, errors='ignore')

print("Date columns dropped.")

# 5. Verification (you can check the columns)
print("\n--- Verification ---")
print(f"'y' column in train_data_df: {'y' in train_data_df.columns}")
print(f"'y' column in train_df: {'y' in train_df.columns}")
print(f"'y' column in val_df: {'y' in val_df.columns}")
print(f"'DayofWeek' column in train_df: {'DayofWeek' in train_df.columns}")

In [None]:
# Peek at the first rows of the training split.
train_df.head()

In [None]:
# Peek at the first rows of the validation split.
val_df.head()

In [None]:
# Peek at the first rows of the test frame.
test_data_df.head()

In [None]:
# (rows, columns) of the training split, the validation split and the test frame.
print(train_df.shape, val_df.shape, test_data_df.shape, sep="\n")

In [None]:
# Check for extra columns: list what train / val contain that the test frame does not.

# Column indexes of the three datasets.
train_df_cols, val_df_cols, test_data_df_cols = (
    train_df.columns,
    val_df.columns,
    test_data_df.columns,
)

# Columns present in train but missing from test.
print("Different columns in train and test data:")
for col in train_df_cols.difference(test_data_df_cols):
    print(col)

# Author's observation from a previous run: only 'y' ('clicked') was not present in the
# test data.

# Columns present in val but missing from test.
print("Different columns in val and test data:")
for col in val_df_cols.difference(test_data_df_cols):
    print(col)

In [None]:
# Persist the final datasets (train_df, val_df, test_data_df and train_data_df must be
# in memory).

# Output locations, relative to the working directory.
train_output_fp = 'train_df.parquet'
val_output_fp = 'val_df.parquet'
test_output_fp = 'test_df.parquet'
train_and_val_fp = 'train_and_val_df.parquet'

# index=False: the pandas index is not written to the files.
print(f"Saving training data to {train_output_fp}...")
train_df.to_parquet(train_output_fp, index=False)

print(f"Saving val data to {val_output_fp}...")
val_df.to_parquet(val_output_fp, index=False)

print(f"Saving test data to {test_output_fp}...")
test_data_df.to_parquet(test_output_fp, index=False)

# train_data_df is the combined train + validation frame. The message previously said
# "test data", which mislabelled this file in the log.
print(f"Saving train+val data to {train_and_val_fp}...")
train_data_df.to_parquet(train_and_val_fp, index=False)

print("Save complete.")

In [None]:
# Final shapes: the combined train+val frame, then the training split.
print(train_data_df.shape, train_df.shape, sep="\n")