## Load and Analyze CSV

In [None]:
import pandas as pd
import numpy as np

# Location of the feature metadata file
features_file = "/path/to/features.csv"

# Read the feature metadata into a DataFrame
features_df = pd.read_csv(features_file)

# Every column whose name contains "tag" is treated as a tag indicator
tags = [name for name in features_df.columns if "tag" in name]

# For each tag column, collect the "feature" values of the rows where that
# column compares equal to True
features_by_tag = {}
for tag in tags:
    is_tagged = features_df[tag].eq(True)
    features_by_tag[tag] = features_df.loc[is_tagged, "feature"].tolist()

# Report how many features carry each tag
for tag, features in features_by_tag.items():
    print(f"{tag}: {len(features)} features")

# Persist the grouping for further exploration. Wrapping each list in a Series
# lets lists of different lengths share one frame (shorter columns are
# NaN-padded).
features_by_tag_df = pd.DataFrame(
    {tag_name: pd.Series(tagged) for tag_name, tagged in features_by_tag.items()}
)
features_by_tag_df.to_csv("/path/to/features_by_tag.csv", index=False)

## Load and Sample Data from Partitions

In [None]:
import os

# Path to the partitioned training data directory
data_dir = "/path/to/train.parquet"
partitions = [f"partition_id={i}" for i in range(10)]

# Upper bound on the rows drawn from each partition; adjust based on memory
rows_per_partition = 500

# Collect the per-partition samples in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies everything gathered so far on
# every iteration.
partition_samples = []

# Randomly sample rows from each partition
for partition in partitions:
    partition_path = os.path.join(data_dir, partition)
    print(f"Processing {partition}...")
    df = pd.read_parquet(partition_path, engine="pyarrow")
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap n at the partition size.
    n_rows = min(rows_per_partition, len(df))
    # NOTE(review): the same random_state is used for every partition, which
    # keeps the run reproducible but draws the same row positions from
    # equally sized partitions - confirm that is acceptable.
    sampled_partition = df.sample(n=n_rows, random_state=42)
    partition_samples.append(sampled_partition)

# Single concatenation; fall back to an empty frame if nothing was collected
if partition_samples:
    sampled_data = pd.concat(partition_samples, ignore_index=True)
else:
    sampled_data = pd.DataFrame()

print(f"Sampled data shape: {sampled_data.shape}")

## Analyze and Filter Features

In [None]:
# Define target variable
target_col = "responder_6"

# Number of features to keep from each end of the correlation ranking
top_n = 10

# Drop columns with >50% missing values
nan_threshold = 0.5
valid_features = sampled_data.columns[sampled_data.isna().mean() < nan_threshold]
filtered_data = sampled_data[valid_features]

# Correlation is only defined for numeric data; restricting to numeric/bool
# columns avoids the error recent pandas versions raise on non-numeric columns.
numeric_data = filtered_data.select_dtypes(include=["number", "bool"])
if target_col not in numeric_data.columns:
    raise KeyError(
        f"Target column {target_col!r} is missing, non-numeric, or was dropped "
        f"by the {nan_threshold:.0%} missing-value filter"
    )

# Calculate correlations with the target.
# dropna() removes undefined correlations (e.g. constant columns); otherwise
# sort_values places NaN last and tail() would report those columns as the
# "most negatively correlated" ones.
# NOTE(review): every remaining numeric column is ranked here, including any
# other target-like columns present in sampled_data - confirm that is intended.
correlations = (
    numeric_data.corr()[target_col]
    .drop(target_col)
    .dropna()
    .sort_values(ascending=False)
)

# Identify top positively and negatively correlated features
top_positive_features = correlations.head(top_n).index.tolist()
# With fewer than 2 * top_n candidates, head and tail overlap; skip features
# already selected so no column appears twice in the final frame.
top_negative_features = [
    feature
    for feature in correlations.tail(top_n).index
    if feature not in top_positive_features
]

print("Top positively correlated features:")
print(correlations.head(top_n))

print("\nTop negatively correlated features:")
print(correlations.tail(top_n))

# Combine top features for final selection
final_features = top_positive_features + top_negative_features + [target_col]

# Keep only the selected features and drop rows with any NaN values.
# Assigning the result (instead of dropna(inplace=True) on a frame derived by
# selection) avoids pandas' SettingWithCopyWarning.
filtered_data = filtered_data[final_features].dropna()

print(f"Filtered data shape: {filtered_data.shape}")

## 