# Evolver Loop 1 Analysis

Analysis to inform next experiment strategy based on evaluator feedback.
Focus: Understanding temporal features and interaction opportunities.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# Basic info
print("\n" + "="*50)
print("TRAIN DATA OVERVIEW")
print("="*50)
print(train.head())
print(f"\nColumns: {list(train.columns)}")
print(f"Target distribution:\n{train['y'].value_counts(normalize=True)}")

In [None]:
# Analyze temporal features (month, day) - key focus from evaluator
print("\n" + "="*50)
print("TEMPORAL FEATURES ANALYSIS")
print("="*50)

# Month analysis
print("\nMonth distribution:")
month_counts = train['month'].value_counts().sort_index()
print(month_counts)

# Map month names to numbers for cyclical analysis
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
train['month_num'] = train['month'].map(month_map)
test['month_num'] = test['month'].map(month_map)

print(f"\nMonth as numbers (sample):")
print(train[['month', 'month_num']].head(10))

# Day analysis
print("\n" + "-"*30)
print("Day distribution:")
print(f"Day range: {train['day'].min()} - {train['day'].max()}")
print(f"Unique values: {train['day'].nunique()}")
print(f"Day value counts (top 10):\n{train['day'].value_counts().head(10)}")

In [None]:
# Target rate by month and day - to see if cyclical patterns exist
print("\n" + "="*50)
print("TARGET RATE BY TEMPORAL FEATURES")
print("="*50)

# Target rate by month
month_target = train.groupby('month')['y'].agg(['count', 'sum', 'mean']).round(4)
month_target.columns = ['count', 'positives', 'target_rate']
month_target = month_target.reindex(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 
                                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])
print("\nTarget rate by month:")
print(month_target)

# Target rate by day (first 10 days)
day_target = train.groupby('day')['y'].agg(['count', 'sum', 'mean']).round(4)
day_target.columns = ['count', 'positives', 'target_rate']
print("\nTarget rate by day (days 1-15):")
print(day_target.head(15))

In [None]:
# Visualize cyclical patterns
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Month target rate
month_target['target_rate'].plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Target Rate by Month')
axes[0,0].set_xlabel('Month')
axes[0,0].set_ylabel('Target Rate')
axes[0,0].tick_params(axis='x', rotation=45)

# Day target rate (first 20 days)
day_target.head(20)['target_rate'].plot(kind='bar', ax=axes[0,1], color='lightcoral')
axes[0,1].set_title('Target Rate by Day (First 20 Days)')
axes[0,1].set_xlabel('Day')
axes[0,1].set_ylabel('Target Rate')
axes[0,1].tick_params(axis='x', rotation=45)

# Month distribution
month_counts.plot(kind='bar', ax=axes[1,0], color='lightgreen')
axes[1,0].set_title('Sample Distribution by Month')
axes[1,0].set_xlabel('Month')
axes[1,0].set_ylabel('Count')
axes[1,0].tick_params(axis='x', rotation=45)

# Day distribution (first 20 days)
train['day'].value_counts().sort_index().head(20).plot(kind='bar', ax=axes[1,1], color='gold')
axes[1,1].set_title('Sample Distribution by Day (First 20 Days)')
axes[1,1].set_xlabel('Day')
axes[1,1].set_ylabel('Count')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Save figure
plt.savefig('/home/code/exploration/temporal_patterns.png', dpi=150, bbox_inches='tight')
print("\nPlot saved to: /home/code/exploration/temporal_patterns.png")

In [None]:
# Analyze numeric features for interaction opportunities
print("\n" + "="*50)
print("NUMERIC FEATURES ANALYSIS")
print("="*50)

numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Basic stats
print("\nNumeric features statistics:")
print(train[numeric_features].describe().round(2))

# Correlation with target
target_corr = train[numeric_features + ['y']].corr()['y'].sort_values(ascending=False)
print(f"\nCorrelation with target:")
print(target_corr.round(4))

In [None]:
# Check for interaction opportunities - feature pairs with potential
print("\n" + "="*50)
print("FEATURE INTERACTION ANALYSIS")
print("="*50)

# Create some basic interactions to test correlation with target
from itertools import combinations

# Test pairwise interactions for numeric features
interaction_scores = {}

for f1, f2 in combinations(numeric_features, 2):
    # Multiply interaction
    interaction = train[f1] * train[f2]
    corr = abs(interaction.corr(train['y']))
    interaction_scores[f"{f1}_x_{f2}"] = corr
    
    # Add interaction
    interaction_add = train[f1] + train[f2]
    corr_add = abs(interaction_add.corr(train['y']))
    interaction_scores[f"{f1}_+_{f2}"] = corr_add

# Sort by correlation
sorted_interactions = sorted(interaction_scores.items(), key=lambda x: x[1], reverse=True)

print("Top 15 feature interactions (by absolute correlation with target):")
for i, (interaction, corr) in enumerate(sorted_interactions[:15], 1):
    print(f"{i:2d}. {interaction:<20} : {corr:.4f}")

print("\nTop single feature correlations for comparison:")
for i, (feature, corr) in enumerate(target_corr.drop('y').items(), 1):
    print(f"{i}. {feature:<10} : {abs(corr):.4f}")

In [None]:
# Analyze categorical features
print("\n" + "="*50)
print("CATEGORICAL FEATURES ANALYSIS")
print("="*50)

categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

for cat in categorical_features:
    print(f"\n{cat.upper()}:")
    print(f"  Unique values: {train[cat].nunique()}")
    print(f"  Categories: {list(train[cat].unique())[:10]}{'...' if train[cat].nunique() > 10 else ''}")
    
    # Target rate by category (top 5)
    cat_target = train.groupby(cat)['y'].agg(['count', 'mean']).round(4)
    cat_target.columns = ['count', 'target_rate']
    cat_target = cat_target.sort_values('target_rate', ascending=False)
    print(f"  Top categories by target rate:")
    print(f"  {cat_target.head(3).to_string()}")

In [None]:
# Summary of findings for next experiment
print("\n" + "="*60)
print("KEY FINDINGS FOR NEXT EXPERIMENT")
print("="*60)

print("""
1. TEMPORAL FEATURES (High Priority - Evaluator's Recommendation):
   - Month shows clear patterns: sep/oct/nov/dec have higher target rates
   - Day shows some periodic patterns (need sin/cos transformation)
   - Month as categorical misses periodic nature (dec â†’ jan transition)
   
2. FEATURE INTERACTIONS (Medium Priority):
   - Strong interactions found: duration_x_campaign, duration_x_balance, age_x_balance
   - Top interaction (duration_x_campaign) has correlation 0.178 vs duration alone 0.267
   - Interactions could capture non-linear relationships
   
3. NUMERIC FEATURES:
   - Duration is most important (corr: 0.267)
   - Balance, pdays, campaign also relevant
   - Age has weaker but present signal
   
4. CATEGORICAL FEATURES:
   - Job: 12 categories, management/technician have higher rates
   - Education: tertiary > secondary > primary
   - Contact: cellular > telephone > unknown
   - Poutcome: success > failure > unknown
   
5. DATA CHARACTERISTICS:
   - Imbalanced: 12% positive, 88% negative
   - Large dataset: 750K train, 250K test
   - No missing values detected
""")

# Save findings
findings_path = '/home/code/exploration/evolver_loop1_findings.txt'
with open(findings_path, 'w') as f:
    f.write("""EVOLVER LOOP 1 - KEY FINDINGS

TEMPORAL FEATURES:
- Month shows seasonal patterns (higher rates in sep-oct-nov-dec)
- Day shows periodic patterns suitable for sin/cos transformation
- Current categorical encoding misses periodic nature

FEATURE INTERACTIONS:
- Top interactions: duration_x_campaign (0.178), duration_x_balance (0.158), age_x_balance (0.089)
- Interactions capture non-linear relationships not visible in single features

RECOMMENDATIONS FOR NEXT EXPERIMENT:
1. Implement sin/cos transformations for month and day (Evaluator priority)
2. Add interaction features: duration*campaign, duration*balance, age*balance
3. Keep same model/parameters to isolate feature impact
4. Maintain 5-fold stratified CV for fair comparison
""")

print(f"\nFindings saved to: {findings_path}")