In [1]:
import pandas as pd

# Load the features metadata. Tag columns are identified by their 'tag_'
# name prefix, and that same list drives both the tag total and the
# per-feature counts below, so the derived False counts stay consistent.
features_df = pd.read_csv('features.csv')
feature_tag_cols = [col for col in features_df.columns if col.startswith('tag_')]
num_feature_tags = len(feature_tag_cols)
print(f"Features metadata (True/False values across {num_feature_tags} tags):")
print(features_df.head())

# Load the responders metadata and count its tag columns the same way
responders_df = pd.read_csv('responders.csv')
responder_tag_cols = [col for col in responders_df.columns if col.startswith('tag_')]
num_responder_tags = len(responder_tag_cols)
print(f"\nResponders metadata (True/False values across {num_responder_tags} tags):")
print(responders_df.head())

# Count number of True values for each feature (row-wise sum over the tag
# columns only; selecting by name rather than by position keeps this in
# step with num_feature_tags even if non-tag columns are present).
tag_counts_per_feature = features_df[feature_tag_cols].sum(axis=1)

# Summarise how many features share each True-count
print("\nDistribution of True/False values per feature:")
value_counts = tag_counts_per_feature.value_counts().sort_index()
for true_count, num_features in value_counts.items():
    print(f"- {num_features} features have {true_count} True and {num_feature_tags-true_count} False")

print(f"\nEach feature has {num_feature_tags} tags total")
print(f"Min True values in a feature: {tag_counts_per_feature.min()} (with {num_feature_tags-tag_counts_per_feature.min()} False)")
print(f"Max True values in a feature: {tag_counts_per_feature.max()} (with {num_feature_tags-tag_counts_per_feature.max()} False)")
print(f"Mean True values per feature: {tag_counts_per_feature.mean():.2f}")

# Show first few features with their True/False counts
results = pd.DataFrame({
    'feature': features_df['feature'],
    'True_count': tag_counts_per_feature,
    'False_count': num_feature_tags - tag_counts_per_feature
})
print("\nFirst 10 features and their True/False counts:")
print(results.head(10))

Features metadata (True/False values across 17 tags):
      feature  tag_0  tag_1  tag_2  tag_3  tag_4  tag_5  tag_6  tag_7  tag_8  \
0  feature_00  False  False   True  False  False  False  False  False  False   
1  feature_01  False  False   True  False  False  False  False  False  False   
2  feature_02  False  False   True  False  False  False  False  False  False   
3  feature_03  False  False   True  False  False  False  False  False  False   
4  feature_04  False  False   True  False  False  False  False  False  False   

   tag_9  tag_10  tag_11  tag_12  tag_13  tag_14  tag_15  tag_16  
0  False   False   False   False   False    True   False    True  
1  False   False   False   False    True    True   False    True  
2  False   False   False    True   False   False   False    True  
3  False   False   False   False    True   False   False    True  
4  False   False   False    True    True   False   False    True  

Responders metadata (True/False values across 5 tags):
     re

In [None]:
import pandas as pd
import numpy as np

def analyze_metadata(features_path='features.csv', responders_path='responders.csv'):
    """Load the feature/responder metadata CSVs and print a tag summary.

    Prints, in order: per-tag True counts for features, the number of
    features, per-tag True counts for responders, the number of responders,
    and the most frequent feature tag patterns.

    Args:
        features_path: Path of the features metadata CSV. Defaults to
            'features.csv' (the original hard-coded location).
        responders_path: Path of the responders metadata CSV. Defaults to
            'responders.csv' (the original hard-coded location).

    Returns:
        A ``(features_df, responders_df)`` tuple with the two loaded frames.

    Note:
        Every column after the first is treated as a tag column in both
        files; the first column is assumed to be the row identifier.
    """
    # Read metadata
    features_df = pd.read_csv(features_path)
    responders_df = pd.read_csv(responders_path)

    # Analyze feature tags: column-wise sum over everything but the first column
    feature_tag_counts = features_df.iloc[:, 1:].sum()
    print("Feature tag counts:")
    print(feature_tag_counts)
    print(f"\nTotal features: {len(features_df)}")

    # Analyze responder tags: same treatment, skipping the first column
    responder_tag_counts = responders_df.iloc[:, 1:].sum()
    print("\nResponder tag counts:")
    print(responder_tag_counts)
    print(f"\nTotal responders: {len(responders_df)}")

    # See which features share same tag patterns: each row's tag values become
    # a hashable tuple so identical patterns can be counted together.
    feature_patterns = features_df.iloc[:, 1:].apply(tuple, axis=1)
    pattern_counts = feature_patterns.value_counts()
    print("\nCommon feature tag patterns:")
    print(pattern_counts.head())

    return features_df, responders_df

# Execute the metadata analysis and keep both loaded frames at notebook scope
(features_df, responders_df) = analyze_metadata()