<a href="https://colab.research.google.com/github/VinKKAP/Data-Analysis-with-LLM/blob/main/Experiment_Run1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
!git clone https://github.com/VinKKAP/Data-Analysis-with-LLM.git

Cloning into 'Data-Analysis-with-LLM'...
remote: Enumerating objects: 433, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 433 (delta 11), reused 44 (delta 5), pack-reused 375 (from 1)[K
Receiving objects: 100% (433/433), 57.98 MiB | 11.56 MiB/s, done.
Resolving deltas: 100% (195/195), done.


In [3]:
!pip install -r /content/Data-Analysis-with-LLM/experiment/requirements.txt

Collecting datasets (from -r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting simpletransformers (from -r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 6))
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading multiprocess

In [None]:
import torch
print(torch.cuda.is_available())

False


In [4]:
'''
Correlation Prediction Script
This script predicts correlations between column names using a DistilBERT model and Theil's U values.
The code is organized into several main sections:

1. Data Preprocessing
   - Loading and filtering data
   - Adding column type information
   - Labeling correlations (binary classification)

2. Train/Test Split Functions
   - Default split (def_split)
   - Dataset-aware split (ds_split)

3. Prediction Methods
   - Baseline using Jaccard similarity
   - DistilBERT model predictions

4. Evaluation and Metrics
   - Performance metrics calculation
   - Results logging
   - Wandb integration for experiment tracking

5. Main Pipeline
   - Data loading and preprocessing
   - Model training
   - Evaluation across different data subsets
'''

from multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError:
    pass

import sklearn.metrics as metrics
import pandas as pd
import random as rand
import time

from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.stats import chi2_contingency

from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
userdata.get('HF_TOKEN')
import wandb
wandb.init(project="Distilbert_run23") # Initialize wandb run




# ---- Data Preprocessing Functions ----

def add_type(row):
    """ Enrich column name by adding column type.

    Args:
        row: describes correlation between two columns.

    Returns:
        row with enriched column names.
    """
    row['column1'] = row['column1'] + ' ' + row['type1']
    row['column2'] = row['column2'] + ' ' + row['type2']
    return row


# ---- Train/Test Split Functions ----

def def_split(data, test_ratio, seed):
    """ Split data into training and test set.

    With this approach, different column pairs from the
    same data set may appear in training and test set.

    Args:
        data: a pandas dataframe containing all data.
        test_ratio: ratio of test cases after split.
        seed: random seed for deterministic results.

    Returns:
        a tuple containing training, then test data.
    """
    print('Data sets in training and test may overlap')
    x_train, x_test, y_train, y_test = train_test_split(
      data[['column1', 'column2', 'type1', 'type2']], data['labels'],
      test_size=test_ratio, random_state=seed)
    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)
    print(f'train shape: {train.shape}')
    print(f'test shape: {test.shape}')
    return train, test


def ds_split(data, test_ratio):
    """ Split column pairs into training and test samples.

    If only one dataset is present, this function will fall back
    to default train-test split.

    Args:
        data: a pandas dataframe containing all data.
        test_ratio: ratio of test cases after splitting.

    Returns:
        a tuple containing training, then test set.
    """
    # Check if multiple datasets are present
    unique_datasets = data['dataid'].nunique()

    if unique_datasets <= 1:
        print('Only one dataset detected. Falling back to default split.')
        # If only one dataset, use default train-test split
        return def_split(data, test_ratio, seed=42)

    print('Separating training and test sets by data')
    counts = data['dataid'].value_counts()
    nr_vals = len(counts)
    nr_test_ds = int(nr_vals * test_ratio)
    print(f'Nr. test data sets: {nr_test_ds}')

    ds_ids = counts.index.values.tolist()
    test_ds = rand.sample(ds_ids, nr_test_ds)
    print(f'TestDS: {test_ds}')

    def is_test(row):
        return row['dataid'] in test_ds

    data['istest'] = data.apply(is_test, axis=1)
    train = data[data['istest'] == False]
    test = data[data['istest'] == True]

    print(f'train.shape: {train.shape}')
    print(f'test.shape: {test.shape}')

    return train[
        ['column1', 'column2', 'type1', 'type2', 'labels']], test[
            ['column1', 'column2', 'type1', 'type2', 'labels']]


# ---- Prediction Methods ----

def baseline(col_pairs):
    """ A simple baseline predicting correlation via Jaccard similarity.

    Args:
        col_pairs: list of tuples with column names.

    Returns:
        list of predictions (1 for correlation, 0 for no correlation).
    """
    predictions = []
    for cp in col_pairs:
        c1 = cp[0]
        c2= cp[1]
        s1 = set(c1.split())
        s2 = set(c2.split())
        ns1 = len(s1)
        ns2 = len(s2)
        ni = len(set.intersection(s1, s2))
        # calculate Jaccard coefficient
        jac = ni / (ns1 + ns2 - ni)
        # predict correlation if similar
        if jac > 0.5:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions


# ---- Evaluation and Metrics Functions ----

def log_metrics(
        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
        test_ratio, sub_test, test_name, lb, ub, pred_method,
        out_path, training_time, extra_info=None):
    """ Predicts using baseline or model, writes metrics to file.

    Args:
        (multiple arguments for logging performance metrics)
        extra_info: optional dict containing additional metrics to log
    """
    sub_test.columns = [
        'text_a', 'text_b', 'type1', 'type2', 'labels', 'length', 'nrtokens', 'prediction']
    # print out a sample for later analysis
    print(f'Sample for test {test_name}:')
    sample = sub_test.sample(frac=0.1)
    print(sample)
    # predict correlation via baseline or model
    sub_test = sub_test[['text_a', 'text_b', 'labels']]
    samples = []
    for _, r in sub_test.iterrows():
        samples.append([r['text_a'], r['text_b']])
    s_time = time.time()
    if pred_method == 0:
        preds = baseline(samples)
    else:
        preds = model.predict(samples)[0]
    # log various performance metrics
    t_time = time.time() - s_time
    nr_samples = len(sub_test.index)
    t_per_s = float(t_time) / nr_samples
    f1 = metrics.f1_score(sub_test['labels'], preds)
    pre = metrics.precision_score(sub_test['labels'], preds)
    rec = metrics.recall_score(sub_test['labels'], preds)
    acc = metrics.accuracy_score(sub_test['labels'], preds)
    mcc = metrics.matthews_corrcoef(sub_test['labels'], preds)
    # also log to local file
    with open(out_path, 'a+') as file:
        file.write(f'{coeff},{min_v1},{max_v2},"{mod_type}",' \
                f'"{mod_name}","{scenario}",{test_ratio},' \
                f'"{test_name}",{pred_method},{lb},{ub},' \
                f'{f1},{pre},{rec},{acc},{mcc},{t_per_s},' \
                f'{training_time}\n')
    # Log metrics to W&B
    wandb_metrics = {
        "Coefficient": coeff,
        "F1 Score": f1,
        "Precision": pre,
        "Recall": rec,
        "Accuracy": acc,
        "MCC": mcc,
        "Prediction Time per Sample": t_per_s,
        "Training Time": training_time,
        "Test Name": test_name,
        "Test Ratio": test_ratio
    }

    # Add extra info to wandb metrics if provided
    if extra_info:
        wandb_metrics.update(extra_info)

    wandb.log(wandb_metrics)

def names_length(row):
    """ Calculate combined length of column names. """
    return len(row['text_a']) + len(row['text_b'])


def names_tokens(row):
    """ Calculates number of tokens (separated by spaces). """
    return row['text_a'].count(' ') + row['text_b'].count(' ')


# ---- Main Pipeline Function ----

def run_correlation_prediction(
    src_path,
    coeff='theilsu',
    min_v1=0.2,
    max_v2=0.9,
    mod_type='distilbert',
    mod_name='distilbert-base-uncased',
    scenario='defsep',
    test_ratio=0.2,
    use_types=1,
    out_path='results.csv'
):
    # Initialize for deterministic results
    seed = 42
    rand.seed(seed)

    # Load and preprocess data
    data = pd.read_csv(src_path, sep=',')
    data = data.sample(frac=1, random_state=seed)

    # Rename columns to match expected format
    data.columns = [
        'dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2',
        'type1', 'type2', 'column1', 'column2', 'method',
        'coefficient', 'pvalue', 'time'
    ]

    # Convert 'nan' strings to actual NaN values
    data['coefficient'] = pd.to_numeric(data['coefficient'], errors='coerce')
    data['pvalue'] = pd.to_numeric(data['pvalue'], errors='coerce')

    # Remove rows with NaN values
    data = data.dropna(subset=['coefficient', 'pvalue'])

    # Filter data for specific correlation method
    data = data[data['method']==coeff]
    nr_total = len(data.index)
    print(f'Nr. samples: {nr_total}')
    print('Sample from filtered data:')
    print(data.head())

    # Label data
    def coefficient_label(row):
        """ Label column pair as correlated or uncorrelated. """
        if abs(row['coefficient']) >= min_v1 and abs(row['pvalue']) <= max_v2:
            return 1
        else:
            return 0
    data['labels'] = data.apply(coefficient_label, axis=1)


    # ---- Train/Test Split ----
    if scenario == 'defsep':
        train, test = def_split(data, test_ratio, seed)
    elif scenario == 'datasep':
        train, test = ds_split(data, test_ratio)
    else:
        raise ValueError(f'Undefined scenario: {scenario}')

    train.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    test.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    print(train.head())
    print(test.head())

    # ---- Model Training ----
    # Prepare loss scaling
    lab_counts = train['labels'].value_counts()
    nr_zeros = lab_counts.loc[0]
    nr_ones = lab_counts.loc[1]
    nr_all = float(len(train.index))
    weights = [nr_all/nr_zeros, nr_all/nr_ones]

    # Train classification model
    s_time = time.time()
    model_args = ClassificationArgs(
        num_train_epochs=5, train_batch_size=100, eval_batch_size=100,
        overwrite_output_dir=True, manual_seed=seed,
        evaluate_during_training=True, no_save=False,
        weight_decay=0.01,  # L2 regularization
        early_stopping_patience=3, # Early stopping
        )
    global model  # Make model global so it can be accessed in log_metrics
    model = ClassificationModel(
        mod_type, mod_name, weight=weights,
        use_cuda=True, args=model_args)
    model.train_model(
        train_df=train, eval_df=test, acc=metrics.accuracy_score,
        rec=metrics.recall_score, pre=metrics.precision_score,
        f1=metrics.f1_score)
    training_time = time.time() - s_time

    # ---- Evaluation ----
    test['length'] = test.apply(names_length, axis=1)
    test['nrtokens'] = test.apply(names_tokens, axis=1)

    # Initialize result file
    with open(out_path, 'w') as file:
        file.write(
            'coefficient,min_v1,max_v2,mod_type,mod_name,scenario,test_ratio,'
            'test_name,pred_method,lb,ub,f1,precision,recall,accuracy,mcc,'
            'prediction_time,training_time\n')

    # Use simple baseline and model for prediction
    for m in [0, 1]:
        # Get predictions based on model or baseline
        if m == 1:
            # Use model predictions
            predictions, raw_outputs = model.predict(test[['text_a', 'text_b']].values.tolist())
            test['prediction'] = predictions
        else:
            # Use baseline predictions
            test['prediction'] = 0  # Simple baseline predicts all as uncorrelated

        # Use entire test set
        test_name = f'{m}-final'
        log_metrics(
            coeff, min_v1, max_v2, mod_type, mod_name, scenario,
            test_ratio, test, test_name, 0, 'inf', m, out_path, training_time)

        # Test for data types
        for type1 in ['object', 'float64', 'int64', 'bool']:
            for type2 in ['object', 'float64', 'int64', 'bool']:
                sub_test = test.query(f'type1=="{type1}" and type2=="{type2}"')
                if sub_test.shape[0]:
                    test_name = f'Types{type1}-{type2}'
                    log_metrics(
                        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                        test_ratio, sub_test, test_name, -1, -1, m,
                        out_path, training_time)

        # Test for different subsets
        for q in [(0, 0.25), (0.25, 0.5), (0.5, 1)]:
            qlb = q[0]
            qub = q[1]
            # Column name length
            lb = test['length'].quantile(qlb)
            ub = test['length'].quantile(qub)
            sub_test = test[(test['length'] >= lb) & (test['length'] <= ub)]
            test_name = f'L{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, out_path, training_time)

            # Number of tokens in column names
            lb = test['nrtokens'].quantile(qlb)
            ub = test['nrtokens'].quantile(qub)
            sub_test = test[(test['nrtokens'] >= lb) & (test['nrtokens'] <= ub)]
            test_name = f'N{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, out_path, training_time)

        # Test for demographic factors
        demographic_values = [
            'sex', 'age', 'year_of_birth', 'race', 'detailed_race',
            'hispanic_origin', 'detailed_hispanic_origin', 'birthplace',
            'detailed_birthplace', 'first_ancestry', 'detailed_first_ancestry',
            'citizenship_status', 'year_of_naturalization', 'year_of_immigration',
            'years_in_usa', 'english_proficiency', 'american_indian_race',
            'asian_race', 'black_race', 'pacific_islander_race', 'white_race',
            'other_race'
        ]

        # Get overall metrics for comparison using current predictions
        overall_metrics = {
            'tp': sum((test['labels'] == 1) & (test['prediction'] == 1)),
            'fp': sum((test['labels'] == 0) & (test['prediction'] == 1)),
            'fn': sum((test['labels'] == 1) & (test['prediction'] == 0)),
            'tn': sum((test['labels'] == 0) & (test['prediction'] == 0))
        }

        def calculate_significance(subset_metrics, overall_metrics):
            # Check if any value in the contingency table is 0
            if (subset_metrics['tp'] == 0 or subset_metrics['fp'] == 0 or
                subset_metrics['fn'] == 0 or subset_metrics['tn'] == 0):
                return False

            # Perform chi-square test on confusion matrices
            chi2, p_value = chi2_contingency([
                [subset_metrics['tp'], subset_metrics['fp']],
                [subset_metrics['fn'], subset_metrics['tn']]
            ])[0:2]
            return p_value < 0.05  # Returns True if difference is significant

        # Track demographic differences
        demographic_differences = {}

        # Evaluate rows containing demographic values in text_a and text_b
        for demographic in demographic_values:
            # Find rows where either column contains the demographic term
            demographic_rows = test[
                test['text_a'].str.contains(demographic, case=False, na=False) |
                test['text_b'].str.contains(demographic, case=False, na=False)
            ]

            if not demographic_rows.empty:
                # Calculate metrics for this demographic subset
                subset_metrics = {
                    'tp': sum((demographic_rows['labels'] == 1) & (demographic_rows['prediction'] == 1)),
                    'fp': sum((demographic_rows['labels'] == 0) & (demographic_rows['prediction'] == 1)),
                    'fn': sum((demographic_rows['labels'] == 1) & (demographic_rows['prediction'] == 0)),
                    'tn': sum((demographic_rows['labels'] == 0) & (demographic_rows['prediction'] == 0))
                }

                # Calculate performance metrics
                total = sum(subset_metrics.values())
                accuracy = (subset_metrics['tp'] + subset_metrics['tn']) / total if total > 0 else 0

                # Store metrics for comparison
                demographic_differences[demographic] = {
                    'accuracy': accuracy,
                    'sample_size': total,
                    'metrics': subset_metrics
                }

                # Check if difference is statistically significant
                is_significant = calculate_significance(subset_metrics, overall_metrics)

                if is_significant:
                    test_name = f'Demo-{demographic}'
                    # Log metrics only if there's a significant difference
                    log_metrics(
                        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                        test_ratio, demographic_rows, test_name, -1, -1, m,
                        out_path, training_time,
                        extra_info={
                            'demographic_factor': demographic,
                            'is_significant': True,
                            'confusion_matrix': subset_metrics,
                            'accuracy': accuracy
                        }
                    )

        # Print summary of demographic differences
        print(f"\nDemographic Performance Differences (Method: {'Model' if m == 1 else 'Baseline'}):")
        for demo, stats in demographic_differences.items():
            print(f"{demo}: Accuracy = {stats['accuracy']:.3f}, Sample Size = {stats['sample_size']}")


# Example usage
if __name__ == '__main__':
    # Modify these parameters as needed
    run_correlation_prediction(
        src_path='/content/Data-Analysis-with-LLM/experiment/data/corresult5.csv',
        coeff='theilsu',
        min_v1=0.2,
        max_v2=0.9,
        mod_type='distilbert',
        mod_name='distilbert-base-uncased',
        scenario='defsep',
        test_ratio=0.2,
        use_types=1,
        out_path='/content/drive/My Drive/Colab Notebooks/Liter/correlations/results/results23.csv'
    )

Mounted at /content/drive


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Nr. samples: 2556
Sample from filtered data:
      dataid                 datapath  nrrows  nrvals1  nrvals2   type1  \
1988       2  ../data/output_file.csv      63        6       21  object   
1216       2  ../data/output_file.csv      63        2        2  object   
1385       2  ../data/output_file.csv      63       38       22  object   
1334       2  ../data/output_file.csv      63       21        2  object   
175        2  ../data/output_file.csv      63        4        4  object   

       type2                       column1                  column2   method  \
1988  object      detailed_class_of_worker               birthplace  theilsu   
1216  object                   school_type  detailed_grade_attended  theilsu   
1385   int64  detailed_second_degree_field         year_of_marriage  theilsu   
1334  object           second_degree_field      married_within_year  theilsu   
175   object                marital_status       number_of_children  theilsu   

      coefficient    pv

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test 0-final:
                              text_a                          text_b    type1  \
1075              detailed_education                      white_race   object   
1450               employment_status             english_proficiency   object   
436              detailed_birthplace                household_weight   object   
2389                     wage_income              citizenship_status    int64   
2584            education_score_1990                      asian_race  float64   
1761  occupation_2010_classification  occupation_1990_classification   object   
2037                  soc_occupation                             sex   object   
1059              detailed_education                   detailed_race   object   
2233              usual_hours_worked                  fertility_year   object   
433              detailed_birthplace                     survey_year   object   
483                   first_ancestry                  fertility_year   object   
102

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                    text_a            text_b    type1  type2  labels  length  \
2564  education_score_1990  year_of_marriage  float64  int64       1      36   

      nrtokens  prediction  
2564         0           0  
Sample for test Typesint64-object:
                text_a                   text_b  type1   type2  labels  \
194      year_of_birth         us_census_region  int64  object       1   
210      year_of_birth                      age  int64  object       0   
2408       wage_income  detailed_grade_attended  int64  object       0   
275   year_of_marriage      age_of_eldest_child  int64  object       1   
1641        occupation              school_type  int64  object       0   

      length  nrtokens  prediction  
194       29         0           0  
210       16         0           0  
2408      34         0           0  
275       35         0           0  
1641      21         0           0  
Sample for test Typesint64-float64:
Empty DataFrame
Columns: [text_a, text_b, t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                              text_a                          text_b    type1  \
1297           detailed_degree_field          year_of_naturalization   object   
2644                  veteran_status             married_within_year   object   
1489      detailed_employment_status                  fertility_year   object   
187                    year_of_birth                  dataset_number    int64   
758                       asian_race                household_weight   object   
1521      labour_force_participation                  dataset_number   object   
1879    industry_1990_classification                      asian_race   object   
1042              detailed_education  person_number_within_household   object   
2003        detailed_class_of_worker               school_attendance   object   
1057              detailed_education                  fertility_year   object   
721             american_indian_race         household_serial_number   object   
827                       bl

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Demographic Performance Differences (Method: Baseline):
sex: Accuracy = 0.889, Sample Size = 9
age: Accuracy = 0.406, Sample Size = 96
year_of_birth: Accuracy = 0.211, Sample Size = 19
race: Accuracy = 0.591, Sample Size = 110
detailed_race: Accuracy = 0.214, Sample Size = 14
hispanic_origin: Accuracy = 0.607, Sample Size = 28
detailed_hispanic_origin: Accuracy = 0.800, Sample Size = 10
birthplace: Accuracy = 0.324, Sample Size = 34
detailed_birthplace: Accuracy = 0.389, Sample Size = 18
first_ancestry: Accuracy = 0.243, Sample Size = 37
detailed_first_ancestry: Accuracy = 0.308, Sample Size = 13
citizenship_status: Accuracy = 0.615, Sample Size = 13
year_of_naturalization: Accuracy = 0.167, Sample Size = 12
year_of_immigration: Accuracy = 0.273, Sample Size = 11
years_in_usa: Accuracy = 0.111, Sample Size = 9
english_proficiency: Accuracy = 0.583, Sample Size = 12
american_indian_race: Accuracy = 0.652, Sample Size = 23
asian_race: Accuracy = 0.833, Sample Size = 18
black_race: Accur

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test 1-final:
                              text_a                          text_b    type1  \
1859    industry_1990_classification                  marital_status   object   
2116                  naics_industry          year_of_naturalization   object   
2161                    weeks_worked              number_of_siblings   object   
2565            education_score_1990                  fertility_year  float64   
1043              detailed_education                   person_weight   object   
910                       other_race                  dataset_number   object   
532               citizenship_status             age_of_eldest_child   object   
414                       birthplace                   person_weight   object   
65                       family_size                     survey_year   object   
2599            education_score_1990               employment_status  float64   
2612            education_score_1990        detailed_class_of_worker  float64   
735

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test Typesobject-object:
                              text_a                        text_b   type1  \
735             american_indian_race                           age  object   
2176                    weeks_worked                first_ancestry  object   
450              detailed_birthplace                marital_status  object   
2081                  soc_occupation  industry_1990_classification  object   
1229                    degree_field           age_of_eldest_child  object   
743             american_indian_race                 detailed_race  object   
2316                worked_last_year                    white_race  object   
1888    industry_1990_classification       detailed_grade_attended  object   
815                       black_race                          race  object   
2135                  naics_industry  detailed_second_degree_field  object   
1023                       education            citizenship_status  object   
1420               employmen

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test Typesobject-float64:
               text_a                   text_b   type1    type2  labels  \
414        birthplace            person_weight  object  float64       1   
1904  class_of_worker         household_weight  object  float64       1   
1172      school_type  household_serial_number  object  float64       1   
358   hispanic_origin         household_weight  object  float64       1   

      length  nrtokens  prediction  
414       23         0           1  
1904      31         0           1  
1172      34         0           1  
358       31         0           1  


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesobject-int64:
                              text_a                          text_b   type1  \
1422               employment_status  person_number_within_household  object   
407                       birthplace                  dataset_number  object   
1191                     school_type                year_of_marriage  object   
1382    detailed_second_degree_field                   year_of_birth  object   
719             american_indian_race                     survey_year  object   
1760  occupation_2010_classification                      occupation  object   

      type2  labels  length  nrtokens  prediction  
1422  int64       0      47         0           0  
407   int64       0      24         0           0  
1191  int64       1      27         0           1  
1382  int64       1      41         0           1  
719   int64       0      31         0           0  
1760  int64       0      40         0           0  


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesfloat64-object:
                    text_a                        text_b    type1   type2  \
2572  education_score_1990                first_ancestry  float64  object   
2505       parent_presence             employment_status  float64  object   
2610  education_score_1990  industry_1990_classification  float64  object   

      labels  length  nrtokens  prediction  
2572       1      34         0           1  
2505       0      32         0           0  
2610       1      48         0           1  


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesfloat64-int64:
                    text_a        text_b    type1  type2  labels  length  \
2581  education_score_1990  years_in_usa  float64  int64       1      32   

      nrtokens  prediction  
2581         0           1  


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesint64-object:
                              text_a               text_b  type1   type2  \
45    person_number_within_household  metropolitan_status  int64  object   
208                    year_of_birth   number_of_siblings  int64  object   
1784                        industry   number_of_children  int64  object   
672                     years_in_usa        detailed_race  int64  object   
611              year_of_immigration                  age  int64  object   

      labels  length  nrtokens  prediction  
45         0      49         0           0  
208        0      31         0           0  
1784       0      26         0           0  
672        1      25         0           1  
611        1      22         0           1  


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesint64-float64:
Empty DataFrame
Columns: [text_a, text_b, type1, type2, labels, length, nrtokens, prediction]
Index: []


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Typesint64-int64:
            text_a          text_b  type1  type2  labels  length  nrtokens  \
187  year_of_birth  dataset_number  int64  int64       0      27         0   

     prediction  
187           0  


  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test L1-0-0.25:
                    text_a           text_b   type1   type2  labels  length  \
1260          degree_field        education  object  object       0      21   
1246          degree_field   first_ancestry  object  object       1      26   
2374           wage_income    year_of_birth   int64   int64       1      24   
2387           wage_income   first_ancestry   int64  object       1      25   
1785              industry  child_age_leq_4   int64  object       0      23   
735   american_indian_race              age  object  object       1      23   
891             white_race   fertility_year  object  object       0      24   
309                   race   dataset_number  object   int64       0      18   
1603            occupation   marital_status   int64  object       0      24   
1910       class_of_worker      family_size  object  object       1      26   
884             white_race              sex  object  object       0      13   
2069        soc_occupatio

  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test N1-0-0.25:
                              text_a                        text_b    type1  \
2132                  naics_industry                  degree_field   object   
2600            education_score_1990    detailed_employment_status  float64   
2719         detailed_veteran_status               hispanic_origin   object   
616              year_of_immigration           number_of_marriages    int64   
45    person_number_within_household           metropolitan_status    int64   
1265           detailed_degree_field                   survey_year   object   
1569      labour_force_participation         detailed_degree_field   object   
2611            education_score_1990               class_of_worker  float64   
785                       asian_race       detailed_first_ancestry   object   
1206                     school_type          american_indian_race   object   
1702  occupation_1990_classification    detailed_employment_status   object   
1027                     

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test L1-0.25-0.5:
                    text_a               text_b    type1    type2  labels  \
305         fertility_year  number_of_marriages   object   object       0   
2653        veteran_status       first_ancestry   object   object       1   
1103        grade_attended        detailed_race   object   object       0   
1087        grade_attended        person_weight   object  float64       1   
1598            occupation  age_of_eldest_child    int64   object       1   
2594  education_score_1990          school_type  float64   object       0   
2067        soc_occupation       grade_attended   object   object       0   
991      school_attendance           other_race   object   object       1   
2041        soc_occupation  number_of_marriages   object   object       0   
304         fertility_year        year_of_birth   object    int64       1   
666           years_in_usa  married_within_year    int64   object       0   
615    year_of_immigration        year_of_birth

0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test N1-0.25-0.5:
                              text_a                          text_b    type1  \
991                school_attendance                      other_race   object   
719             american_indian_race                     survey_year   object   
1375    detailed_second_degree_field                 child_age_leq_4   object   
664                     years_in_usa                   year_of_birth    int64   
911                       other_race         household_serial_number   object   
2211              usual_hours_worked                     survey_year   object   
470                   first_ancestry                     family_size   object   
494          detailed_first_ancestry                us_census_region   object   
2306                worked_last_year         detailed_first_ancestry   object   
304                   fertility_year                   year_of_birth   object   
988                school_attendance                      black_race   object   

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test L1-0.5-1:
                              text_a                          text_b    type1  \
609              year_of_immigration              number_of_siblings    int64   
2307                worked_last_year              citizenship_status   object   
2011        detailed_class_of_worker             second_degree_field   object   
2611            education_score_1990                 class_of_worker  float64   
1126         detailed_grade_attended         household_serial_number   object   
560           year_of_naturalization                     family_size   object   
1879    industry_1990_classification                      asian_race   object   
2079                  soc_occupation  occupation_2010_classification   object   
480                   first_ancestry             number_of_marriages   object   
2502                 parent_presence           detailed_degree_field  float64   
362                  hispanic_origin  person_number_within_household   object   
18

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test N1-0.5-1:
                              text_a                        text_b    type1  \
2219              usual_hours_worked                 person_weight   object   
1688  occupation_1990_classification         pacific_islander_race   object   
894                       white_race               hispanic_origin   object   
1029                       education                    asian_race   object   
275                 year_of_marriage           age_of_eldest_child    int64   
884                       white_race                           sex   object   
1786                        industry           age_of_eldest_child    int64   
1488      detailed_employment_status              year_of_marriage   object   
455              detailed_birthplace                fertility_year   object   
452              detailed_birthplace           number_of_marriages   object   
2745         detailed_veteran_status  detailed_second_degree_field   object   
436              detailed_

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test Demo-age:
                              text_a                 text_b    type1   type2  \
2526                 parent_presence            wage_income  float64   int64   
849            pacific_islander_race    number_of_marriages   object  object   
1718  occupation_2010_classification  age_of_youngest_child   object  object   
1598                      occupation    age_of_eldest_child    int64  object   
283                 year_of_marriage          year_of_birth    int64   int64   
2411                     wage_income  detailed_degree_field    int64  object   
1665  occupation_1990_classification                    age   object  object   
321                             race  age_of_youngest_child   object  object   
2374                     wage_income          year_of_birth    int64   int64   
660                     years_in_usa                    age    int64  object   

      labels  length  nrtokens  prediction  
2526       1      26         0           0  
849

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Sample for test Demo-race:
                       text_a                 text_b   type1    type2  labels  \
991         school_attendance             other_race  object   object       1   
728      american_indian_race            family_size  object   object       1   
2717  detailed_veteran_status                   race  object   object       1   
1455        employment_status             white_race  object   object       0   
933                other_race          detailed_race  object   object       1   
759                asian_race       us_census_region  object   object       0   
337             detailed_race  group_quarters_status  object   object       0   
2732  detailed_veteran_status             black_race  object   object       0   
722      american_indian_race       household_weight  object  float64       1   
1206              school_type   american_indian_race  object   object       0   
378           hispanic_origin                   race  object   object       1   



  with amp.autocast():


0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]


Demographic Performance Differences (Method: Model):
sex: Accuracy = 1.000, Sample Size = 9
age: Accuracy = 0.938, Sample Size = 96
year_of_birth: Accuracy = 0.947, Sample Size = 19
race: Accuracy = 0.882, Sample Size = 110
detailed_race: Accuracy = 0.929, Sample Size = 14
hispanic_origin: Accuracy = 0.643, Sample Size = 28
detailed_hispanic_origin: Accuracy = 0.500, Sample Size = 10
birthplace: Accuracy = 0.912, Sample Size = 34
detailed_birthplace: Accuracy = 0.889, Sample Size = 18
first_ancestry: Accuracy = 0.946, Sample Size = 37
detailed_first_ancestry: Accuracy = 0.923, Sample Size = 13
citizenship_status: Accuracy = 0.769, Sample Size = 13
year_of_naturalization: Accuracy = 1.000, Sample Size = 12
year_of_immigration: Accuracy = 1.000, Sample Size = 11
years_in_usa: Accuracy = 1.000, Sample Size = 9
english_proficiency: Accuracy = 0.750, Sample Size = 12
american_indian_race: Accuracy = 0.957, Sample Size = 23
asian_race: Accuracy = 0.944, Sample Size = 18
black_race: Accuracy

  with amp.autocast():


In [26]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import json
import wandb

def process_experiment_results(results_path):
    """
    Process the experiment results and prepare data for visualization

    Args:
        results_path: Path to the CSV file containing experiment results

    Returns:
        dict: Processed metrics and analysis ready for visualization
    """
    # Read results file
    df = pd.read_csv(results_path)

    # Process overall metrics for model predictions (pred_method = 1)
    model_metrics = df[df['pred_method'] == 1].copy()

    # Calculate average metrics for overall performance
    overall_metrics = [{
        'name': 'F1 Score',
        'value': model_metrics['f1'].mean()
    }, {
        'name': 'Precision',
        'value': model_metrics['precision'].mean()
    }, {
        'name': 'Recall',
        'value': model_metrics['recall'].mean()
    }, {
        'name': 'Accuracy',
        'value': model_metrics['accuracy'].mean()
    }, {
        'name': 'MCC',
        'value': model_metrics['mcc'].mean()
    }]

    # Process demographic performance
    demographic_performance = []

    # Filter for demographic test results
    demo_results = df[df['test_name'].str.startswith('Demo-', na=False)]

    for _, row in demo_results.iterrows():
        demo_name = row['test_name'].replace('Demo-', '')

        # Extract metrics
        performance = {
            'name': demo_name,
            'accuracy': row['accuracy'],
            'sampleSize': int(row['prediction_time'] * 1000),  # Convert to approximate sample size
            'significant': True if row['mcc'] > df['mcc'].mean() else False,
            'f1': row['f1'],
            'precision': row['precision'],
            'recall': row['recall']
        }
        demographic_performance.append(performance)

    # Get training progress data from wandb
    api = wandb.Api()
    runs = api.runs("Distilbert_run23")  # Replace with your wandb project name

    # Get the most recent run
    latest_run = runs[0]
    history = latest_run.scan_history()

    # Extract training progress
    training_progress = []
    for i, row in enumerate(history):
        if 'train_loss' in row and 'eval_loss' in row:
            training_progress.append({
                'epoch': i + 1,
                'trainLoss': row['train_loss'],
                'evalLoss': row['eval_loss']
            })

    # Calculate significance tests for demographic differences
    def calculate_demographic_significance(group_metrics, overall_metrics):
        """Calculate statistical significance of demographic differences"""
        contingency_table = [
            [group_metrics['tp'], group_metrics['fp']],
            [group_metrics['fn'], group_metrics['tn']]
        ]

        chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
        return p_value < 0.05

    # Process demographic significance
    demographic_significance = {}
    for demo in demographic_performance:
        # Extract confusion matrix metrics (you'll need to modify this based on your actual data structure)
        group_metrics = {
            'tp': demo.get('true_positives', 0),
            'fp': demo.get('false_positives', 0),
            'fn': demo.get('false_negatives', 0),
            'tn': demo.get('true_negatives', 0)
        }

        # Calculate significance
        is_significant = calculate_demographic_significance(group_metrics, overall_metrics)
        demographic_significance[demo['name']] = is_significant

    # Prepare final output
    visualization_data = {
        'overall_metrics': overall_metrics,
        'demographic_performance': demographic_performance,
        'demographic_significance': demographic_significance,
        'training_progress': training_progress
    }

    return visualization_data

def save_visualization_data(data, output_path):
    """Save processed data to JSON file for visualization"""

    # Convert NumPy booleans to Python booleans
    def convert(o):
        if isinstance(o, np.bool_):
            return bool(o)
        raise TypeError

    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2, default=convert)

def generate_summary_report(data):
    """Generate a text summary of the key findings"""
    summary = []

    # Overall performance summary
    avg_metrics = {metric['name']: metric['value'] for metric in data['overall_metrics']}
    summary.append("Overall Performance Summary:")
    for name, value in avg_metrics.items():
        summary.append(f"- {name}: {value:.3f}")

    # Demographic analysis summary
    summary.append("\nDemographic Analysis Summary:")
    demo_metrics = data['demographic_performance']
    for demo in demo_metrics:
        significance = "Significant" if demo['significant'] else "Not Significant"
        summary.append(
            f"- {demo['name']}: Accuracy={demo['accuracy']:.3f}, "
            f"Sample Size={demo['sampleSize']}, {significance}"
        )

    # Training progress summary
    train_progress = data['training_progress']
    if train_progress:
        final_epoch = train_progress[-1]
        summary.append("\nTraining Summary:")
        summary.append(
            f"- Final Training Loss: {final_epoch['trainLoss']:.3f}"
            f"- Final Evaluation Loss: {final_epoch['evalLoss']:.3f}"
        )

    return "\n".join(summary)

def main():
    # Configure paths
    results_path = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/results/results23.csv'  # Update with your results path
    output_path = 'visualization_data.json'

    # Process the results
    print("Processing experiment results...")
    visualization_data = process_experiment_results(results_path)

    # Save processed data
    print("Saving processed data...")
    save_visualization_data(visualization_data, output_path)

    # Generate and print summary report
    print("\nAnalysis Summary:")
    summary = generate_summary_report(visualization_data)
    print(summary)

    # Save summary report
    with open('analysis_summary.txt', 'w') as f:
        f.write(summary)

    print("\nProcessing complete. Data saved for visualization.")

if __name__ == "__main__":
    main()

Processing experiment results...
True
True
Saving processed data...

Analysis Summary:
Overall Performance Summary:
- F1 Score: 0.918
- Precision: 0.943
- Recall: 0.899
- Accuracy: 0.926
- MCC: 0.741

Demographic Analysis Summary:
- age: Accuracy=0.938, Sample Size=81, Significant
- race: Accuracy=0.882, Sample Size=88, Significant

Processing complete. Data saved for visualization.


  expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
