<a href="https://colab.research.google.com/github/VinKKAP/Data-Analysis-with-LLM/blob/main/Experiment_Run1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
!git clone https://github.com/VinKKAP/Data-Analysis-with-LLM.git

Cloning into 'Data-Analysis-with-LLM'...
remote: Enumerating objects: 427, done.[K
remote: Counting objects: 100% (52/52), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 427 (delta 8), reused 45 (delta 5), pack-reused 375 (from 1)[K
Receiving objects: 100% (427/427), 57.93 MiB | 20.14 MiB/s, done.
Resolving deltas: 100% (192/192), done.
Updating files: 100% (93/93), done.


In [3]:
!pip install -r /content/Data-Analysis-with-LLM/experiment/requirements.txt

Collecting datasets (from -r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting simpletransformers (from -r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 6))
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r /content/Data-Analysis-with-LLM/experiment/requirements.txt (line 1))
  Downloading multiprocess

In [None]:
import torch
print(torch.cuda.is_available())

True


In [None]:
'''
Created on Aug 12, 2023

@author: immanueltrummer
added Dez 2024
@added: vincentkapp
'''
from multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError:
    pass

import argparse
import sklearn.metrics as metrics
import pandas as pd
import random as rand
import time
from scipy.stats import chi2_contingency

from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
userdata.get('HF_TOKEN')
import wandb


# Initialize wandb run
wandb.init(project="Distilbert_run1")


def add_type(row):

    """ Enrich column name by adding column type.

    Args:
        row: describes correlation between two columns.

    Returns:
        row with enriched column names.
    """
    row['column1'] = row['column1'] + ' ' + row['type1']
    row['column2'] = row['column2'] + ' ' + row['type2']
    return row


def def_split(data, test_ratio, seed):
    """ Split data into training and test set.

    With this approach, different column pairs from the
    same data set may appear in training and test set.

    Args:
        data: a pandas dataframe containing all data.
        test_ratio: ratio of test cases after split.
        seed: random seed for deterministic results.

    Returns:
        a tuple containing training, then test data.
    """
    print('Data sets in training and test may overlap')
    x_train, x_test, y_train, y_test = train_test_split(
      data[['column1', 'column2', 'type1', 'type2']], data['label'],
      test_size=test_ratio, random_state=seed)
    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)
    print(f'train shape: {train.shape}')
    print(f'test shape: {test.shape}')
    return train, test


def ds_split(data, test_ratio):
    """ Split column pairs into training and test samples.

    With this method, training and test set contain columns
    of disjunct data sets, making prediction a bit harder.

    Args:
        data: a pandas dataframe containing all data.
        test_ratio: ratio of test cases after splitting.

    Returns:
        a tuple containing training, then test set.
    """
    print('Separating training and test sets by data')
    counts = data['dataid'].value_counts()
    print(f'Counts: {counts}')
    print(f'Count.index: {counts.index}')
    print(f'Count.index.values: {counts.index.values}')
    print(f'counts.shape: {counts.shape}')
    print(f'counts.iloc[0]: {counts.iloc[0]}')
    nr_vals = len(counts)
    nr_test_ds = int(nr_vals * test_ratio)
    print(f'Nr. test data sets: {nr_test_ds}')
    ds_ids = counts.index.values.tolist()
    print(type(ds_ids))
    print(ds_ids)
    test_ds = rand.sample(ds_ids, nr_test_ds)
    print(f'TestDS: {test_ds}')

    def is_test(row):
        if row['dataid'] in test_ds:
            return True
        else:
            return False

    data['istest'] = data.apply(is_test, axis=1)
    train = data[data['istest'] == False]
    test = data[data['istest'] == True]
    print(f'train.shape: {train.shape}')
    print(f'test.shape: {test.shape}')
    print(train)
    print(test)
    return train[
        ['column1', 'column2', 'type1', 'type2', 'label']], test[
            ['column1', 'column2', 'type1', 'type2', 'label']]


def baseline(col_pairs):
    """ A simple baseline predicting correlation via Jaccard similarity.

    Args:
        col_pairs: list of tuples with column names.

    Returns:
        list of predictions (1 for correlation, 0 for no correlation).
    """
    predictions = []
    for cp in col_pairs:
        c1 = cp[0]
        c2= cp[1]
        s1 = set(c1.split())
        s2 = set(c2.split())
        ns1 = len(s1)
        ns2 = len(s2)
        ni = len(set.intersection(s1, s2))
        # calculate Jaccard coefficient
        jac = ni / (ns1 + ns2 - ni)
        # predict correlation if similar
        if jac > 0.5:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions


# log all metrics into summary for data subset
def log_metrics(
        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
        test_ratio, sub_test, test_name, lb, ub, pred_method,
        out_path, training_time):
    """ Predicts using baseline or model, writes metrics to file.

    Args:
        coeff: predict correlation according to this coefficient.
        min_v1: lower bound on coefficient value for correlation.
        max_v2: upper bound on p-value to be considered correlated.
        mod_type: base type of language model used for prediction.
        mod_name: precise name of language model used for prediction.
        scenario: how training and test data relate to each other.
        test_ratio: ratio of column pairs used for testing (not training).
        sub_test: data frame with test cases, possibly a subset.
        test_name: write this test name into result file.
        lb: lower bound on a test-specific metric constraining test cases.
        ub: upper bound on test-specific metric, constraining test cases.
        pred_metho: whether to use language model or simple baseline.
        out_path: path to result output file (results are appended).
    """
    sub_test.columns = [
        'text_a', 'text_b', 'type1', 'type2', 'labels', 'length', 'nrtokens']
    # print out a sample for later analysis
    print(f'Sample for test {test_name}:')
    sample = sub_test.sample(frac=0.1)
    print(sample)
    # predict correlation via baseline or model
    sub_test = sub_test[['text_a', 'text_b', 'labels']]
    samples = []
    for _, r in sub_test.iterrows():
        samples.append([r['text_a'], r['text_b']])
    s_time = time.time()
    if pred_method == 0:
        preds = baseline(samples)
    else:
        preds = model.predict(samples)[0]
    # log various performance metrics
    t_time = time.time() - s_time
    nr_samples = len(sub_test.index)
    t_per_s = float(t_time) / nr_samples
    f1 = metrics.f1_score(sub_test['labels'], preds)
    pre = metrics.precision_score(sub_test['labels'], preds)
    rec = metrics.recall_score(sub_test['labels'], preds)
    acc = metrics.accuracy_score(sub_test['labels'], preds)
    mcc = metrics.matthews_corrcoef(sub_test['labels'], preds)

    # Create a file path within the out_path directory
    file_path = os.path.join(out_path, 'results.csv')

    # Now use the file_path instead of out_path
    with open(file_path, 'a+') as file:
        file.write(f'{coeff},{min_v1},{max_v2},"{mod_type}",' \
                f'"{mod_name}","{scenario}",{test_ratio},' \
                f'"{test_name}",{pred_method},{lb},{ub},' \
                f'{f1},{pre},{rec},{acc},{mcc},{t_per_s},' \
                f'{training_time}\n')


def names_length(row):
    """ Calculate combined length of column names.

    Args:
        row: contains information on one column pair.

    Returns:
        combined length of column names (in characters).
    """
    return len(row['text_a']) + len(row['text_b'])

def names_tokens(row):
    """ Calculates number of tokens (separated by spaces).

    Attention: this is not the number of tokens as calculated
    by the tokenizer of the language model but an approximation.

    Args:
        row: contains information on one column pair.

    Returns:
        number of space-separated substrings in both column names.
    """
    return row['text_a'].count(' ') + row['text_b'].count(' ')


def chi_square_test(predictions, labels, sex):
    """ Perform Chi-Square test to check if there is a significant difference between sexes in predictions.

    Args:
        predictions: list of model predictions.
        labels: list of true labels.
        sex: list of sex corresponding to each prediction.

    Returns:
        p-value of the Chi-Square test.
    """
    contingency_table = pd.crosstab(pd.Series(sex), pd.Series(predictions))
    chi2, p, _, _ = chi2_contingency(contingency_table)
    return p


def run_experiment(src_path, coeff, min_v1, max_v2, mod_type, mod_name, scenario, test_ratio, use_types, out_path):
    # print parameters
    print(f'Coefficients: {coeff}')
    print(f'Minimal value 1: {min_v1}')
    print(f'Maximal value 2: {max_v2}')
    print(f'Model type: {mod_type}')
    print(f'Model name: {mod_name}')
    print(f'Scenario: {scenario}')
    print(f'Test ratio: {test_ratio}')

    # initialize for deterministic results
    seed = 42
    rand.seed(seed)

    # load data
    data = pd.read_csv(src_path, sep = ',')
    data = data.sample(frac=1, random_state=seed)
    data.columns = [
        'dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2',
        'type1', 'type2', 'column1', 'column2', 'method',
        'coefficient', 'pvalue', 'time']

    # enrich column names if activated
    if use_types:
        data = data.apply(add_type, axis=1)

    # Initialize the tokenizer and model from the pre-trained model name
    tokenizer = AutoTokenizer.from_pretrained(mod_name)
    model = AutoModelForSequenceClassification.from_pretrained(mod_name, num_labels=2)

    # Set a padding token if not already set
    if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
    # Update the model configuration to include the pad_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Check if GPU is available and set the device accordingly
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Move the model to the GPU if available
    model.to(device)

    # filter data
    data = data[data['method']==coeff]
    nr_total = len(data.index)
    print(f'Nr. samples: {nr_total}')
    print('Sample from filtered data:')
    print(data.head())

    # label data
    def coefficient_label(row):
        """ Label column pair as correlated or uncorrelated.

        Args:
            row: describes correlation between column pair.

        Returns:
            1 if correlated, 0 if not correlated.
        """
        if abs(row['coefficient']) >= min_v1 and abs(row['pvalue']) <= max_v2:
            return 1
        else:
            return 0
    data['label'] = data.apply(coefficient_label, axis=1)

    # split into test and training
    if scenario == 'defsep':
        train, test = def_split(data, test_ratio, seed)
    elif scenario == 'datasep':
        train, test = ds_split(data, test_ratio)
    else:
        raise ValueError(f'Undefined scenario: {scenario}')

    train.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    test.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    print(train.head())
    print(test.head())

    # prepare dataset for transformers
    train_encodings = tokenizer(train['text_a'].tolist(), train['text_b'].tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test['text_a'].tolist(), test['text_b'].tolist(), truncation=True, padding=True)

    class Dataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = Dataset(train_encodings, train['labels'].tolist())
    test_dataset = Dataset(test_encodings, test['labels'].tolist())

    # prepare loss scaling
    lab_counts = train['labels'].value_counts()
    # prepare loss scaling
    lab_counts = train['labels'].value_counts()
    nr_zeros = lab_counts.loc[0] if 0 in lab_counts else 0  # Handle case where 0 is not in index
    nr_ones = lab_counts.loc[1] if 1 in lab_counts else 0  # Handle case where 1 is not in index

    nr_all = float(len(train.index))
    # Avoid division by zero if nr_zeros or nr_ones is 0
    weights = [nr_all / nr_zeros if nr_zeros else 1, nr_all / nr_ones if nr_ones else 1]

    # train classification model
    s_time = time.time()
    model_args = ClassificationArgs(
        num_train_epochs=5, train_batch_size=100, eval_batch_size=100,
        overwrite_output_dir=True, manual_seed=seed,
        evaluate_during_training=True, no_save=True)
    model = ClassificationModel(
        mod_type, mod_name, weight=weights,
        use_cuda = True, args=model_args)
    model.train_model(
        train_df=train, eval_df=test, acc=metrics.accuracy_score,
        rec=metrics.recall_score, pre=metrics.precision_score,
        f1=metrics.f1_score)
    training_time = time.time() - s_time

    test['length'] = test.apply(names_length, axis=1)
    test['nrtokens'] = test.apply(names_tokens, axis=1)

    # Initialize result file
    result_file_path = os.path.join(out_path, 'results.csv') # Create the full file path
    with open(result_file_path, 'w') as file:  # open the file using the full path
        file.write(
            'coefficient,min_v1,max_v2,mod_type,mod_name,scenario,test_ratio,'
            'test_name,pred_method,lb,ub,f1,precision,recall,accuracy,mcc,'
            'prediction_time,training_time\n')

    # use simple baseline and model for prediction
    for m in [0, 1]:
        # use entire test set (redundant - for verification)
        test_name = f'{m}-final'
        log_metrics(
            coeff, min_v1, max_v2, mod_type, mod_name, scenario,
            test_ratio, test, test_name, 0, 'inf', m, out_path, training_time)

        # test for data types
        for type1 in ['object', 'float64', 'int64', 'bool']:
            for type2 in ['object', 'float64', 'int64', 'bool']:
                sub_test = test.query(f'type1=="{type1}" and type2=="{type2}"')
                if sub_test.shape[0]:
                    test_name = f'Types{type1}-{type2}'
                    log_metrics(
                        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                        test_ratio, sub_test, test_name, -1, -1, m,
                        args.out_path)

        # test for different subsets
        for q in [(0, 0.25), (0.25, 0.5), (0.5, 1)]:
            qlb = q[0]
            qub = q[1]
            # column name length
            lb = test['length'].quantile(qlb)
            ub = test['length'].quantile(qub)
            sub_test = test[(test['length'] >= lb) & (test['length'] <= ub)]
            test_name = f'L{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, args.out_path)
            # number of tokens in column names
            lb = test['nrtokens'].quantile(qlb)
            ub = test['nrtokens'].quantile(qub)
            sub_test = test[(test['nrtokens'] >= lb) & (test['nrtokens'] <= ub)]
            test_name = f'N{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, args.out_path)

    # Collect predictions and perform Chi-Square test
    all_preds = []
    all_labels = []
    all_sex = []
    for sex in test['sex'].unique():
        sub_test = test[test['sex'] == sex]
        samples = [[r['text_a'], r['text_b']] for _, r in sub_test.iterrows()]
        preds = model.predict(samples)[0]
        all_preds.extend(preds)
        all_labels.extend(sub_test['labels'].tolist())
        all_sex.extend([sex] * len(preds))

    p_value = chi_square_test(all_preds, all_labels, all_sex)
    print(f'Chi-Square test p-value: {p_value}')

    def compute_metrics(p):
      preds = p.predictions.argmax(-1)
      precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
      acc = accuracy_score(p.label_ids, preds)
      return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# Ensure the output and logging directories exist
output_dir = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/results'
logging_dir = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/logs'

os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)
# Example usage in a Jupyter Notebook or Google Colab
args = {
    "src_path": "/content/Data-Analysis-with-LLM/experiment/data/corresult_output_file.csv",
    "coeff": "pearson",
    "min_v1": 0.9,
    "max_v2": 0.05,
    "mod_type": "distilbert",
    "mod_name": "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    "scenario": "defsep",
    "test_ratio": 0.2,
    "use_types": 1,
    "out_path": "/content/drive/My Drive/Colab Notebooks/Liter/correlations/models"
}

run_experiment(**args)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Coefficients: pearson
Minimal value 1: 0.9
Maximal value 2: 0.05
Model type: distilbert
Model name: distilbert/distilbert-base-uncased-finetuned-sst-2-english
Scenario: defsep
Test ratio: 0.2
Nr. samples: 14
Sample from filtered data:
    dataid                 datapath   nrrows  nrvals1  nrvals2    type1  \
5        2  ../data/output_file.csv  3156487  1380233        1  float64   
50       2  ../data/output_file.csv  3156487     1237        1  float64   
35       2  ../data/output_file.csv  3156487       20        1    int64   
56       2  ../data/output_file.csv  3156487     1237      992  float64   
47       2  ../data/output_file.csv  3156487     1237        1  float64   

      type2         column1        column2   method  coefficient  pvalue  \
5     int64  serial float64  datanum int64  pearson          NaN     NaN   
50    int64   perwt float64  datanum int64  pearson          NaN     NaN   
35    int64    pernum int64  datanum int64  pearson          NaN     NaN   
56  float6

0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Running Epoch 2 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Running Epoch 3 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Running Epoch 4 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Running Epoch 5 of 5:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


0it [00:00, ?it/s]

Sample for test 0-final:
Empty DataFrame
Columns: [text_a, text_b, type1, type2, labels, length, nrtokens]
Index: []


  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AttributeError: 'dict' object has no attribute 'out_path'

In [5]:
src_path='/content/Data-Analysis-with-LLM/experiment/data/corresult5.csv'
data = pd.read_csv(src_path, sep=',')
print("Columns in data:", data.columns.tolist())
print("\nFirst few rows:")
print(data.head())

Columns in data: ['2', '../data/output_file.csv', '63', '1', '1.1', 'int64', 'int64.1', 'dataset_number', 'survey_year', 'pearson', 'nan', 'nan.1', '0.00220489501953125']

First few rows:
   2  ../data/output_file.csv  63   1  1.1    int64 int64.1  \
0  2  ../data/output_file.csv  63   1    1    int64   int64   
1  2  ../data/output_file.csv  63   1    1    int64   int64   
2  2  ../data/output_file.csv  63  63    1  float64   int64   
3  2  ../data/output_file.csv  63  63    1  float64   int64   
4  2  ../data/output_file.csv  63  63    1  float64   int64   

            dataset_number  survey_year   pearson  nan  nan.1  \
0           dataset_number  survey_year  spearman  NaN    NaN   
1           dataset_number  survey_year   theilsu  1.0    1.0   
2  household_serial_number  survey_year   pearson  NaN    NaN   
3  household_serial_number  survey_year  spearman  NaN    NaN   
4  household_serial_number  survey_year   theilsu  0.0    1.0   

   0.00220489501953125  
0             0.0

In [None]:
# load data
data = pd.read_csv('/content/Data-Analysis-with-LLM/experiment/data/corresult_output_file.csv', sep = ',')
seed = 42
data = data.sample(frac=1, random_state=seed)
# Print the existing columns to inspect them
print(f"Existing columns: {data.columns}")
# The original DataFrame has 13 columns, but you were trying to assign 14 new column names.
# Removed 'sex' from the list as it is likely not present in the original data.
data.columns = [
    'dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2',
    'type1', 'type2', 'column1', 'column2', 'method',
    'coefficient', 'pvalue', 'time'
]

Existing columns: Index(['2', '../data/output_file.csv', '3156487', '1', '1.1', 'int64',
       'int64.1', 'datanum', 'year', 'pearson', 'nan', 'nan.1',
       '0.022938013076782227'],
      dtype='object')


In [7]:
'''
Correlation Prediction Script
This script predicts correlations between column names using a DistilBERT model and Theil's U values.
The code is organized into several main sections:

1. Data Preprocessing
   - Loading and filtering data
   - Adding column type information
   - Labeling correlations (binary classification)

2. Train/Test Split Functions
   - Default split (def_split)
   - Dataset-aware split (ds_split)

3. Prediction Methods
   - Baseline using Jaccard similarity
   - DistilBERT model predictions

4. Evaluation and Metrics
   - Performance metrics calculation
   - Results logging
   - Wandb integration for experiment tracking

5. Main Pipeline
   - Data loading and preprocessing
   - Model training
   - Evaluation across different data subsets
'''

from multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError:
    pass

import sklearn.metrics as metrics
import pandas as pd
import random as rand
import time

from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from google.colab import drive
drive.mount('/content/drive')
from google.colab import userdata
userdata.get('HF_TOKEN')
import wandb
wandb.init(project="Distilbert_run20") # Initialize wandb run


# ---- Train/Test Split Functions ----

def def_split(data, test_ratio, seed):
    """ Split data into training and test set. """
    print('Data sets in training and test may overlap')

    # Make sure we have the label column before splitting
    if 'label' not in data.columns:
        raise ValueError("'label' column must be created before splitting the data")

    # Select only the columns we need
    features = ['column1', 'column2', 'type1', 'type2']
    available_features = [col for col in features if col in data.columns]

    x_train, x_test, y_train, y_test = train_test_split(
        data[available_features],
        data['label'],
        test_size=test_ratio,
        random_state=seed
    )

    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)
    print(f'train shape: {train.shape}')
    print(f'test shape: {test.shape}')
    return train, test


def ds_split(data, test_ratio):
    """ Split column pairs into training and test samples.

    If only one dataset is present, this function will fall back
    to default train-test split.

    Args:
        data: a pandas dataframe containing all data.
        test_ratio: ratio of test cases after splitting.

    Returns:
        a tuple containing training, then test set.
    """
    # Check if multiple datasets are present
    unique_datasets = data['dataid'].nunique()

    if unique_datasets <= 1:
        print('Only one dataset detected. Falling back to default split.')
        # If only one dataset, use default train-test split
        return def_split(data, test_ratio, seed=42)

    print('Separating training and test sets by data')
    counts = data['dataid'].value_counts()
    nr_vals = len(counts)
    nr_test_ds = int(nr_vals * test_ratio)
    print(f'Nr. test data sets: {nr_test_ds}')

    ds_ids = counts.index.values.tolist()
    test_ds = rand.sample(ds_ids, nr_test_ds)
    print(f'TestDS: {test_ds}')

    def is_test(row):
        return row['dataid'] in test_ds

    data['istest'] = data.apply(is_test, axis=1)
    train = data[data['istest'] == False]
    test = data[data['istest'] == True]

    print(f'train.shape: {train.shape}')
    print(f'test.shape: {test.shape}')

    return train[
        ['column1', 'column2', 'type1', 'type2', 'label']], test[
            ['column1', 'column2', 'type1', 'type2', 'label']]


# ---- Prediction Methods ----

def baseline(col_pairs):
    """ A simple baseline predicting correlation via Jaccard similarity.

    Args:
        col_pairs: list of tuples with column names.

    Returns:
        list of predictions (1 for correlation, 0 for no correlation).
    """
    predictions = []
    for cp in col_pairs:
        c1 = cp[0]
        c2= cp[1]
        s1 = set(c1.split())
        s2 = set(c2.split())
        ns1 = len(s1)
        ns2 = len(s2)
        ni = len(set.intersection(s1, s2))
        # calculate Jaccard coefficient
        jac = ni / (ns1 + ns2 - ni)
        # predict correlation if similar
        if jac > 0.5:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions


# ---- Evaluation and Metrics Functions ----

def log_metrics(
        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
        test_ratio, sub_test, test_name, lb, ub, pred_method,
        out_path, training_time):
    """ Predicts using baseline or model, writes metrics to file.

    Args:
        (multiple arguments for logging performance metrics)
    """
    sub_test.columns = [
        'text_a', 'text_b', 'type1', 'type2', 'labels', 'length', 'nrtokens']
    # print out a sample for later analysis
    print(f'Sample for test {test_name}:')
    sample = sub_test.sample(frac=0.1)
    print(sample)
    # predict correlation via baseline or model
    sub_test = sub_test[['text_a', 'text_b', 'labels']]
    samples = []
    for _, r in sub_test.iterrows():
        samples.append([r['text_a'], r['text_b']])
    s_time = time.time()
    if pred_method == 0:
        preds = baseline(samples)
    else:
        preds = model.predict(samples)[0]
    # log various performance metrics
    t_time = time.time() - s_time
    nr_samples = len(sub_test.index)
    t_per_s = float(t_time) / nr_samples
    f1 = metrics.f1_score(sub_test['labels'], preds)
    pre = metrics.precision_score(sub_test['labels'], preds)
    rec = metrics.recall_score(sub_test['labels'], preds)
    acc = metrics.accuracy_score(sub_test['labels'], preds)
    mcc = metrics.matthews_corrcoef(sub_test['labels'], preds)
    # also log to local file
    with open(out_path, 'a+') as file:
        file.write(f'{coeff},{min_v1},{max_v2},"{mod_type}",' \
                f'"{mod_name}","{scenario}",{test_ratio},' \
                f'"{test_name}",{pred_method},{lb},{ub},' \
                f'{f1},{pre},{rec},{acc},{mcc},{t_per_s},' \
                f'{training_time}\n')
    # Log metrics to W&B
    wandb.log({
        "Coefficient": coeff,
        "F1 Score": f1,
        "Precision": pre,
        "Recall": rec,
        "Accuracy": acc,
        "MCC": mcc,
        "Prediction Time per Sample": t_per_s,
        "Training Time": training_time,
        "Test Name": test_name,
        "Test Ratio": test_ratio
    })

def names_length(row):
    """ Calculate combined length of column names. """
    return len(row['text_a']) + len(row['text_b'])


def names_tokens(row):
    """ Calculates number of tokens (separated by spaces). """
    return row['text_a'].count(' ') + row['text_b'].count(' ')


# ---- Main Pipeline Function ----


def run_correlation_prediction(
    src_path,
    coeff='theilsu',
    min_v1=0.15,
    max_v2=1,
    mod_type='distilbert',
    mod_name='distilbert-base-uncased',
    scenario='defsep',
    test_ratio=0.2,
    use_types=1,
    out_path='results.csv'
):
    # Initialize for deterministic results
    seed = 42
    rand.seed(seed)

    # Load and preprocess data
    print("Loading data...")
    data = pd.read_csv(src_path, sep=',')
    print("Original columns:", data.columns.tolist())

    data = data.sample(frac=1, random_state=seed)

    # Rename columns to match expected format
    data.columns = [
        'dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2',
        'type1', 'type2', 'column1', 'column2', 'method',
        'coefficient', 'pvalue', 'time'
    ]
    print("Renamed columns:", data.columns.tolist())

    # Convert 'nan' strings to actual NaN values
    data['coefficient'] = pd.to_numeric(data['coefficient'], errors='coerce')
    data['pvalue'] = pd.to_numeric(data['pvalue'], errors='coerce')

    # Remove rows with NaN values
    data = data.dropna(subset=['coefficient', 'pvalue'])

    # Filter data for specific correlation method
    data = data[data['method']==coeff]

    # Create label column BEFORE any splitting
    print("Creating labels...")
    data['label'] = data.apply(
        lambda row: 1 if abs(row['coefficient']) >= min_v1 and abs(row['pvalue']) <= max_v2 else 0,
        axis=1
    )

    # Enrich column names if activated
    if use_types:
        print("Enriching column names with types...")
        data['column1'] = data['column1'] + ' ' + data['type1']
        data['column2'] = data['column2'] + ' ' + data['type2']

    nr_total = len(data.index)
    print(f'Nr. samples: {nr_total}')
    print('Sample from processed data:')
    print(data.head())

    # Add debug prints to check data state
    print("\nColumns after preprocessing:", data.columns.tolist())
    print("Number of samples with label 1:", sum(data['label'] == 1))
    print("Number of samples with label 0:", sum(data['label'] == 0))

    # Perform train/test split
    if scenario == 'defsep':
        train, test = def_split(data, test_ratio, seed)
    elif scenario == 'datasep':
        train, test = ds_split(data, test_ratio)
    else:
        raise ValueError(f'Undefined scenario: {scenario}')


    # ---- Train/Test Split ----
    if scenario == 'defsep':
        train, test = def_split(data, test_ratio, seed)
    elif scenario == 'datasep':
        train, test = ds_split(data, test_ratio)
    else:
        raise ValueError(f'Undefined scenario: {scenario}')

    train.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    test.columns = ['text_a', 'text_b', 'type1', 'type2', 'labels']
    print(train.head())
    print(test.head())

    # ---- Model Training ----
    # Prepare loss scaling
    lab_counts = train['labels'].value_counts()
    nr_zeros = lab_counts.loc[0]
    nr_ones = lab_counts.loc[1]
    nr_all = float(len(train.index))
    weights = [nr_all/nr_zeros, nr_all/nr_ones]

    # Train classification model
    s_time = time.time()
    model_args = ClassificationArgs(
        num_train_epochs=5, train_batch_size=100, eval_batch_size=100,
        overwrite_output_dir=True, manual_seed=seed,
        evaluate_during_training=True, no_save=False,
        weight_decay=0.01,  # L2 regularization
        early_stopping_patience=3, # Early stopping
        )
    global model  # Make model global so it can be accessed in log_metrics
    model = ClassificationModel(
        mod_type, mod_name, weight=weights,
        use_cuda=True, args=model_args)
    model.train_model(
        train_df=train, eval_df=test, acc=metrics.accuracy_score,
        rec=metrics.recall_score, pre=metrics.precision_score,
        f1=metrics.f1_score)
    training_time = time.time() - s_time

    # ---- Evaluation ----
    test['length'] = test.apply(names_length, axis=1)
    test['nrtokens'] = test.apply(names_tokens, axis=1)

    # Initialize result file
    with open(out_path, 'w') as file:
        file.write(
            'coefficient,min_v1,max_v2,mod_type,mod_name,scenario,test_ratio,'
            'test_name,pred_method,lb,ub,f1,precision,recall,accuracy,mcc,'
            'prediction_time,training_time\n')

    # Use simple baseline and model for prediction
    for m in [0, 1]:
        # Use entire test set
        test_name = f'{m}-final'
        log_metrics(
            coeff, min_v1, max_v2, mod_type, mod_name, scenario,
            test_ratio, test, test_name, 0, 'inf', m, out_path, training_time)

        # Test for data types
        for type1 in ['object', 'float64', 'int64', 'bool']:
            for type2 in ['object', 'float64', 'int64', 'bool']:
                sub_test = test.query(f'type1=="{type1}" and type2=="{type2}"')
                if sub_test.shape[0]:
                    test_name = f'Types{type1}-{type2}'
                    log_metrics(
                        coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                        test_ratio, sub_test, test_name, -1, -1, m,
                        out_path, training_time)

        # Test for different subsets
        for q in [(0, 0.25), (0.25, 0.5), (0.5, 1)]:
            qlb = q[0]
            qub = q[1]
            # Column name length
            lb = test['length'].quantile(qlb)
            ub = test['length'].quantile(qub)
            sub_test = test[(test['length'] >= lb) & (test['length'] <= ub)]
            test_name = f'L{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, out_path, training_time)

            # Number of tokens in column names
            lb = test['nrtokens'].quantile(qlb)
            ub = test['nrtokens'].quantile(qub)
            sub_test = test[(test['nrtokens'] >= lb) & (test['nrtokens'] <= ub)]
            test_name = f'N{m}-{qlb}-{qub}'
            log_metrics(
                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                test_ratio, sub_test, test_name, lb, ub, m, out_path, training_time)
        # Test for demographic factors
        for col in ['sex', 'age', 'year_of_birth', 'race', 'detailed_race',
                    'hispanic_origin', 'detailed_hispanic_origin', 'birthplace',
                    'detailed_birthplace', 'first_ancestry', 'detailed_first_ancestry',
                    'citizenship_status', 'year_of_naturalization', 'year_of_immigration',
                    'years_in_usa', 'english_proficiency', 'american_indian_race',
                    'asian_race', 'black_race', 'pacific_islander_race', 'white_race',
                    'other_race']:
            if col in test.columns:
                # Get unique values in the demographic column
                unique_values = test[col].unique()

                for value in unique_values:
                    sub_test = test[test[col] == value]
                    if sub_test.shape[0]:
                        # Calculate confusion matrix metrics
                        subset_metrics = {
                            'tp': sum((sub_test['label'] == 1) & (sub_test['prediction'] == 1)),
                            'fp': sum((sub_test['label'] == 0) & (sub_test['prediction'] == 1)),
                            'fn': sum((sub_test['label'] == 1) & (sub_test['prediction'] == 0)),
                            'tn': sum((sub_test['label'] == 0) & (sub_test['prediction'] == 0))
                        }

                        # Check if difference is significant
                        is_significant = calculate_significance(subset_metrics, overall_metrics)

                        test_name = f'Demo-{col}-{value}'
                        # Log metrics with additional information about significance
                        log_metrics(
                            coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                            test_ratio, sub_test, test_name, -1, -1, m,
                            out_path, training_time,
                            extra_info={
                                'demographic_factor': col,
                                'value': value,
                                'is_significant': is_significant,
                                'confusion_matrix': subset_metrics
                            })

        # Function to calculate statistical significance
        def calculate_significance(subset_metrics, overall_metrics):
            from scipy import stats
            # Perform chi-square test on confusion matrices
            chi2, p_value = stats.chi2_contingency([
                [subset_metrics['tp'], subset_metrics['fp']],
                [subset_metrics['fn'], subset_metrics['tn']]
            ])[0:2]
            return p_value < 0.05  # Returns True if difference is significant

        # Get overall metrics for comparison
        overall_metrics = {
            'tp': sum((test['label'] == 1) & (test['prediction'] == 1)),
            'fp': sum((test['label'] == 0) & (test['prediction'] == 1)),
            'fn': sum((test['label'] == 1) & (test['prediction'] == 0)),
            'tn': sum((test['label'] == 0) & (test['prediction'] == 0))
        }

        for demo_category, columns in demographic_mapping.items():
            for col in columns:
                if col in test.columns:
                    # Get unique values in the demographic column
                    unique_values = test[col].unique()

                    for value in unique_values:
                        sub_test = test[test[col] == value]
                        if sub_test.shape[0]:
                            # Calculate confusion matrix metrics
                            subset_metrics = {
                                'tp': sum((sub_test['label'] == 1) & (sub_test['prediction'] == 1)),
                                'fp': sum((sub_test['label'] == 0) & (sub_test['prediction'] == 1)),
                                'fn': sum((sub_test['label'] == 1) & (sub_test['prediction'] == 0)),
                                'tn': sum((sub_test['label'] == 0) & (sub_test['prediction'] == 0))
                            }

                            # Check if difference is significant
                            is_significant = calculate_significance(subset_metrics, overall_metrics)

                            test_name = f'Demo-{col}-{value}'
                            # Log metrics with additional information about significance
                            log_metrics(
                                coeff, min_v1, max_v2, mod_type, mod_name, scenario,
                                test_ratio, sub_test, test_name, -1, -1, m,
                                out_path, training_time,
                                extra_info={
                                    'demographic_factor': col,
                                    'value': value,
                                    'is_significant': is_significant,
                                    'confusion_matrix': subset_metrics
                                }
                            )


# Example usage
if __name__ == '__main__':
    # Modify these parameters as needed
    run_correlation_prediction(
        src_path='/content/Data-Analysis-with-LLM/experiment/data/corresult5.csv',
        coeff='theilsu',
        min_v1=0.15,
        max_v2=1,
        mod_type='distilbert',
        mod_name='distilbert-base-uncased',
        scenario='defsep',
        test_ratio=0.2,
        use_types=1,
        out_path='/content/drive/My Drive/Colab Notebooks/Liter/correlations/results/results20.csv'
    )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading data...
Original columns: ['2', '../data/output_file.csv', '63', '1', '1.1', 'int64', 'int64.1', 'dataset_number', 'survey_year', 'pearson', 'nan', 'nan.1', '0.00220489501953125']
Renamed columns: ['dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2', 'type1', 'type2', 'column1', 'column2', 'method', 'coefficient', 'pvalue', 'time']
Creating labels...
Enriching column names with types...
Nr. samples: 2556
Sample from processed data:
      dataid                 datapath  nrrows  nrvals1  nrvals2   type1  \
1988       2  ../data/output_file.csv      63        6       21  object   
1216       2  ../data/output_file.csv      63        2        2  object   
1385       2  ../data/output_file.csv      63       38       22  object   
1334       2  ../data/output_file.csv      63       21        2  object   
175        2  ../data/output_file.csv      63      

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 5:   0%|          | 0/21 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Sample for test 0-final:
                                     text_a  \
1075              detailed_education object   
1450               employment_status object   
436              detailed_birthplace object   
2389                      wage_income int64   
2584           education_score_1990 float64   
1761  occupation_2010_classification object   
2037                  soc_occupation object   
1059              detailed_education object   
2233              usual_hours_worked object   
433              detailed_birthplace object   
483                   first_ancestry object   
1027                       education object   
1495      detailed_employment_status object   
2702         detailed_veteran_status object   
1979        detailed_class_of_worker object   
1788                         industry int64   
315                             race object   
1617                       occupation int64   
666                      years_in_usa int64   
1304           detailed_degree_fiel

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sample for test Typesfloat64-object:
                            text_a                       text_b    type1  \
2477       parent_presence float64   detailed_birthplace object  float64   
2599  education_score_1990 float64     employment_status object  float64   
2583  education_score_1990 float64  american_indian_race object  float64   

       type2  labels  length  nrtokens  
2477  object       1      49         2  
2599  object       0      52         2  
2583  object       0      55         2  
Sample for test Typesfloat64-int64:
                            text_a                  text_b    type1  type2  \
2564  education_score_1990 float64  year_of_marriage int64  float64  int64   

      labels  length  nrtokens  
2564       1      50         2  
Sample for test Typesint64-object:
                      text_a                          text_b  type1   type2  \
194      year_of_birth int64         us_census_region object  int64  object   
210      year_of_birth int64              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sample for test L0-0-0.25:
                              text_a                     text_b   type1  \
1924          class_of_worker object                race object  object   
1917          class_of_worker object                 age object  object   
660               years_in_usa int64                 age object   int64   
2108           naics_industry object       detailed_race object  object   
2387               wage_income int64      first_ancestry object   int64   
414                birthplace object      person_weight float64  object   
1280    detailed_degree_field object                 sex object  object   
1784                  industry int64  number_of_children object   int64   
2118           naics_industry object         years_in_usa int64  object   
513   detailed_first_ancestry object                race object  object   
1455        employment_status object          white_race object  object   
2168             weeks_worked object     year_of_marriage int64  object  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Sample for test N0-0.25-0.5:
                                     text_a  \
1668  occupation_1990_classification object   
2654                  veteran_status object   
2295                worked_last_year object   
315                             race object   
1658  occupation_1990_classification object   
1215                     school_type object   
1437               employment_status object   
1724  occupation_2010_classification object   
1904                 class_of_worker object   
2564           education_score_1990 float64   
1617                       occupation int64   
2108                  naics_industry object   
2403                      wage_income int64   
2397                      wage_income int64   
194                     year_of_birth int64   
1121                  grade_attended object   
1710  occupation_2010_classification object   
1295           detailed_degree_field object   
379                  hispanic_origin object   
561           year_of_naturaliz

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyError: 'label'