## 2-level classification dataset creation


In [90]:
import pandas as pd

# Load your dataset
df = pd.read_csv('./df_review_orig.csv')

# Rename columns for ease of use, based on exact names observed in the DataFrame
df.rename(columns={
    'Sentence':'sentence',
    'Label':'label',
}, inplace=True)

essential_columns = [
    'sentence', 'label'                            
]

# Filter the DataFrame to include only rows with non-NA values in these columns
haber_data = df.dropna(subset=essential_columns)

# Select only the essential columns and drop rows with any NaN values in these columns
haber_data = haber_data[essential_columns].dropna()

# Map the labels to their corresponding categories
label_mapping = {
    "None: The linking sentence does not imply in any way a causal relationship was identified.": "0",
    "Weak: The linking sentence might imply a causal relationship was identified, but it is unclear or possible to come to that conclusion in the absence of any causal inference.": "1",
    "Moderate: The linking sentence mostly implies a causal relationship was identified, but it is unclear or possible to come to that conclusion in the absence of any causal inference.": "2",
    "Strong: The linking sentence clearly implies that causality had been identified.": "3"
}

# Apply the label mapping
haber_data['label'] = haber_data['label'].map(label_mapping)

# Save the cleaned and filtered DataFrame
haber_data.to_csv('haber_data.csv', index=False)



In [141]:
import pandas as pd
from IPython.display import display

# Load the datasets
haber_data = pd.read_csv('./haber_data.csv')
press_release_data = pd.read_excel('./Press_release_data.xlsx')
pubmed_data = pd.read_csv('./Pubmed data.csv')
ssc_data=pd.read_excel('./labeled_ssc_data.xlsx')

# Add a source column to identify where each entry came from
haber_data['source'] = 'haber'
press_release_data['source'] = 'press_release'
pubmed_data['source'] = 'pubmed'
ssc_data['source'] = 'ssc'

# Combine all datasets into one DataFrame
combined_data = pd.concat([haber_data, press_release_data, pubmed_data, ssc_data], ignore_index=True)
combined_data = combined_data.drop_duplicates()

# Shuffle the combined dataset
combined_data_shuffled = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the shuffled combined dataset to a new CSV file in the current directory
output_file_path = './full dataset 4 classes.csv'
combined_data_shuffled.to_csv(output_file_path, index=False)

# Compute the label distribution per source and per label
label_distribution_per_source = combined_data_shuffled.groupby(['source', 'label']).size().unstack(fill_value=0)

# Print the label distribution per source
print("\nLabel distribution per source:")
display(label_distribution_per_source)

# Optionally save the label distribution to a CSV file
label_distribution_file_path = './label_distribution_per_source.csv'
label_distribution_per_source.to_csv(label_distribution_file_path)




Label distribution per source:


label,0,1,2,3
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
haber,592,1045,1059,706
press_release,484,738,284,567
pubmed,1353,494,213,995
ssc,4,56,168,197


In [142]:
combined_data_shuffled

Unnamed: 0,sentence,label,source
0,Impact of bariatric surgery on health depends ...,3,press_release
1,Three quarters of pregnant women take sick lea...,3,press_release
2,surgeon specialization accounted for 9% (coron...,2,haber
3,""" We confirmed the robust association of MTNR1...",3,pubmed
4,"In addition, higher order interaction analyses...",0,pubmed
...,...,...,...
8950,"To our knowledge, this is the first study to i...",0,pubmed
8951,Short intervals between pregnancies result in ...,3,press_release
8952,Beneficial effects of surgery for epilepsy are...,3,press_release
8953,Longitudinal evidence of telomere length track...,2,haber


In [157]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Drop all sentences with label 0
combined_data_shuffled = combined_data_shuffled[
    combined_data_shuffled['label'].isin([1, 2, 3])
]

# 2) Split the original dataset into training (70%) and remaining (30%)
train_df, remaining_df = train_test_split(
    combined_data_shuffled,
    test_size=0.3,
    random_state=42,
    stratify=combined_data_shuffled['label']
)

# 3) Split the remaining 30% into validation (10%) and test (20%)
val_df, test_df = train_test_split(
    remaining_df,
    test_size=2/3,  # so overall test = 20%, val = 10%
    random_state=42,
    stratify=remaining_df['label']
)

# Capture duplicates before dropping
train_duplicates = train_df[train_df.duplicated(keep='first')]
val_duplicates = val_df[val_df.duplicated(keep='first')]
test_duplicates = test_df[test_df.duplicated(keep='first')]

# 4) Drop duplicates
train_df = train_df.drop_duplicates()
val_df   = val_df.drop_duplicates()
test_df  = test_df.drop_duplicates()

# ----------------------------------------------------------------------
# Function: pure undersampling of non-SSC to achieve perfect label balance.
#
#  - Keep all SSC rows for each label (no removal).
#  - For each label (1,2,3):
#    * Let total_label = (ssc_label_count + non_ssc_label_count).
#    * We'll find target = min(total_label(1), total_label(2), total_label(3)).
#    * For each label, we reduce non-SSC so that the final label count = target.
#  - If ssc_label_count for a label is already larger than 'target',
#    pure undersampling won't fix it. That situation is impossible to balance
#    without dropping SSC or oversampling other labels.
# ----------------------------------------------------------------------
def undersample_non_ssc_for_perfect_balance(df):
    # Separate SSC from non-SSC
    ssc_df     = df[df['source'] == 'ssc']
    non_ssc_df = df[df['source'] != 'ssc']
    
    # Count how many SSC rows for each label
    ssc_label_counts = ssc_df['label'].value_counts().reindex([1,2,3], fill_value=0)
    # Count how many non-SSC rows for each label
    non_ssc_label_counts = non_ssc_df['label'].value_counts().reindex([1,2,3], fill_value=0)
    
    # Current total per label = (SSC count + non-SSC count)
    current_totals = ssc_label_counts + non_ssc_label_counts
    
    # We'll set the final "target" to be the *smallest* of these 3 totals
    # so all labels 1,2,3 end up with the same final count
    target = current_totals.min()
    
    # If for any label the ssc_label_counts[label] > target,
    # it's impossible to fix with only undersampling non-SSC,
    # since we won't remove or reduce SSC. We'll just raise an error or warning.
    for label_val in [1, 2, 3]:
        if ssc_label_counts[label_val] > target:
            # Can't fix that with pure undersampling, because that
            # label already has more SSC than 'target'.
            print(f"WARNING: Label {label_val} in SSC alone "
                  f"exceeds target {target}. "
                  f"Cannot achieve perfect balance without "
                  f"dropping some SSC or oversampling others.")
            # We'll return df unchanged or you might choose to raise an exception
            return df
    
    # Now we can safely undersample non-SSC for each label to meet 'target'
    balanced_non_ssc_parts = []
    
    for label_val in [1, 2, 3]:
        # Subset of non-SSC for this label
        label_subset = non_ssc_df[non_ssc_df['label'] == label_val]
        
        # How many from SSC do we have?
        ssc_count_for_label = ssc_label_counts[label_val]
        
        # So we want: final label count = target
        # That means non-SSC portion must be: target - ssc_count_for_label
        desired_non_ssc = target - ssc_count_for_label
        
        if len(label_subset) > desired_non_ssc:
            # Undersample
            label_subset = label_subset.sample(n=desired_non_ssc, random_state=42)
        
        # If label_subset is already <= desired_non_ssc, we keep it as is
        # (no need to oversample).
        
        balanced_non_ssc_parts.append(label_subset)
    
    # Combine undersampled (or as-is) non-SSC from all labels
    balanced_non_ssc_df = pd.concat(balanced_non_ssc_parts, ignore_index=True)
    
    # Finally, combine with all SSC rows
    final_df = pd.concat([ssc_df, balanced_non_ssc_df], ignore_index=True)
    
    return final_df


# ---------------------------------------------------------
# 5) Balance each split by pure undersampling non-SSC
# ---------------------------------------------------------
balanced_train_df = undersample_non_ssc_for_perfect_balance(train_df)
balanced_val_df   = undersample_non_ssc_for_perfect_balance(val_df)
balanced_test_df  = undersample_non_ssc_for_perfect_balance(test_df)

# ---------------------------------------------------------
# 7) Create the SSC Test Set (from test set only)
# ---------------------------------------------------------
# Extract SSC data from the balanced test set
ssc_test_df = balanced_test_df[balanced_test_df['source'] == 'ssc']

# ---------------------------------------------------------
# 6) Calculate label distributions for train, val, test
# ---------------------------------------------------------
train_label_dist = balanced_train_df.groupby(['source','label']).size().unstack(fill_value=0)
val_label_dist   = balanced_val_df.groupby(['source','label']).size().unstack(fill_value=0)
test_label_dist  = balanced_test_df.groupby(['source','label']).size().unstack(fill_value=0)
test_ssc_label_dist  = ssc_test_df.groupby(['source','label']).size().unstack(fill_value=0)


train_label_dist.loc['All'] = train_label_dist.sum()
val_label_dist.loc['All']   = val_label_dist.sum()
test_label_dist.loc['All']  = test_label_dist.sum()


print("=== Training ===")
print(train_label_dist)
print("\n=== Validation ===")
print(val_label_dist)
print("\n=== Test ===")
print(test_label_dist)
print("\n=== SSC Test ===")
print(test_ssc_label_dist)

print(f"\nTrain size: {len(balanced_train_df)}")
print(f"Val size:   {len(balanced_val_df)}")
print(f"Test size:  {len(balanced_test_df)}")
print(f"SSC Test size:  {len(ssc_test_df)}")



# ---------------------------------------------------------
# 8) Save the Balanced Splits to CSV
# ---------------------------------------------------------
balanced_train_df.to_csv('./3_balanced_trainv2.csv', index=False)
balanced_val_df.to_csv('./3_balanced_valv2.csv', index=False)
balanced_test_df.to_csv('./3_balanced_testv2.csv', index=False)
# Save the SSC test set to a CSV file
ssc_test_df.to_csv('./3_ssc_testv2.csv', index=False)

# ---------------------------------------------------------
# 9) Check final overall distribution if needed
# ---------------------------------------------------------
combined_balanced_df = pd.concat([balanced_train_df, balanced_val_df, balanced_test_df], ignore_index=True)
combined_label_dist  = combined_balanced_df.groupby(['source','label']).size().unstack(fill_value=0)
combined_label_dist.loc['All'] = combined_label_dist.sum()
print("\n=== Combined ===")
print(combined_label_dist)
print(f"\nCombined size: {len(combined_balanced_df)}")


=== Training ===
label             1     2     3
source                         
haber           539   753   345
press_release   353   184   260
pubmed          273   153   465
ssc              42   117   137
All            1207  1207  1207

=== Validation ===
label            1    2    3
source                      
haber           68  112   48
press_release   62   31   42
pubmed          39    9   66
ssc              3   20   16
All            172  172  172

=== Test ===
label            1    2    3
source                      
haber          153  194   89
press_release  116   69   80
pubmed          65   51  132
ssc             11   31   44
All            345  345  345

=== SSC Test ===
label    1   2   3
source            
ssc     11  31  44

Train size: 3621
Val size:   516
Test size:  1035
SSC Test size:  86

=== Combined ===
label             1     2     3
source                         
haber           760  1059   482
press_release   531   284   382
pubmed          377   213   

In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Drop all sentences with label 0
combined_data_shuffled = combined_data_shuffled[
    combined_data_shuffled['label'].isin([1, 2, 3])
]

# 2) Split the original dataset into training (70%) and remaining (30%)
train_df, remaining_df = train_test_split(
    combined_data_shuffled,
    test_size=0.3,
    random_state=42,
    stratify=combined_data_shuffled['label']
)

# 3) Split the remaining 30% into validation (10%) and test (20%)
val_df, test_df = train_test_split(
    remaining_df,
    test_size=2/3,  # so overall test = 20%, val = 10%
    random_state=42,
    stratify=remaining_df['label']
)

# Count duplicates per 'sentence' before dropping them
train_duplicates = train_df[train_df.duplicated(subset='sentence', keep=False)]
val_duplicates = val_df[val_df.duplicated(subset='sentence', keep=False)]
test_duplicates = test_df[test_df.duplicated(subset='sentence', keep=False)]

# 4) Count the number of duplicates per source
train_duplicates_count = train_duplicates.groupby('source')['sentence'].count()
val_duplicates_count = val_duplicates.groupby('source')['sentence'].count()
test_duplicates_count = test_duplicates.groupby('source')['sentence'].count()

# Drop duplicates
train_df = train_df.drop_duplicates(subset='sentence')
val_df   = val_df.drop_duplicates(subset='sentence')
test_df  = test_df.drop_duplicates(subset='sentence')

# ----------------------------------------------------------------------
# Function: pure undersampling of non-SSC to achieve perfect label balance.
# ----------------------------------------------------------------------
def undersample_non_ssc_for_perfect_balance(df):
    ssc_df     = df[df['source'] == 'ssc']
    non_ssc_df = df[df['source'] != 'ssc']
    
    ssc_label_counts = ssc_df['label'].value_counts().reindex([1,2,3], fill_value=0)
    non_ssc_label_counts = non_ssc_df['label'].value_counts().reindex([1,2,3], fill_value=0)
    
    current_totals = ssc_label_counts + non_ssc_label_counts
    target = current_totals.min()
    
    for label_val in [1, 2, 3]:
        if ssc_label_counts[label_val] > target:
            print(f"WARNING: Label {label_val} in SSC alone "
                  f"exceeds target {target}. "
                  f"Cannot achieve perfect balance without "
                  f"dropping some SSC or oversampling others.")
            return df
    
    balanced_non_ssc_parts = []
    
    for label_val in [1, 2, 3]:
        label_subset = non_ssc_df[non_ssc_df['label'] == label_val]
        ssc_count_for_label = ssc_label_counts[label_val]
        desired_non_ssc = target - ssc_count_for_label
        
        if len(label_subset) > desired_non_ssc:
            label_subset = label_subset.sample(n=desired_non_ssc, random_state=42)
        
        balanced_non_ssc_parts.append(label_subset)
    
    balanced_non_ssc_df = pd.concat(balanced_non_ssc_parts, ignore_index=True)
    final_df = pd.concat([ssc_df, balanced_non_ssc_df], ignore_index=True)
    
    return final_df

# ---------------------------------------------------------
# 5) Balance each split by pure undersampling non-SSC
# ---------------------------------------------------------
balanced_train_df = undersample_non_ssc_for_perfect_balance(train_df)
balanced_val_df   = undersample_non_ssc_for_perfect_balance(val_df)
balanced_test_df  = undersample_non_ssc_for_perfect_balance(test_df)

# ---------------------------------------------------------
# 7) Create the SSC Test Set (from test set only)
# ---------------------------------------------------------
ssc_test_df = balanced_test_df[balanced_test_df['source'] == 'ssc']

# ---------------------------------------------------------
# 6) Calculate label distributions for train, val, test
# ---------------------------------------------------------
train_label_dist = balanced_train_df.groupby(['source','label']).size().unstack(fill_value=0)
val_label_dist   = balanced_val_df.groupby(['source','label']).size().unstack(fill_value=0)
test_label_dist  = balanced_test_df.groupby(['source','label']).size().unstack(fill_value=0)
test_ssc_label_dist  = ssc_test_df.groupby(['source','label']).size().unstack(fill_value=0)

train_label_dist.loc['All'] = train_label_dist.sum()
val_label_dist.loc['All']   = val_label_dist.sum()
test_label_dist.loc['All']  = test_label_dist.sum()

# Print duplicates count per source
print("=== Duplicates Count per Source ===")
print(f"\nTrain Set Duplicates per Source:")
print(train_duplicates_count)

print(f"\nValidation Set Duplicates per Source:")
print(val_duplicates_count)

print(f"\nTest Set Duplicates per Source:")
print(test_duplicates_count)

# Print duplicate sentences that were removed
print("\n=== Duplicate Sentences Removed ===")
print("\nTrain Duplicates Sentences:")
print(train_duplicates[['source', 'sentence']])

print("\nValidation Duplicates Sentences:")
print(val_duplicates[['source', 'sentence']])

print("\nTest Duplicates Sentences:")
print(test_duplicates[['source', 'sentence']])

print("\n=== Training ===")
print(train_label_dist)
print("\n=== Validation ===")
print(val_label_dist)
print("\n=== Test ===")
print(test_label_dist)
print("\n=== SSC Test ===")
print(test_ssc_label_dist)

print(f"\nTrain size: {len(balanced_train_df)}")
print(f"Val size:   {len(balanced_val_df)}")
print(f"Test size:  {len(balanced_test_df)}")
print(f"SSC Test size:  {len(ssc_test_df)}")

# ---------------------------------------------------------
# 8) Save the Balanced Splits to CSV
# ---------------------------------------------------------
balanced_train_df.to_csv('./3_balanced_trainv2.csv', index=False)
balanced_val_df.to_csv('./3_balanced_valv2.csv', index=False)
balanced_test_df.to_csv('./3_balanced_testv2.csv', index=False)
ssc_test_df.to_csv('./3_ssc_testv2.csv', index=False)

# ---------------------------------------------------------
# 9) Check final overall distribution if needed
# ---------------------------------------------------------
combined_balanced_df = pd.concat([balanced_train_df, balanced_val_df, balanced_test_df], ignore_index=True)
combined_label_dist  = combined_balanced_df.groupby(['source','label']).size().unstack(fill_value=0)
combined_label_dist.loc['All'] = combined_label_dist.sum()
print("\n=== Combined ===")
print(combined_label_dist)
print(f"\nCombined size: {len(combined_balanced_df)}")


=== Duplicates Count per Source ===

Train Set Duplicates per Source:
source
haber     326
pubmed      1
ssc        14
Name: sentence, dtype: int64

Validation Set Duplicates per Source:
source
haber    14
Name: sentence, dtype: int64

Test Set Duplicates per Source:
source
haber    20
ssc       2
Name: sentence, dtype: int64

=== Duplicate Sentences Removed ===

Train Duplicates Sentences:
     source                                           sentence
819   haber  Nonalcoholic fatty liver disease was independe...
1120  haber  These results provide clear evidence for a Dep...
6642  haber  Liraglutide added to high-dose insulin therapy...
5710  haber  Chronic inflammation, as ascertained by repeat...
7643  haber  Rates of diagnosis and treatment of ADHD are h...
...     ...                                                ...
8734  haber  Only DASI scores were associated with predicti...
1281  haber  The risk of non-fatal venous thromboembolism a...
8641  haber  First, second and third tr

## 2-level classification dataset creation


In [139]:
import os
import pandas as pd

# Define dataset path
data_path = './'

# List of social science dataset files
social_science_files = ['ssc_train.csv', 'ssc_val.csv', 'ssc_test (1).csv']

# Store cleaned DataFrames
ssc_dataframes = []

for file in social_science_files:
    file_path = os.path.join(data_path, file)

    # Ensure file exists
    if not os.path.exists(file_path):
        print(f"Warning: {file} not found. Skipping...")
        continue

    # Load dataset
    df = pd.read_csv(file_path)

    # Print existing columns for debugging
    print(f"Columns in {file}: {df.columns.tolist()}")

    # Check if expected columns exist, else print available ones
    if 'text' in df.columns and 'label' in df.columns:
        df = df[['text', 'label']].copy()  # Keep only necessary columns
        df.rename(columns={'text': 'sentence'}, inplace=True)
    elif 'sentence' in df.columns and 'label' in df.columns:
        df = df[['sentence', 'label']].copy()  # Keep only necessary columns
    else:
        print(f"Skipping {file}: Required columns ('text' or 'sentence', 'label') not found.")
        continue  # Skip this file if it doesn't have the required columns

    # Convert labels to integer
    df['label'] = df['label'].astype(int)

    # Append cleaned DataFrame
    ssc_dataframes.append(df)

# Combine all social science datasets
if ssc_dataframes:
    combined_ssc_df = pd.concat(ssc_dataframes, ignore_index=True)

    # Save the final combined dataset
    output_file = os.path.join(data_path, 'ssc_data_all.csv')
    combined_ssc_df.to_csv(output_file, index=False)
    print(f"\nCombined dataset saved as: {output_file}")

    # Print total label distribution
    ssc_total_label_distribution = combined_ssc_df['label'].value_counts()
    print("\nTotal label distribution for the combined social science dataset:")
    print(ssc_total_label_distribution)
else:
    print("No valid social science datasets found.")


Columns in ssc_train.csv: ['text', 'label']
Columns in ssc_val.csv: ['text', 'label']
Columns in ssc_test (1).csv: ['text', 'label']

Combined dataset saved as: ./ssc_data_all.csv

Total label distribution for the combined social science dataset:
label
1    529
0    529
Name: count, dtype: int64


In [144]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Define dataset path
data_path = './'

# List of dataset files with their sources
file_names = {
    'altlex': ['altlex_train.csv', 'altlex_test.csv'],
    'because': ['because_train.csv', 'because_test.csv'],
    'ctb': ['ctb_train.csv', 'ctb_test.csv'],
    'esl2': ['esl2_train.csv', 'esl2_test.csv'],
    'semeval': ['semeval2010t8_train.csv', 'semeval2010t8_test.csv']
}

# Initialize dictionaries to store dataframes and logs
dataframes = {}
duplicates_removed = {}
nans_removed = {}

# Function to check and rename necessary columns
def check_and_rename_columns(df, file):
    if 'text' in df.columns:
        df.rename(columns={'text': 'sentence'}, inplace=True)
    elif 'sentence' not in df.columns:
        raise KeyError(f"Expected column 'sentence' missing in file: {file}")

    if 'seq_label' in df.columns:
        df.rename(columns={'seq_label': 'label'}, inplace=True)
    elif 'category' in df.columns:
        df.rename(columns={'category': 'label'}, inplace=True)

    if 'label' not in df.columns:
        raise KeyError(f"Missing expected column 'label' in file: {file}. Available columns: {df.columns.tolist()}")

    return df

# Function to ensure labels are only 0 or 1
def fix_labels(df, file):
    if df['label'].dtype != int:
        df['label'] = df['label'].astype(int)  

    if df['label'].max() > 1:
        print(f"Fixing labels in {file}: converting labels 1, 2, 3 → 1 and keeping label 0.")
        df['label'] = df['label'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

    return df

# Function to remove NaNs
def remove_nans(df, file):
    nan_rows = df[df[['sentence', 'label']].isnull().any(axis=1)]
    if not nan_rows.empty:
        print(f"NaN values found in {file}. Dropping {len(nan_rows)} rows.")
        nans_removed[file] = len(nan_rows)
        df = df.dropna(subset=['sentence', 'label'])
    return df

# Function to remove duplicates
def remove_duplicates(df, source):
    duplicate_rows = df[df.duplicated(subset=['sentence', 'label'], keep='first')]
    if not duplicate_rows.empty:
        print(f"Found {len(duplicate_rows)} duplicate sentences in {source}. Removing them.")
        duplicates_removed[source] = len(duplicate_rows)
        df = df.drop_duplicates(subset=['sentence', 'label'])
    return df

# Process each dataset
for corpus, files in file_names.items():
    corpus_dataframes = []
    
    for file in files:
        file_path = os.path.join(data_path, file)
        if not os.path.exists(file_path):
            print(f"Warning: File {file} not found. Skipping...")
            continue  

        df = pd.read_csv(file_path)
        df = check_and_rename_columns(df, file)
        df = fix_labels(df, file)
        df = remove_nans(df, file)
        df['source'] = corpus  
        df = remove_duplicates(df, corpus)
        corpus_dataframes.append(df)

    if corpus_dataframes:
        combined_corpus_df = pd.concat(corpus_dataframes, ignore_index=True)
        combined_corpus_df = combined_corpus_df[combined_corpus_df['label'].isin([0, 1])]
        dataframes[corpus] = combined_corpus_df
    else:
        print(f"Warning: No valid data found for {corpus}. Skipping...")

# Process press release dataset (Excel file)
press_release_data_path = './press_release_Data.xlsx'
if os.path.exists(press_release_data_path):
    press_release_df = pd.read_excel(press_release_data_path)
    press_release_df = check_and_rename_columns(press_release_df, "press_release")
    press_release_df = fix_labels(press_release_df, "press_release")
    press_release_df = remove_nans(press_release_df, "press_release")
    press_release_df = remove_duplicates(press_release_df, "press_release")
    press_release_df['source'] = 'Press_Release'
else:
    print("Warning: Press release dataset not found. Skipping...")
    press_release_df = None

# Load additional datasets
additional_datasets = ["Pubmed data.csv", "haber_data.csv", "ssc_data_all.csv"]
extra_dataframes = []

for dataset in additional_datasets:
    file_path = os.path.join(data_path, dataset)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        df = check_and_rename_columns(df, dataset)
        df = fix_labels(df, dataset)
        df = remove_nans(df, dataset)
        df = remove_duplicates(df, dataset)
        df['source'] = dataset.split("_")[0].capitalize()
        extra_dataframes.append(df)
    else:
        print(f"Warning: {dataset} not found. Skipping...")

# Combine all datasets
final_combined_df = pd.concat(
    [*dataframes.values(), *extra_dataframes, press_release_df] if press_release_df is not None else [*dataframes.values(), *extra_dataframes], 
    ignore_index=True
)

# Print pre-split statistics
print("\n### Dataset BEFORE Splitting ###")
print(final_combined_df.groupby(["source", "label"]).size().unstack().fillna(0))
print("\nTotal label distribution before splitting:")
print(final_combined_df['label'].value_counts())

# Split into train, validation, and test
train_df, remaining_df = train_test_split(final_combined_df, test_size=0.3, random_state=42, stratify=final_combined_df['label'])
validation_df, test_df = train_test_split(remaining_df, test_size=2/3, random_state=42, stratify=remaining_df['label'])

# Function to balance dataset
def balance_dataset(df):
    causal_df = df[df['label'] == 1]
    non_causal_df = df[df['label'] == 0]
    min_count = min(len(causal_df), len(non_causal_df))
    return pd.concat([causal_df.sample(n=min_count, random_state=42), non_causal_df.sample(n=min_count, random_state=42)])

# Balance train, validation, and test sets
balanced_train_df = balance_dataset(train_df)
balanced_validation_df = balance_dataset(validation_df)
balanced_test_df = balance_dataset(test_df)

# ** Create Social Science Test Set **
ssc_test_df = test_df[test_df['source'] == 'Ssc']  # Extract SSC data
ssc_balanced_test_df = balance_dataset(ssc_test_df)  # Balance it

# Save datasets with only 3 columns: sentence, label, source
balanced_train_df[['sentence', 'label', 'source']].to_csv(data_path + '2_final_train_dataset.csv', index=False)
balanced_validation_df[['sentence', 'label', 'source']].to_csv(data_path + '2_final_validation_dataset.csv', index=False)
balanced_test_df[['sentence', 'label', 'source']].to_csv(data_path + '2_final_test_dataset.csv', index=False)
ssc_balanced_test_df[['sentence', 'label', 'source']].to_csv(data_path + '2_final_social_science_test_dataset.csv', index=False)

# Print post-split statistics
print("\n### Dataset AFTER Splitting ###")
for name, df in [("Train", balanced_train_df), ("Validation", balanced_validation_df), ("Test", balanced_test_df), ("SSC Test", ssc_balanced_test_df)]:
    print(f"\n{name} set distribution:")
    print(df.groupby(["source", "label"]).size().unstack().fillna(0))
    print(f"\nLabel distribution in {name} set:")
    print(df['label'].value_counts())

print("\nFinal dataset saved successfully.")


Found 34 duplicate sentences in altlex. Removing them.
Found 15 duplicate sentences in altlex. Removing them.
Found 174 duplicate sentences in because. Removing them.
Found 3 duplicate sentences in because. Removing them.
Found 1432 duplicate sentences in ctb. Removing them.
Found 176 duplicate sentences in ctb. Removing them.
Found 15423 duplicate sentences in esl2. Removing them.
Found 1736 duplicate sentences in esl2. Removing them.
Found 25 duplicate sentences in semeval. Removing them.
Found 2 duplicate sentences in semeval. Removing them.
Fixing labels in press_release: converting labels 1, 2, 3 → 1 and keeping label 0.
Found 3 duplicate sentences in press_release. Removing them.
Fixing labels in Pubmed data.csv: converting labels 1, 2, 3 → 1 and keeping label 0.
Found 6 duplicate sentences in Pubmed data.csv. Removing them.
Fixing labels in haber_data.csv: converting labels 1, 2, 3 → 1 and keeping label 0.
Found 969 duplicate sentences in haber_data.csv. Removing them.

### Data