In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle

def save_split_data(train_data, val_data, test_data, oos_val_data, oos_test_data, split_dir):
    """Persist one data split to ``split_dir`` as pickles plus a size summary.

    For the train/val/test frames both the ``Title`` column (sentences) and
    the ``Tag1`` column (labels) are pickled as plain Python lists. For the
    two out-of-scope frames only ``Title`` is written. A ``metadata.txt``
    file recording the row count of each frame is written last.

    Args:
        train_data, val_data, test_data: frames with ``Title`` and ``Tag1``.
        oos_val_data, oos_test_data: frames with at least ``Title``.
        split_dir: existing directory to write into; this function does not
            create it.
    """
    # Extract every column up front so a missing column fails before any
    # file is written.
    payloads = [
        ('train_sentences.pkl', train_data['Title'].tolist()),
        ('train_labels.pkl', train_data['Tag1'].tolist()),
        ('val_sentences.pkl', val_data['Title'].tolist()),
        ('val_labels.pkl', val_data['Tag1'].tolist()),
        ('test_sentences.pkl', test_data['Title'].tolist()),
        ('test_labels.pkl', test_data['Tag1'].tolist()),
        ('oos_val_sentences.pkl', oos_val_data['Title'].tolist()),
        ('oos_test_sentences.pkl', oos_test_data['Title'].tolist()),
    ]

    for pkl_name, values in payloads:
        target = os.path.join(split_dir, pkl_name)
        with open(target, 'wb') as sink:
            pickle.dump(values, sink)

    # One "<name> size: <rows>" line per frame, no trailing newline.
    summary = (
        f"Train size: {len(train_data)}\n"
        f"Validation size: {len(val_data)}\n"
        f"Test size: {len(test_data)}\n"
        f"OOS Validation size: {len(oos_val_data)}\n"
        f"OOS Test size: {len(oos_test_data)}"
    )
    summary_path = os.path.join(split_dir, 'metadata.txt')
    with open(summary_path, 'w') as sink:
        sink.write(summary)

# Load and filter the data
file_path = 'stack_overflow.csv'
main_dir = 'stackoverflow_data'
os.makedirs(main_dir, exist_ok=True)

# Tags kept for the experiment; rows whose Tag1 is anything else are dropped.
TARGET_TAGS = [
    'svn', 'oracle', 'bash', 'apache', 'excel',
    'matlab', 'cocoa', 'visual-studio', 'osx', 'wordpress',
    'spring', 'hibernate', 'scala', 'sharepoint', 'ajax',
    'drupal', 'qt', 'haskell', 'linq', 'magento',
]
# Number of independent splits to generate (written to split1 .. splitN).
NUM_SPLITS = 5
# Tags are added to the in-domain set until they cover this share of the rows.
IN_DOMAIN_FRACTION = 0.75

# The CSV, the tag filter and the per-tag counts do not depend on the split,
# so they are computed once instead of once per iteration.
# `isin` against an all-string list already excludes NaN and non-string
# values, so no separate type filter is needed.
data = pd.read_csv(file_path)
data = data[data['Tag1'].isin(TARGET_TAGS)]

threshold = len(data) * IN_DOMAIN_FRACTION
tag_counts = data['Tag1'].value_counts()

for split_num in range(1, NUM_SPLITS + 1):
    # split_num doubles as the random seed, so every split gets a different
    # (but reproducible) tag order and train/val/test partition. It must not
    # be reassigned inside the loop, otherwise all iterations would write
    # the same split into the same directory.
    tag_counts_shuffled = tag_counts.sample(frac=1, random_state=split_num)

    # Walk the shuffled tags, accumulating row counts, until the chosen tags
    # cover at least `threshold` rows. Those tags form the in-domain set.
    cumulative_count = 0
    selected_tags = []

    for tag, count in tag_counts_shuffled.items():
        cumulative_count += count
        selected_tags.append(tag)
        if cumulative_count >= threshold:
            break

    in_domain_mask = data['Tag1'].isin(selected_tags)
    in_domain_data = data[in_domain_mask]
    out_of_domain_data = data[~in_domain_mask]

    # In-domain: 70% train, 15% validation, 15% test.
    train_data, temp_data = train_test_split(in_domain_data, test_size=0.3, random_state=split_num)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=split_num)
    # Out-of-domain rows are only used for evaluation: half validation, half test.
    oos_val_data, oos_test_data = train_test_split(out_of_domain_data, test_size=0.5, random_state=split_num)

    # Create directory for the current split
    split_dir = os.path.join(main_dir, f'split{split_num}')
    os.makedirs(split_dir, exist_ok=True)

    # Save the split data and metadata
    save_split_data(train_data, val_data, test_data, oos_val_data, oos_test_data, split_dir)


In [18]:
# Every Tag1 value the dataset is filtered down to, in a fixed order.
all_tags = [
    'svn', 'oracle', 'bash', 'apache', 'excel',
    'matlab', 'cocoa', 'visual-studio', 'osx', 'wordpress',
    'spring', 'hibernate', 'scala', 'sharepoint', 'ajax',
    'drupal', 'qt', 'haskell', 'linq', 'magento',
]

In [20]:
# Tags left out of the in-domain set by the most recently generated split.
set(all_tags).difference(selected_tags)

{'ajax', 'apache', 'drupal', 'qt', 'visual-studio'}