In [None]:
import pandas as pd

# Both inputs are tab-separated; they are read in the same order as before.
folds, val_folds_validated = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('folds_val.csv', 'val_folds_validated.csv')
)

In [None]:
# Row counts: validated annotations first, then the folds table.
for frame in (val_folds_validated, folds):
    print(len(frame))

#examples = folds[folds['fold']=="examples"]

In [None]:
# Show the first sentences of both frames so their row order can be compared by eye.
for frame in (folds, val_folds_validated):
    print(frame['sentence'].head())

# Series assignment aligns on the row index, not on the sentence text.
# NOTE(review): assumes both files list the same rows in the same order - confirm.
folds['validated'] = val_folds_validated['validated']

In [None]:
# Keep only the rows whose manual relevance label equals the validated label.
folds = folds.loc[folds['relevance_manual'].eq(folds['validated'])]

In [None]:
# The commented-out line is the per-fold variant (it excludes one fold);
# this run uses every remaining row as a candidate example.
#examples = folds[folds['fold'] != fold]
# Binds a second name to the same DataFrame object; no copy is made.
examples = folds

In [None]:
# List the available columns, then remove the column-width cap so that
# long sentences are printed in full by the cells that follow.
print(examples.columns)
pd.options.display.max_colwidth = None

In [None]:
# Plain-text dump of the candidate examples. display.max_colwidth was set to
# None in the preceding cell, so cell contents are not shortened.
print(examples)

# Select examples for binary task

In [None]:
# Show only the rows manually labelled as relevant (relevance_manual == 1).
print(examples.loc[examples['relevance_manual'].eq(1)])

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

# Inputs come from `examples` (built in the cells above): the 'sentence' column
# holds the text and 'relevance_manual' holds the label for the binary task.

# Extract sentences and labels
sentences = examples['sentence'].values
labels = examples['relevance_manual'].values

# Convert labels to integer codes in order of first appearance.
# Missing labels (NaN/None) are handled explicitly: NaN != NaN, so a plain
# dict lookup raises KeyError for float NaN values. All missing labels share
# the code of the first missing entry in `unique_labels`.
unique_labels = pd.Series(labels).unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
missing_code = next(
    (idx for idx, label in enumerate(unique_labels) if pd.isna(label)),
    None,
)
numeric_labels = np.array([
    missing_code if pd.isna(label) else label_mapping[label]
    for label in labels
])

# ========== Feature Extraction ==========
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences).toarray()

# Reduce dimensionality for better clustering (optional).
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 10 is capped for small inputs instead of raising.
pca = PCA(n_components=min(10, *X.shape), random_state=42)
X_reduced = pca.fit_transform(X)  # Now X_reduced is defined

# Adjusted functions from previous code
def similarity_selection(X, y, k):
    """Select, per class, the k samples closest to that class's centroid.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X. Labels need not be 0..n_classes-1.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, nearest-to-centroid first. Empty integer array
        when there are no classes.
    """
    selected = []
    for label in np.unique(y):
        # Global row positions of this class. The argsort below ranks within
        # the class only, so its result must be mapped back through these.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        class_points = X[class_rows]
        centroid = class_points.mean(axis=0)
        dist_to_centroid = np.linalg.norm(class_points - centroid, axis=1)
        nearest = np.argsort(dist_to_centroid)[:k]
        selected.append(class_rows[nearest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def diversity_selection_simple(X, y, k):
    """Select, per class, the k samples with the largest mean distance to their classmates.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, ascending by mean distance within each class.
        Empty integer array when there are no classes.
    """
    selected = []
    for label in np.unique(y):
        # Global row positions of this class; needed to translate the
        # within-class ranking back to rows of X.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        mean_dist = pairwise_distances(X[class_rows]).mean(axis=1)
        order = np.argsort(mean_dist)
        # Take the last k entries via an explicit start offset: a `[-k:]`
        # slice would return everything for k == 0.
        farthest = order[max(len(order) - k, 0):]
        selected.append(class_rows[farthest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def learnability_selection(X, y, k):
    """Select, per class, the k samples whose feature vector has the lowest variance.

    The variance is taken across the feature dimensions of each row and is
    used here as a stand-in for learnability.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, lowest variance first. Empty integer array
        when there are no classes.
    """
    variances = np.var(X, axis=1)
    selected = []
    for label in np.unique(y):
        # Global row positions of this class; the argsort below is relative
        # to the class subset and is mapped back through these.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        lowest = np.argsort(variances[class_rows])[:k]
        selected.append(class_rows[lowest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def acsess(X, y, weights, k):
    """Combine the three selection strategies with the given weights.

    Every sample picked by a strategy receives that strategy's weight; the
    k highest-scoring samples of each class are then returned.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    weights : dict
        Must contain the keys 'similarity', 'diversity' and 'learnability'.
    k : int
        Maximum number of samples per class, both for each strategy and for
        the final selection.

    Returns
    -------
    np.ndarray of int
        Row indices into X, grouped by class in np.unique(y) order and
        ascending by combined score within each class. Empty integer array
        when there is nothing to select.
    """
    combined_scores = np.zeros(len(X))

    strategies = (
        ('similarity', similarity_selection),
        # Diversity uses the simple distance-based method.
        ('diversity', diversity_selection_simple),
        ('learnability', learnability_selection),
    )
    for weight_key, select in strategies:
        # Force an integer index array: an empty float array (np.array([]))
        # is rejected by NumPy when used as an index.
        picked = np.asarray(select(X, y, k), dtype=int)
        combined_scores[picked] += weights[weight_key]

    # Select top K samples for each class based on combined scores
    top_samples_idx = []
    for label in np.unique(y):
        class_indices = np.where(y == label)[0]
        if len(class_indices) > 0:
            ranked = np.argsort(combined_scores[class_indices])
            # Explicit start offset instead of `[-k:]`, which would return
            # the whole class for k == 0.
            top_samples_idx.append(class_indices[ranked[max(len(ranked) - k, 0):]])

    # Integer dtype so that an empty result can still index arrays downstream.
    return np.concatenate(top_samples_idx) if top_samples_idx else np.array([], dtype=int)

# Example usage of ACSESS with simple diversity
weights = {'similarity': 0.3, 'diversity': 0.4, 'learnability': 0.3}

# Collect one frame per k and concatenate once at the end. Concatenating onto
# an initially empty object-dtype frame inside the loop re-copies all earlier
# rows on every iteration and forces the columns to object dtype.
frames_per_k = []
for k in range(1, 6):
    selected_samples_idx = acsess(X_reduced, numeric_labels, weights, k)
    selected_sentences = [sentences[i] for i in selected_samples_idx]
    selected_labels = numeric_labels[selected_samples_idx]

    # Rows selected for this value of k; codes are mapped back to the
    # original label values through unique_labels.
    temp_df = pd.DataFrame({
        "example": selected_sentences,
        "label": [unique_labels[label] for label in selected_labels],
        "k": k
    })
    frames_per_k.append(temp_df)

# Final result: columns "example", "label", "k", with a fresh 0..n-1 index.
df_bin_examples = pd.concat(frames_per_k, ignore_index=True)


# Select examples for the sentence-class task

In [None]:
# Only rows manually marked as relevant are candidates for the class task.
examples_classes = examples.loc[examples['relevance_manual'].eq(1)]

In [None]:
# Size of the relevant subset, then how often each sentence label occurs in it.
print(examples_classes.shape[0])
print(examples_classes['manual_sentence_labels'].value_counts())

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

# Inputs come from `examples_classes` (the relevant rows selected above): the
# 'sentence' column holds the text and 'manual_sentence_labels' the class label.

# Extract sentences and labels
sentences = examples_classes['sentence'].values
labels = examples_classes['manual_sentence_labels'].values

# Convert labels to integer codes in order of first appearance.
# Missing labels (NaN/None) are handled explicitly: NaN != NaN, so a plain
# dict lookup raises KeyError for float NaN values. All missing labels share
# the code of the first missing entry in `unique_labels`.
unique_labels = pd.Series(labels).unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
missing_code = next(
    (idx for idx, label in enumerate(unique_labels) if pd.isna(label)),
    None,
)
numeric_labels = np.array([
    missing_code if pd.isna(label) else label_mapping[label]
    for label in labels
])

# ========== Feature Extraction ==========
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences).toarray()

# Reduce dimensionality for better clustering (optional).
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 10 is capped for small inputs instead of raising.
pca = PCA(n_components=min(10, *X.shape), random_state=42)
X_reduced = pca.fit_transform(X)  # Now X_reduced is defined

# Adjusted functions from previous code
def similarity_selection(X, y, k):
    """Select, per class, the k samples closest to that class's centroid.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X. Labels need not be 0..n_classes-1.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, nearest-to-centroid first. Empty integer array
        when there are no classes.
    """
    selected = []
    for label in np.unique(y):
        # Global row positions of this class. The argsort below ranks within
        # the class only, so its result must be mapped back through these.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        class_points = X[class_rows]
        centroid = class_points.mean(axis=0)
        dist_to_centroid = np.linalg.norm(class_points - centroid, axis=1)
        nearest = np.argsort(dist_to_centroid)[:k]
        selected.append(class_rows[nearest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def diversity_selection_simple(X, y, k):
    """Select, per class, the k samples with the largest mean distance to their classmates.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, ascending by mean distance within each class.
        Empty integer array when there are no classes.
    """
    selected = []
    for label in np.unique(y):
        # Global row positions of this class; needed to translate the
        # within-class ranking back to rows of X.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        mean_dist = pairwise_distances(X[class_rows]).mean(axis=1)
        order = np.argsort(mean_dist)
        # Take the last k entries via an explicit start offset: a `[-k:]`
        # slice would return everything for k == 0.
        farthest = order[max(len(order) - k, 0):]
        selected.append(class_rows[farthest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def learnability_selection(X, y, k):
    """Select, per class, the k samples whose feature vector has the lowest variance.

    The variance is taken across the feature dimensions of each row and is
    used here as a stand-in for learnability.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    k : int
        Maximum number of samples to select per class.

    Returns
    -------
    np.ndarray of int
        Row indices into X (not positions within a class), grouped by class
        in np.unique(y) order, lowest variance first. Empty integer array
        when there are no classes.
    """
    variances = np.var(X, axis=1)
    selected = []
    for label in np.unique(y):
        # Global row positions of this class; the argsort below is relative
        # to the class subset and is mapped back through these.
        class_rows = np.flatnonzero(y == label)
        if class_rows.size == 0:
            continue
        lowest = np.argsort(variances[class_rows])[:k]
        selected.append(class_rows[lowest])

    # Integer dtype so that an empty result is still usable as an index array.
    return np.concatenate(selected) if selected else np.array([], dtype=int)

def acsess(X, y, weights, k):
    """Combine the three selection strategies with the given weights.

    Every sample picked by a strategy receives that strategy's weight; the
    k highest-scoring samples of each class are then returned.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray, shape (n_samples,)
        Class label of each row of X.
    weights : dict
        Must contain the keys 'similarity', 'diversity' and 'learnability'.
    k : int
        Maximum number of samples per class, both for each strategy and for
        the final selection.

    Returns
    -------
    np.ndarray of int
        Row indices into X, grouped by class in np.unique(y) order and
        ascending by combined score within each class. Empty integer array
        when there is nothing to select.
    """
    combined_scores = np.zeros(len(X))

    strategies = (
        ('similarity', similarity_selection),
        # Diversity uses the simple distance-based method.
        ('diversity', diversity_selection_simple),
        ('learnability', learnability_selection),
    )
    for weight_key, select in strategies:
        # Force an integer index array: an empty float array (np.array([]))
        # is rejected by NumPy when used as an index.
        picked = np.asarray(select(X, y, k), dtype=int)
        combined_scores[picked] += weights[weight_key]

    # Select top K samples for each class based on combined scores
    top_samples_idx = []
    for label in np.unique(y):
        class_indices = np.where(y == label)[0]
        if len(class_indices) > 0:
            ranked = np.argsort(combined_scores[class_indices])
            # Explicit start offset instead of `[-k:]`, which would return
            # the whole class for k == 0.
            top_samples_idx.append(class_indices[ranked[max(len(ranked) - k, 0):]])

    # Integer dtype so that an empty result can still index arrays downstream.
    return np.concatenate(top_samples_idx) if top_samples_idx else np.array([], dtype=int)

# Example usage of ACSESS with simple diversity
weights = {'similarity': 0.3, 'diversity': 0.4, 'learnability': 0.3}

# Collect one frame per k and concatenate once at the end. Concatenating onto
# an initially empty object-dtype frame inside the loop re-copies all earlier
# rows on every iteration and forces the columns to object dtype.
frames_per_k = []
for k in range(1, 6):
    selected_samples_idx = acsess(X_reduced, numeric_labels, weights, k)
    selected_sentences = [sentences[i] for i in selected_samples_idx]
    selected_labels = numeric_labels[selected_samples_idx]

    # Rows selected for this value of k; codes are mapped back to the
    # original label values through unique_labels.
    temp_df = pd.DataFrame({
        "example": selected_sentences,
        "label": [unique_labels[label] for label in selected_labels],
        "k": k
    })
    frames_per_k.append(temp_df)

# Final result: columns "example", "label", "k", with a fresh 0..n-1 index.
df_class_examples = pd.concat(frames_per_k, ignore_index=True)


In [None]:
#df_bin_examples.to_csv('df_bin_examples_acsess.csv', sep='\t')
#df_class_examples.to_csv('df_class_examples_acsess.csv', sep='\t')

In [None]:
#df_bin_examples.to_csv(f'ACSESS examples per fold/df_bin_examples_acsess_{fold}.csv', sep='\t')
#df_class_examples.to_csv(f'ACSESS examples per fold/df_class_examples_acsess_{fold}.csv', sep='\t')

In [None]:
# Write both result tables as tab-separated files (the index is included).
for result_frame, out_path in (
    (df_bin_examples, 'df_bin_examples_acsess_for_test.csv'),
    (df_class_examples, 'df_class_examples_acsess_for_test.csv'),
):
    result_frame.to_csv(out_path, sep='\t')