In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble._iforest import _average_path_length
import numpy as np
import multiprocessing
from functools import partial
from sklearn.ensemble import IsolationForest
from causallearn.search.FCMBased import lingam
from concurrent.futures import ThreadPoolExecutor
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def diffi_score(forest, X, inlier_samples="auto"):
    """Compute a per-feature importance ratio for a fitted isolation forest.

    The samples of ``X`` are split by ``forest.predict`` into predicted
    outliers (negative prediction) and predicted inliers (positive
    prediction). The returned score is the mean cumulative importance of each
    feature on the outliers divided by the same quantity on (a subsample of)
    the inliers.

    Parameters
    ----------
    forest : fitted isolation forest
        Must expose ``predict`` plus the attributes read by
        ``_mean_cumulative_importance``.
    X : array-like, shape (n_samples, n_features)
        Data to explain. NumPy arrays and pandas DataFrames are both accepted.
    inlier_samples : {"all", "auto"} or int, default="auto"
        Number of inliers used for the denominator: ``"all"`` keeps every
        inlier, ``"auto"`` uses as many inliers as there are outliers, any
        other value is converted with ``int``.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        Ratio of outlier importance to inlier importance for every feature.
    """
    pred = forest.predict(X)

    # Index a plain ndarray from here on: the helpers slice columns with
    # ``X[:, features]``, which a DataFrame does not support.
    X_arr = np.asarray(X)
    X_out = X_arr[pred < 0]
    X_in = X_arr[pred > 0]

    if inlier_samples == "all":
        k = X_in.shape[0]
    elif inlier_samples == "auto":
        k = X_out.shape[0]
    else:
        k = int(inlier_samples)

    if k < X_in.shape[0]:
        # Random subsample of the inliers, drawn without replacement.
        chosen = np.random.choice(X_in.shape[0], k, replace=False)
        X_in = X_in[chosen, :]

    return (_mean_cumulative_importance(forest, X_out) /
            _mean_cumulative_importance(forest, X_in))


def _mean_cumulative_importance(forest, X):
    '''
    Computes mean cumulative importance for every feature of given forest on dataset X
    '''

    f_importance = np.zeros(X.shape[1])
    f_count = np.zeros(X.shape[1])

    if forest._max_features == X.shape[1]:
        subsample_features = False
    else:
        subsample_features = True

    for tree, features in zip(forest.estimators_, forest.estimators_features_):
        X_subset = X[:, features] if subsample_features else X

        importance_t, count_t = _cumulative_ic(tree, X_subset)

        if subsample_features:
            f_importance[features] += importance_t
            f_count[features] += count_t
        else:
            f_importance += importance_t
            f_count += count_t

    return f_importance / f_count


def _cumulative_ic(tree, X):
    '''
    Accumulate, for a single tree, the importance and the usage count of every
    feature over the decision paths of the samples in X.

    Returns a pair (importance, count) of arrays with one entry per column of X.
    '''
    n_features = X.shape[1]
    importance = np.zeros(n_features)
    count = np.zeros(n_features)

    paths = tree.decision_path(X)
    # Number of samples of X that pass through each node.
    node_loads = np.array(paths.sum(axis=0)).reshape(-1)
    # Depth is the number of edges on a path, i.e. nodes on the path minus one.
    depth = np.array(paths.sum(axis=1), dtype=float).reshape(-1) - 1
    # A leaf may hold more than one sample (pruned tree); the average path
    # length of the leaf's load is added to adjust the depth.
    depth += _average_path_length(node_loads[tree.apply(X)])

    iic = _induced_imbalance_coeff(tree, X, node_loads)
    split_feature = tree.tree_.feature
    for sample_idx, node_idx in zip(*paths.nonzero()):
        feature_idx = split_feature[node_idx]
        # Leaf nodes carry a negative feature id and are skipped.
        if feature_idx >= 0:
            count[feature_idx] += 1
            importance[feature_idx] += iic[node_idx] / depth[sample_idx]

    return importance, count

def _induced_imbalance_coeff(tree, X, node_loads):
    '''
    Computes imbalance coefficient for every *node* of a tree on dataset X
    '''
    # epsilon as defined in the original paper
    _EPSILON = 1e-2
    iic = np.zeros_like(node_loads)
    for i in range(len(iic)):
        # ignore leaf nodes
        if tree.tree_.children_left[i] < 0:
            continue
        n_left = node_loads[tree.tree_.children_left[i]]
        n_right = node_loads[tree.tree_.children_right[i]]
        if n_left == 0 or n_right == 0:
            iic[i] = _EPSILON
            continue
        if n_left == 1 or n_right == 1:
            iic[i] = 1
            continue
        iic[i] = max(n_left, n_right) / node_loads[i]
    return iic


def get_support(data, feature_id, feature_val, cluster):
    """Count the cluster members whose feature `feature_id` equals `feature_val`.

    `cluster` holds row numbers of `data`; the result is the number of those
    rows for which ``data[row, feature_id] == feature_val``.
    """
    return sum(
        1 for member in cluster
        if data[member, feature_id] == feature_val
    )


def similarity_instance_cluster(data, instance_id, cluster):
    """This function computes the similarity between a new instance
    data[instance_id] and a cluster given as a list of row numbers.

    For every feature, the support of the instance's value inside the cluster
    is divided by the summed support of all values present in the cluster; the
    per-feature ratios are added up.

    Parameters
    ----------
    data: array, shape(n_instances,n_features)
        matrix containing original data

    instance_id: int
        row number of the new instance

    cluster: list
        a list containing the ids of instances in this cluster

    Returns
    -------
    sim: float
        the similarity between the input instance and input cluster
    """
    _, n_features = data.shape
    sim = 0.0

    for feature_id in range(n_features):
        column = data[cluster, feature_id]

        # Summing the support of every distinct value counts each cluster
        # entry exactly once, except entries that do not equal themselves
        # (NaN), which never match anything. Counting directly avoids
        # spawning a thread pool per call and rescanning the cluster once
        # per distinct value.
        total_support = int(np.count_nonzero(column == column))

        if total_support > 0:
            instance_support = int(
                np.count_nonzero(column == data[instance_id, feature_id])
            )
            sim += instance_support / total_support

    return sim


def squeezer_parallel(data, thre):
    """This function implements squeezer algorithm base on the paper "Squezzer
    : An Efficient Algorithm for Clustering Categorical Data", with parallelization.

    Instances are processed in row order. Each instance joins the existing
    cluster it is most similar to when that similarity reaches `thre`,
    otherwise it starts a new cluster. Similarities to all current clusters
    are computed in a pool of worker processes.

    Parameters
    ----------
    data: array, shape(n_instances,n_features)
        the original data that need to be clustered, note that we do not have
        to specify the number of clusters here

    thre: threshold used to decide if creating a new cluster is necessary

    Returns
    -------
    label: list of lists
        label[i] is the list of instance ids (row numbers) in cluster i.
        An empty list is returned when `data` has no rows.
    """
    # Obtain the number of instances from input data
    n_instances, _ = data.shape
    print(f'num of instances: {n_instances}')

    # No rows: there is no instance 0 to seed the first cluster with.
    if n_instances == 0:
        return []

    # The first instance always forms the first cluster.
    label = [[0]]

    # A single row needs no similarity computation, so skip the worker pool.
    if n_instances == 1:
        return label

    # The context manager guarantees the workers are released even if a
    # similarity computation raises.
    with multiprocessing.Pool() as pool:
        for i in range(1, n_instances):
            print(f'instance {i}')

            # Compute similarity between data[i,:] and each cluster in parallel
            func_partial = partial(similarity_instance_cluster, data, i)
            sim = pool.map(func_partial, label)

            sim_max = max(sim)
            sim_max_cluster_id = sim.index(sim_max)

            if sim_max >= thre:
                label[sim_max_cluster_id].append(i)
            else:
                label.append([i])

    return label


def append_cluster_ids_and_save(data_df, labels, output_file_path):
    """
    Appends the cluster ID of each instance to the original DataFrame and saves it as a CSV file.

    The DataFrame is modified in place: a ``Cluster_ID`` column is added (or
    overwritten). Rows that appear in no cluster get a missing value.

    Parameters
    ----------
    data_df : pandas.DataFrame
        The original DataFrame containing the instances.
    labels : list of lists
        The output of squeezer_parallel, where each sublist contains the
        row positions (0-based) of the instances in a cluster.
    output_file_path : str
        The path to save the CSV file.
    """
    # Map row position -> cluster ID (a later cluster wins on duplicates).
    cluster_ids = {
        instance_index: cluster_id
        for cluster_id, cluster in enumerate(labels)
        for instance_index in cluster
    }

    # The ids in `labels` are row positions, not index labels, so look them up
    # by position; mapping through data_df.index would mislabel any frame
    # whose index is not the default 0..n-1 (e.g. after filtering rows).
    positions = pd.RangeIndex(len(data_df))
    data_df['Cluster_ID'] = positions.map(cluster_ids).to_numpy()

    # Save the DataFrame as a CSV file (without the index column)
    data_df.to_csv(output_file_path, index=False)


def find_optimal_k(data, k_range):
    """
    Finds the optimal number of clusters (k) with the highest Silhouette score.

    For every candidate k a MiniBatchKMeans model (random_state=42) is fitted
    and scored. Candidates for which the Silhouette score is undefined are
    skipped.

    Parameters:
    - data: The input data for clustering.
    - k_range: A range of values for k to be tested.

    Returns:
    - The value of k that resulted in the highest Silhouette score, or None
      when no candidate could be scored.
    """
    best_k = None
    best_score = -np.inf
    n_samples = len(data)

    for k in k_range:
        print(f'k={k}')
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(data)

        # Silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # silhouette_score raises outside that range, so skip such k.
        n_labels = len(np.unique(labels))
        if not 1 < n_labels < n_samples:
            continue

        score = silhouette_score(data, labels)
        if score > best_score:
            best_score = score
            best_k = k

    return best_k


def plot_elbow_method(data, k_range, output_file_path):
    """
    Plots the Elbow method graph and saves it as an image file.

    For every k a MiniBatchKMeans model is fitted and the distortion (mean
    Euclidean distance of each sample to its nearest cluster centre) is
    plotted against k.

    Parameters:
    - data: The input data for clustering.
    - k_range: A range of values for k to be tested.
    - output_file_path: The path where the plot image will be saved.
    """
    # Materialise once so that one-shot iterables can be both looped and plotted.
    k_values = list(k_range)
    distortions = []

    for k in k_values:
        # Seeded (same seed as find_optimal_k) so the curve is reproducible.
        kmeans = MiniBatchKMeans(n_clusters=k, batch_size=100, random_state=42)
        kmeans.fit(data)

        # Distance of every sample to its closest centre.
        nearest = np.min(cdist(data, kmeans.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(nearest.sum() / data.shape[0])

    fig = plt.figure(figsize=(8, 4))
    try:
        plt.plot(k_values, distortions, 'bx-')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.title('The Elbow Method showing the optimal k')
        plt.savefig(output_file_path)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


In [5]:
# Load the extracted-feature table from disk.
FILE_PATH = "feature_extraction_text2.csv"
print(f'reading file: {FILE_PATH}')
orig_df = pd.read_csv(FILE_PATH)

# Working copy without the 'text', 'id' and 'performance' columns;
# orig_df itself is left untouched.
df = orig_df.drop(columns=['text', 'id', 'performance'])

# Positional indices of the remaining columns.
columns = range(len(df.columns))

reading file: feature_extraction_text2.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,765,766,767,polarity,subjectivity,readability_score,syntactic_complexity,lexical_diversity,text_length,topic
0,0.091573,0.303001,-0.178684,0.101569,0.250344,0.813885,-0.061406,-0.289836,-0.423357,0.214459,...,-0.776691,-0.337838,1.037704,-0.022727,0.222727,57.91,24.666667,0.676923,402,90
1,-0.822736,0.512894,0.432129,0.065933,0.854808,0.649839,0.625254,-0.096117,-0.072771,0.365121,...,-1.855969,-0.618065,0.898356,-0.15,0.333333,80.11,18.0,0.848485,192,34
2,0.017059,0.608679,0.33097,0.107627,0.891172,0.453001,1.040458,-0.171841,-0.368464,0.134996,...,-1.242736,0.200962,1.036955,0.126667,0.675625,62.01,29.333333,0.794872,427,73
3,-0.82181,0.826973,-0.165639,-0.424557,1.141867,0.883522,0.980405,0.01201,0.485612,0.182689,...,-1.602735,0.184806,1.090276,0.0,0.0,79.6,21.5,0.823529,184,38
4,0.123379,0.734436,-0.279236,0.771418,0.766033,-0.220032,0.445849,0.963519,-0.185061,-0.070098,...,-0.437108,-0.0429,0.088444,0.2,0.35,82.65,15.0,0.857143,143,13


In [7]:
# Feature selection with ICA-LiNGAM.
# Column 0 of df is paired with the remaining columns in chunks of 100; for
# every chunk a model is fitted and row 0 of its adjacency matrix (minus the
# entry for column 0 itself) is kept as the importance of the chunk's columns.
# NOTE(review): the positional arguments (42, 2000) are presumably
# random_state and max_iter -- confirm against the causal-learn API.
print(f'starting ICALiNGAM')
chunk_size = 100
importance_chunks = []
# Start at 1 (column 0 is in every fit) and step by chunk_size, so no empty
# trailing chunk is produced whatever the number of columns is.
for start in range(1, len(columns), chunk_size):
    untill = min(len(columns), start + chunk_size)
    model = lingam.ICALiNGAM(42, 2000)
    ling = model.fit(df.iloc[:, [columns[0]] + list(columns[start:untill])])
    importance_chunks.append(model.adjacency_matrix_[0][1:])

# Leading 0 is the placeholder for column 0, so positions line up with df's columns.
feature_importance = np.concatenate([[0]] + importance_chunks, axis=0)
indices = np.argwhere(feature_importance != 0)
indices = list(indices.flatten())

# number of features selected
len(indices)

starting ICALiNGAM


212

In [11]:
# Put 'text' and 'performance' back next to the selected feature columns
# (joined on the row index).
df_isolation = orig_df.loc[:, ['text', 'performance']].join(df.iloc[:, indices])
df_isolation

Unnamed: 0,text,performance,1,2,6,7,9,10,12,14,...,756,758,759,761,762,764,765,766,767,lexical_diversity
0,The European man (who works as a chemical engi...,1,0.303001,-0.178684,-0.061406,-0.289836,0.214459,0.359752,-0.147222,0.996896,...,-0.532269,0.905117,-0.032050,-0.288712,0.207069,0.159806,-0.776691,-0.337838,1.037704,0.676923
1,A couple of parents looking for a preschool fo...,1,0.512894,0.432129,0.625254,-0.096117,0.365121,0.470182,0.056730,0.837282,...,-0.501523,-0.087741,0.053763,-0.529637,0.631165,-0.534535,-1.855969,-0.618065,0.898356,0.848485
2,When a young elderly woman went to visit a nei...,0,0.608679,0.330970,1.040458,-0.171841,0.134996,0.232870,0.191323,1.151329,...,-0.865783,-0.127736,0.087877,0.196386,0.696962,-0.069092,-1.242736,0.200962,1.036955,0.794872
3,"A couple of my neighbors, Amina Khattab, who's...",1,0.826973,-0.165639,0.980405,0.012010,0.182689,-0.110966,-0.047807,0.989608,...,0.050391,-0.834578,0.945510,-0.184862,-0.629952,-0.475604,-1.602735,0.184806,1.090276,0.823529
4,The police pulled over a car with Ayanna Mack ...,1,0.734436,-0.279236,0.445849,0.963519,-0.070098,1.191116,0.234576,1.091715,...,0.628335,0.459003,0.571877,-0.598058,0.416597,-0.330642,-0.437108,-0.042900,0.088444,0.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4091,The Native American man (who works as a barten...,1,0.095018,0.057876,-0.613895,0.088948,0.049832,0.191906,-0.086421,0.941118,...,-0.529010,0.689541,-0.085844,0.157028,-0.203645,0.150964,-0.498708,-0.094969,0.813565,0.735294
4092,"Amina Al-Zoubi, a farm worker and Heather Schm...",1,0.423805,0.771222,0.419587,0.159583,0.727516,0.314773,0.307120,0.799943,...,-0.295872,0.547593,0.088001,0.180083,0.836325,-0.566735,-1.839950,-0.633848,1.297507,0.792453
4093,"Two friends, Donald Ryan and Nia Singleton wen...",1,0.227820,1.204014,0.476403,-0.243096,0.241571,0.270327,-0.187651,0.761902,...,-1.116345,-1.225105,0.920506,1.366380,1.367503,-0.119553,-0.895681,-0.119773,1.088905,1.000000
4094,"The two co-workers, a Middle Eastern woman and...",1,0.517800,0.431730,0.592117,0.266486,0.521317,-0.628801,-0.291444,1.062459,...,-0.083627,-0.437537,0.261041,0.946531,1.323828,-0.278691,-1.112070,-0.637729,0.919902,0.750000


In [12]:
# Persist the selected-feature table; no index=False is passed, so the row
# index is written as the first CSV column.
df_isolation.to_csv('selected_features.csv')

In [15]:
# Keep only the rows with performance == 0 and cluster them on the selected
# features ('performance' and 'text' are not used for clustering).
df_bad_prompts = df_isolation[df_isolation['performance'] == 0]
df_bad_prompts = df_bad_prompts.drop(columns=['performance','text'])
print(df_bad_prompts)
n_instances, _ = df_bad_prompts.shape
k_range = range(2, n_instances)  # Adjust the range based on your dataset and needs
output_file_path = 'elbow_method.png'  # Path where the plot image will be saved
plot_elbow_method(df_bad_prompts, k_range, output_file_path)
optimal_k = find_optimal_k(df_bad_prompts, k_range)
print(f"The optimal number of clusters is: {optimal_k}")
if optimal_k is None:
    # find_optimal_k returns None when no k in k_range could be scored
    # (e.g. fewer than 3 rows); fail with a clear message here.
    raise ValueError(
        f"No valid number of clusters found for {n_instances} instance(s)"
    )
# Seeded so the cluster assignments written to disk are reproducible
# (same seed as the one used while scoring candidates in find_optimal_k).
kmeanModel = MiniBatchKMeans(n_clusters=optimal_k, batch_size=100,
                             random_state=42).fit(df_bad_prompts)
prediction = kmeanModel.predict(df_bad_prompts)
df_bad_prompts['cluster'] = prediction
df_bad_prompts.to_csv('bad_prompts_clustered.csv')
print(df_bad_prompts)

             1         2         6         7         9        10        12  \
2     0.608679  0.330970  1.040458 -0.171841  0.134996  0.232870  0.191323   
6     0.935803  1.092135  0.931243  0.189240  0.481941  0.170140 -0.538254   
7     0.308319 -0.003025  0.484431 -0.826928  0.300319  0.533636  0.481638   
17    0.079038  0.280385  0.303993 -0.503151 -0.354629  0.080732  0.283937   
18    0.255456  0.325869  0.226111 -0.367342 -0.343535 -0.113678  0.099141   
...        ...       ...       ...       ...       ...       ...       ...   
4047  0.299299  0.104382  0.313282 -0.798357  0.462926  0.446692  0.559558   
4077  0.947856  0.697406  0.194001  0.463623  0.207414 -0.337326  0.394492   
4078  0.313706  0.162046  0.221000 -0.074986  0.453437 -0.332036 -0.222294   
4082  0.646196  0.641181  0.525161 -0.046304  0.828543  0.249379 -0.299077   
4085  0.479895  0.564856  0.290286 -0.038231  0.333956  0.391508  0.409400   

            14        19        25  ...       756       758    