In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import numpy as np

In [45]:
def calculate_pearson_correlation(df, col1, col2):
    """Return the Pearson correlation between two columns of ``df``.

    Args:
        df: DataFrame containing both columns.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        The off-diagonal entry of the 2x2 matrix produced by
        ``df[[col1, col2]].corr()``. pandas yields NaN when the correlation
        is undefined (for example when one column is constant).
    """
    return df[[col1, col2]].corr().iloc[0, 1]


def modify_for_correlation(df, col1, col2, target_correlation, top_n=100):
    """Greedily drop rows until ``|corr(col1, col2)|`` reaches a target.

    Every pass scores each remaining index label by the absolute correlation
    the frame would have if that label alone were removed, then drops the
    ``top_n`` best-scoring labels in one go and re-measures. The input frame
    is never modified; ``DataFrame.drop`` returns a new frame each time.

    Each pass recomputes the correlation once per remaining row, so the cost
    of a pass grows quadratically with the number of rows.

    Args:
        df: Source DataFrame.
        col1: Label of the first column.
        col2: Label of the second column.
        target_correlation: Absolute correlation to reach or exceed.
        top_n: Maximum number of index labels dropped per pass (default 100,
            the value that used to be hard-coded in the loop).

    Returns:
        ``df`` itself when it already meets the target (or its correlation is
        NaN), otherwise a reduced copy. The loop stops early once only two
        rows would remain, so the result is never emptied by a batch drop.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    current_correlation = abs(calculate_pearson_correlation(df, col1, col2))
    print(f"Initial correlation: {current_correlation}")

    # A NaN correlation compares False here, so such a frame is returned as-is.
    while current_correlation < target_correlation:
        # Never drop so many rows that fewer than two remain: below that the
        # correlation is undefined and the caller would get a useless frame.
        batch_size = min(top_n, len(df) - 2)
        if batch_size < 1:
            break

        # Score every candidate removal. Only the index label is needed, so
        # iterate over the index instead of materialising each row.
        candidates = []
        for index in df.index:
            temp_corr = abs(calculate_pearson_correlation(df.drop(index), col1, col2))
            candidates.append((index, temp_corr))

        # Best resulting correlation first. NaN scores (removal leaves an
        # undefined correlation) are ranked last; comparing NaN directly would
        # leave the sort order ill-defined. ``value != value`` is True only for NaN.
        candidates.sort(
            key=lambda item: -1.0 if item[1] != item[1] else item[1],
            reverse=True,
        )
        top_indices = [index for index, _ in candidates[:batch_size]]

        df = df.drop(index=top_indices)
        current_correlation = abs(calculate_pearson_correlation(df, col1, col2))
        # Report the number of labels actually dropped in this pass.
        print(f'Updated correlation: {current_correlation}, No. indices: {len(top_indices)}')
    return df

In [23]:
# Load the LFW attribute table from the CSV file and preview its first five rows.
lfw_df = pd.read_csv("../datasets_data/LFW/lfw_dataset.csv")
lfw_df.head()

In [37]:
def plot_feature_distribution_ratio(df):
    """Plot, for every feature column of ``df``, the ratio of 1s to 0s.

    The share of samples equal to 1 is divided by the share equal to 0 for
    each feature; features are drawn as bars sorted by that ratio in
    descending order on a logarithmic y-axis. The figure is saved to
    ``dataset_plots/lfw_target_ratio.png`` and then shown.

    Args:
        df: DataFrame holding the feature columns plus the bookkeeping
            columns "imagenum", "person", "filename_person",
            "filename_complete" and "Male", which are excluded from the plot.

    Raises:
        KeyError: If one of the bookkeeping columns is missing from ``df``.
        ValueError: If no feature column remains after the exclusion.
    """
    non_feature_columns = ["imagenum", "person", "filename_person", "filename_complete", "Male"]
    # Derive the columns from the frame that was passed in rather than from
    # the notebook-level ``lfw_df``, so the function works for any frame.
    column_set = df.columns.drop(labels=non_feature_columns)

    feature_ratios = []
    for column in column_set:
        # Relative frequency of each value in the current feature.
        counts = df[column].value_counts(normalize=True)

        # A value that never occurs is absent from value_counts; treat it as 0.
        zero_count = counts.get(0, 0)
        one_count = counts.get(1, 0)

        # A feature without any 0s has an unbounded ratio.
        ratio = one_count / zero_count if zero_count != 0 else np.inf
        feature_ratios.append((column, ratio))

    if not feature_ratios:
        raise ValueError("df has no feature columns to plot")

    # Largest ratio first.
    feature_ratios.sort(key=lambda pair: pair[1], reverse=True)
    sorted_features, sorted_ratios = zip(*feature_ratios)

    plt.figure(figsize=(10, 6))
    plt.bar(sorted_features, sorted_ratios, color='blue')
    plt.xlabel('Features')
    plt.ylabel('Ratio of 1s to 0s')
    plt.title('Distribution Ratio of LFW Features')
    plt.xticks(rotation=90)
    plt.yscale('log')  # Use a logarithmic scale for better visualization of large ratios
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.savefig("dataset_plots/lfw_target_ratio.png", dpi=200)
    plt.show()


plot_feature_distribution_ratio(lfw_df)

In [16]:
# plot_target_distribution is a project helper defined outside this notebook.
from jupyter_notebooks.dataset_analysis.dataset_analysis_utils import plot_target_distribution

# Presumably plots how the "Color Photo" column of lfw_df is distributed --
# TODO confirm against the helper's implementation.
plot_target_distribution(lfw_df, "Color Photo")

In [39]:
# Absolute Pearson correlation between 'male_gender' and every other feature
# column of lfw_df (bookkeeping columns and 'Male' are excluded).
column_set = lfw_df.columns.drop(labels=["imagenum", "person", "filename_person", "filename_complete", "Male"])
correlations = {
    column: abs(pearsonr(lfw_df['male_gender'], lfw_df[column])[0])
    for column in column_set
    if column != 'male_gender'
}

# One row per feature, strongest correlation first.
correlation_df = pd.DataFrame(
    list(correlations.items()),
    columns=['Feature', 'Correlation'],
).sort_values(by='Correlation', ascending=False)

plt.figure(figsize=(20, 6))
barplot = sns.barplot(x='Feature', y='Correlation', data=correlation_df)
plt.title('Correlation between features and gender')
plt.xlabel('Feature')
plt.ylabel('Correlation')
plt.xticks(rotation=90)

# Write each bar's value (two decimals) slightly above the bar.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    barplot.annotate(
        f"{bar_height:.2f}",
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.tight_layout()
plt.savefig("dataset_plots/lfw_correlation.png", dpi=200)
plt.show()

In [43]:
# plot_target_distribution is a project helper defined outside this notebook.
from jupyter_notebooks.dataset_analysis.dataset_analysis_utils import plot_target_distribution

# Presumably plots how the "Strong Nose-Mouth Lines" column of lfw_df is
# distributed -- TODO confirm against the helper's implementation.
plot_target_distribution(lfw_df, "Strong Nose-Mouth Lines")

In [44]:
# plot_protected_feature_distribution is a project helper defined outside this notebook.
from jupyter_notebooks.dataset_analysis.dataset_analysis_utils import plot_protected_feature_distribution

# Presumably plots "Strong Nose-Mouth Lines" broken down by the "male_gender"
# column -- TODO confirm the argument meaning against the helper's implementation.
plot_protected_feature_distribution(lfw_df, "Strong Nose-Mouth Lines", "male_gender")

In [46]:
# Reduce lfw_df until |corr("Strong Nose-Mouth Lines", "male_gender")| reaches 0.8.
new_df = modify_for_correlation(
    df=lfw_df,
    col1="Strong Nose-Mouth Lines",
    col2="male_gender",
    target_correlation=0.8,
)

In [48]:
# Export one skewed copy of the dataset per target correlation level.
# Every level starts again from the full lfw_df.
for skew_level in (0.2, 0.4, 0.6, 0.8, 0.99):
    print(f"Skew generation for skew level: {skew_level}")
    skewed_df = modify_for_correlation(
        df=lfw_df,
        col1="Strong Nose-Mouth Lines",
        col2="male_gender",
        target_correlation=skew_level,
    )
    skewed_df.to_csv(
        path_or_buf=f"../datasets_data/LFW/lfw_dataset_male__strong_nose_mouth_skewed_{skew_level}.csv"
    )