In [1]:
import pandas as pd
import numpy as np
from scipy.stats import stats, chi2_contingency
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pingouin as pg

In [4]:
import pandas as pd

def find_related_group(df, features, corr_threshold=0.6):
    """Identifies a group of features that are correlated above a threshold.

    Args:
        df (pandas.DataFrame): The DataFrame containing the features.
        features (list): A list of feature names to start with.
        corr_threshold (float): The minimum correlation for inclusion in the group.

    Returns:
        list: A list of correlated feature names (including the starting features).
    """

    corr_matrix = df[features].corr()
    related = []
    for feat in features:
        related += [f for f in corr_matrix[feat].index if abs(corr_matrix.loc[feat, f]) >= corr_threshold and f != feat]
    return list(set(related + features))  # Ensure unique features


In [5]:
import numpy as np
from scipy.stats import pointbiserialr

def point_biserial_correlation(categorical, numerical):
    """Calculates the point-biserial correlation coefficient.

    Args:
        categorical (array-like): Binary categorical variable.
        numerical (array-like): Continuous numerical variable.

    Returns:
        float: Point-biserial correlation coefficient.
    """

    corr, p_value = pointbiserialr(categorical, numerical)
    return corr  


In [6]:
def analyze_correlation(df, feature1, feature2):
    """
    Performs correlation analysis between two features, handling numerical, 
    categorical, and mixed data types, generating hypotheses, and choosing plots.

    Args:
        df (pandas.DataFrame): The DataFrame containing the features.
        feature1 (str): Name of the first feature.
        feature2 (str): Name of the second feature.

    Returns:
        None (Outputs results directly)
    """

    # Data extraction and basic checks
    data1 = df[feature1]
    data2 = df[feature2]

    if not pd.api.types.is_numeric_dtype(data1) or not pd.api.types.is_numeric_dtype(data2):
        print("Error: Features must be numeric or categorical.")
        return

    # Categorical vs Categorical 
    if pd.api.types.is_categorical_dtype(data1) and pd.api.types.is_categorical_dtype(data2):
        contingency = pd.crosstab(data1, data2)
        chi2, p, dof, expected = chi2_contingency(contingency)
        print("Chi-Square Statistic:", chi2)
        print("p-value:", p)
        cramer_v = pg.compute_effsize(contingency, eftype='cramer') 
        print("Cramer's V:", cramer_v)
        print("Consider creating a contingency table/heatmap.")

    # Categorical vs Numerical
    elif pd.api.types.is_categorical_dtype(data1) and pd.api.types.is_numeric_dtype(data2):
        print("Consider using box plots to compare distributions of the numerical feature across categories")
        print("Point-Biserial Correlation:")
        print(point_biserial_correlation(data1, data2))

    # Both Numeric: 
    else:  
        # Correlation, Visualization, and Regression
        correlation, p_value = stats.pearsonr(data1, data2)
        print(f"Pearson's Correlation Coefficient: {correlation:.4f}")
        print(f"p-value: {p_value:.4f}")

        if p_value < 0.05:  
            print("The correlation is statistically significant.")
            # Scatter Plot with Regression Line
            plt.scatter(data1, data2)
            m, c = np.polyfit(data1, data2, 1)  
            plt.plot(data1, m * data1 + c, color='red')  
            plt.xlabel(feature1)
            plt.ylabel(feature2)
            plt.title(f"Correlation between {feature1} and {feature2}")
            plt.show()

            # Hypothesis based on strength
            if abs(correlation) >= 0.8:
                print("The correlation is strong. Consider linear regression.")
            else:
                print("The correlation is moderate or weak. Explore curve fitting.")

    # Hypothesis: Feature Clustering
    if feature1 in df.columns and feature2 in df.columns:
        related_features = find_related_group(df, [feature1, feature2]) 
        if len(related_features) > 2:
             print(f"Potential Feature Cluster: {related_features}")
             plot_pca(df[related_features]) 

In [7]:
df = pd.read_csv('supermarket_sales.csv')

In [None]:
df.info()

In [9]:
analyze_correlation(df, 'Total', 'Gender')

Error: Features must be numeric or categorical.
