# Spearman's vs Pearson's correlation coefficient - what is the difference?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

In [None]:
# Seed NumPy's global random generator so every run draws the same sample
np.random.seed(42)

# Draw 100 x values from a normal distribution (mean 5, standard deviation 2)
x = np.random.normal(5, 2, 100)

In [None]:
# Histogram of the sampled x values, drawn on the current Axes (25 bins)
ax = plt.gca()
ax.hist(x, bins=25, edgecolor='black')
ax.set(
    title='Histogram of Normally Distributed x Values',
    xlabel='Value',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Build three y-series against the same x values.
# The noise length is taken from x instead of repeating a literal, so the
# y arrays always line up with x if the sample size is changed.
n_samples = x.size

# Scenario 1: Linear relationship (slope 2 plus Gaussian noise, sd 2)
y_linear = 2 * x + np.random.normal(0, 2, n_samples)

# Scenario 2: Monotone exponential relationship (same noise level)
y_exponential = np.exp(.6 * x) + np.random.normal(0, 2, n_samples)

# Scenario 3: Linear with outliers
y_outliers = 2 * x + np.random.normal(0, 2, n_samples)
# Overwrite the last five points with extreme values; a negative slice keeps
# this pointing at the tail of the array regardless of the sample size.
y_outliers[-5:] = [80, 85, 90, 92, 95]

In [None]:
# Calculate correlations for each scenario
# Each entry maps a scenario label to its (x, y) pair of arrays.
scenarios = {
    'Linear': (x, y_linear),
    'Exponential': (x, y_exponential),
    'With Outliers': (x, y_outliers),
}

# Collect the coefficients per scenario. Without this, every iteration would
# overwrite the previous result and only the last scenario's values would
# survive the loop. Nothing is printed here; the values are only stored.
correlations = {}
for name, (x_data, y_data) in scenarios.items():
    # Both functions return (coefficient, p-value); only the coefficient is kept.
    pearson_corr, _ = pearsonr(x_data, y_data)
    spearman_corr, _ = spearmanr(x_data, y_data)
    correlations[name] = {'pearson': pearson_corr, 'spearman': spearman_corr}

In [None]:
# Without correlation coefficients
# One row per scenario: scatter of y against x on the left,
# histogram of the y values on the right.

fig, axes = plt.subplots(3, 2, figsize=(10, 12))

for i, (name, (x_data, y_data)) in enumerate(scenarios.items()):
    scatter_ax, hist_ax = axes[i]

    # Left column: raw relationship between x and y
    scatter_ax.scatter(x_data, y_data, alpha=0.6)
    scatter_ax.set(
        xlabel='X',
        ylabel='Y',
        title=f'{name} - Scatter Plot',
    )

    # Right column: distribution of y on its own
    hist_ax.hist(y_data, bins=20, edgecolor='black', alpha=0.7, color='orange')
    hist_ax.set(
        xlabel='Y values',
        ylabel='Frequency',
        title=f'{name} - Y Distribution',
    )

plt.tight_layout()
plt.show()

# --> What do you think of the Pearson vs. Spearman correlation coefficients for each scenario?

In [None]:
# One row per scenario: scatter plot annotated with the Pearson and Spearman
# coefficients (and their p-values) on the left, histogram of y on the right.
fig, axes = plt.subplots(3, 2, figsize=(10, 12))

for i, (name, (x_data, y_data)) in enumerate(scenarios.items()):
    # Scatter plot
    axes[i, 0].scatter(x_data, y_data, alpha=0.6)
    axes[i, 0].set_xlabel('X')
    axes[i, 0].set_ylabel('Y')
    axes[i, 0].set_title(f'{name} - Scatter Plot')

    # Both calls return (coefficient, p-value)
    pearson_corr, pearson_p = pearsonr(x_data, y_data)
    spearman_corr, spearman_p = spearmanr(x_data, y_data)

    # p-values use general format (.3g): a fixed ".3f" would round very small
    # p-values down to a misleading "0.000" instead of showing their magnitude.
    label = (
        f"Pearson: {pearson_corr:.3f} (p={pearson_p:.3g})\n"
        f"Spearman: {spearman_corr:.3f} (p={spearman_p:.3g})"
    )
    # Axes-fraction coordinates pin the box to the top-left corner of the
    # scatter plot independently of the data range.
    axes[i, 0].text(0.05, 0.95, label,
                    transform=axes[i, 0].transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Y distribution
    axes[i, 1].hist(y_data, bins=20, edgecolor='black', alpha=0.7, color='orange')
    axes[i, 1].set_xlabel('Y values')
    axes[i, 1].set_ylabel('Frequency')
    axes[i, 1].set_title(f'{name} - Y Distribution')

plt.tight_layout()
plt.show()