In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
from scipy.stats import linregress

In [2]:
def plot_corr_double(primary_df, secondary_df, fields, corr_type):
    """Show the correlation matrices of two datasets in a single heatmap.

    The upper triangle (diagonal included) holds the correlations computed
    from ``primary_df``; the strictly lower triangle holds the ones computed
    from ``secondary_df``. Both full correlation tables are printed first.

    Parameters
    ----------
    primary_df, secondary_df : pandas.DataFrame
        Frames that both contain every column listed in ``fields``.
    fields : list
        Column names to correlate; also used as the tick labels.
    corr_type : str or callable
        Passed unchanged to ``DataFrame.corr(method=...)``.
    """
    upper_source = primary_df[fields].corr(method=corr_type)
    lower_source = secondary_df[fields].corr(method=corr_type)

    # Stitch the two matrices together: primary above/on the diagonal,
    # secondary strictly below it.
    merged = np.zeros_like(upper_source)
    upper_idx = np.triu_indices_from(merged)
    lower_idx = np.tril_indices_from(merged, k=-1)
    merged[upper_idx] = upper_source.values[upper_idx]
    merged[lower_idx] = lower_source.values[lower_idx]
    print(upper_source, '\n', '\n', lower_source)

    size = len(merged)
    tick_positions = range(size)
    # Cell borders sit at half-integers, so this traces the main diagonal
    # from the top-left corner to the bottom-right corner of the image.
    cell_edges = np.arange(size + 1) - 0.5

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(cell_edges, cell_edges, color='black', linewidth=2)
    im = ax.imshow(merged, cmap='viridis', vmin=0, vmax=1, interpolation='nearest')
    fig.colorbar(im, ax=ax, label='Correlation Coefficient', ticks=np.linspace(0, 1, 11))

    ax.set_xticks(tick_positions, labels=fields, rotation=90)
    ax.set_yticks(tick_positions, labels=fields, rotation=0)
    ax.xaxis.tick_top()
    ax.set_xlabel('4th Grade of ESO')
    ax.set_ylabel('6th Grade of primary')
    ax.yaxis.set_label_position('right')
    plt.tight_layout()

In [3]:
def plot_corr_single(df, fields, corr_type, vmin=0.5, vmax=0.8):
    """Plot the correlation matrix of ``fields`` as a heatmap.

    The full correlation table is printed, then drawn with its diagonal
    blanked out (NaN) so the trivial self-correlations do not dominate the
    colour scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``fields``.
    fields : list
        Column names to correlate; also used as the tick labels.
    corr_type : str or callable
        Passed unchanged to ``DataFrame.corr(method=...)``.
    vmin, vmax : float, optional
        Limits of the colour scale and of the colour-bar ticks. The defaults
        (0.5 and 0.8) are the values that were previously hard-coded.
    """
    correlations = df[fields].corr(method=corr_type)
    print(correlations)

    # Blank the diagonal on a private, writable copy. Writing through
    # ``correlations.values`` only works when pandas hands back a writable
    # view; with Copy-on-Write enabled that array is read-only and the
    # in-place fill raises.
    masked = correlations.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(masked, np.nan)

    # Plot this array as a heatmap
    tick_positions = range(len(correlations))
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(masked, cmap='viridis', vmin=vmin, vmax=vmax, interpolation='nearest')
    fig.colorbar(im, ax=ax, label='Correlation Coefficient', ticks=np.linspace(vmin, vmax, 11))
    ax.set_xticks(tick_positions, labels=fields, rotation=90)
    ax.set_yticks(tick_positions, labels=fields, rotation=0)
    plt.tight_layout()

In [5]:
def plot_boxes_double(primary_df, secondary_df, fields):
    """Draw paired box plots of ``fields`` for two datasets.

    For every field a box built from ``primary_df`` is drawn at an integer
    x position (1, 2, ...) and a box built from ``secondary_df`` is drawn
    0.25 to its right. Only the primary box carries the field name as its
    tick label; the secondary box gets an empty label. Missing values are
    dropped per column before plotting.

    Parameters
    ----------
    primary_df, secondary_df : pandas.DataFrame
        Frames that both contain every column listed in ``fields``.
    fields : list
        Column names to plot.
    """
    positions_primary = np.arange(1, len(fields) + 1)
    positions_secondary = positions_primary + 0.25

    # All primary boxes first, then all secondary boxes, matching the order
    # of the concatenated positions below.
    data_to_plot = [primary_df[field].dropna().values for field in fields]
    data_to_plot += [secondary_df[field].dropna().values for field in fields]
    labels = [f"{field}" for field in fields] + [''] * len(fields)

    fig, ax = plt.subplots()
    ax.boxplot(
        data_to_plot,
        positions=np.concatenate((positions_primary, positions_secondary)),
        labels=labels,
        widths=0.2,
    )

    ax.tick_params(axis='x', which='both', bottom=False, top=False)
    ax.set_ylabel('Values')
    plt.tight_layout()
    # The original referenced ``plt.show`` without calling it, which is a no-op.
    plt.show()

In [6]:
def plot_violin_double(primary_df, secondary_df, fields):
    """Draw split violin plots comparing ``fields`` across two datasets.

    Each field gets one violin whose two halves show the distribution in
    ``primary_df`` (tagged ``'primary'``) and in ``secondary_df`` (tagged
    ``'secondary'``). The y axis is fixed to 0-100 with a tick every 10 units
    starting at 0. Neither input frame is modified.

    Parameters
    ----------
    primary_df, secondary_df : pandas.DataFrame
        Frames that both contain every column listed in ``fields``.
    fields : list
        Column names to plot.
    """
    # Keep the plot output free of library warnings, but only for the duration
    # of this call: a bare ``warnings.filterwarnings("ignore")`` would silence
    # every later warning in the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # ``assign`` returns new frames, so the 'Grade' tag is never written
        # onto a slice of the caller's data.
        tagged_primary = primary_df[fields].assign(Grade='primary')
        tagged_secondary = secondary_df[fields].assign(Grade='secondary')
        combined_data = pd.concat([tagged_primary, tagged_secondary], axis=0)
        melted_data = pd.melt(combined_data, id_vars=['Grade'], value_vars=fields,
                              var_name='Subject', value_name='Marks')
        sns.violinplot(x='Subject', y='Marks', hue='Grade', data=melted_data,
                       split=True, palette='viridis', cut=1, inner='quart')

        plt.ylabel('Values')
        plt.xlabel(None)

        # Keep only the legend handles that belong to the two grade labels.
        handles, labels = plt.gca().get_legend_handles_labels()
        unique_labels = ['primary', 'secondary']
        unique_handles = [handle for handle, label in zip(handles, labels)
                          if label in unique_labels]

        plt.legend(unique_handles, unique_labels, title='Grade', loc=3)
        plt.tight_layout()
        plt.ylim(0, 100)
        plt.yticks(np.arange(0, 100, 10))
        plt.grid(axis='y')
        plt.show()

In [7]:
def plot_corr_dif_double(primary_df, secondary_df, fields,corr_type):
    """Plot how the correlations between ``fields`` differ across two datasets.

    The heatmap's upper triangle holds ``corr(primary) - corr(secondary)`` and
    its strictly lower triangle holds the opposite sign,
    ``corr(secondary) - corr(primary)``. The diagonal is blanked out (NaN).
    Both signed difference tables are printed before plotting.

    Parameters
    ----------
    primary_df, secondary_df : pandas.DataFrame
        Frames that both contain every column listed in ``fields``.
    fields : list
        Column names to correlate; also used as the tick labels.
    corr_type : str or callable
        Passed unchanged to ``DataFrame.corr(method=...)``.
    """
    primary_corr = primary_df[fields].corr(method=corr_type)
    secondary_corr = secondary_df[fields].corr(method=corr_type)

    # The two signed views of the same difference.
    primary_minus_secondary = primary_corr - secondary_corr
    secondary_minus_primary = -primary_minus_secondary

    merged = np.zeros_like(primary_minus_secondary)
    upper_idx = np.triu_indices_from(merged)
    lower_idx = np.tril_indices_from(merged, k=-1)
    merged[upper_idx] = primary_minus_secondary.values[upper_idx]
    merged[lower_idx] = secondary_minus_primary.values[lower_idx]
    print(primary_minus_secondary, '\n', '\n', secondary_minus_primary)
    np.fill_diagonal(merged, np.nan)

    size = len(merged)
    tick_positions = range(size)
    # Cell borders sit at half-integers, so this traces the main diagonal.
    cell_edges = np.arange(size + 1) - 0.5

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(cell_edges, cell_edges, color='black', linewidth=2)
    im = ax.imshow(merged, cmap='viridis', vmin=-0.2, vmax=0.2, interpolation='nearest')
    fig.colorbar(im, ax=ax, label='Correlation Coefficient difference', ticks=np.linspace(-0.2, 0.2, 11))

    ax.set_xticks(tick_positions, labels=fields, rotation=90)
    ax.set_yticks(tick_positions, labels=fields, rotation=0)
    ax.xaxis.tick_top()
    ax.set_xlabel('ESO-Primary')
    ax.set_ylabel('Primary-Eso')
    ax.yaxis.set_label_position('right')
    plt.tight_layout()

In [18]:
def plot_scatter_years(df, fields, years, legend, corr_type, year_col='ANY'):
    """Plot how the correlation of every pair of ``fields`` evolves over years.

    For each unordered pair of fields the correlation is computed separately
    within every requested year, drawn as a scatter series labelled
    ``"<field1>/<field2>"``, and overlaid with a dashed least-squares
    regression line fitted through those yearly values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``year_col`` and every column listed in ``fields``.
    fields : sequence
        Column names; every pair ``(fields[i], fields[j])`` with ``i < j`` is
        plotted.
    years : sequence of numbers
        Values of ``year_col`` to include, used as the x coordinates.
    legend : bool
        Any truthy value draws the legend outside the axes, on the right.
    corr_type : str or callable
        Passed unchanged to ``DataFrame.corr(method=...)``.
    year_col : str, optional
        Column to group by. Defaults to ``'ANY'``, the name that was
        previously hard-coded.
    """
    # Array form so the regression line can be evaluated arithmetically
    # whatever sequence type the caller passed in.
    year_values = np.asarray(years)

    with sns.color_palette("viridis"):
        grouped = df.groupby(year_col)
        for i, field1 in enumerate(fields):
            for field2 in fields[i + 1:]:
                # A fresh array per pair: nothing handed to matplotlib is
                # overwritten by the next pair's values.
                correlation = np.array([
                    grouped.get_group(year)[[field1, field2]].corr(method=corr_type).iloc[0, 1]
                    for year in years
                ])
                plt.scatter(year_values, correlation, label=str(field1)+'/'+str(field2))

                # Compute and plot the corresponding linear regression
                slope, intercept, _, _, _ = linregress(year_values, correlation)
                regression_line = slope * year_values + intercept
                plt.plot(year_values, regression_line, linestyle='--')

        # Truthiness check so flags such as ``np.True_`` or ``1`` also work.
        if legend:
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')