In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [None]:
def calculate_outliers(dataframe, column, iqr_multiplier=1.5):
    """Return the rows of ``dataframe`` whose ``column`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where ``Q1`` / ``Q3`` are the 0.25 / 0.75
    quantiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter.
    column : label
        Name of the column whose values are tested.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (all columns, original index) flagged as outliers.
        Rows whose value is NaN fail both comparisons and are never returned.

    Raises
    ------
    ValueError
        If ``iqr_multiplier`` is negative (the fences would cross over).
    """
    if iqr_multiplier < 0:
        raise ValueError(f"iqr_multiplier must be non-negative, got {iqr_multiplier}")

    # Look the column up once instead of on every use.
    values = dataframe[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence_width = iqr_multiplier * (q3 - q1)
    lower_bound = q1 - fence_width
    upper_bound = q3 + fence_width

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return dataframe[is_outlier]

In [None]:
def print_outlier_counts(dataframe):
    """Print, for every numeric column, how many rows are flagged as outliers.

    Each column selected by ``select_dtypes(include='number')`` is passed to
    ``calculate_outliers``. Columns with at least one flagged row get their
    own line with the row count; columns with none are collected and reported
    together in a single closing line (omitted when there are no such columns).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose numeric columns are inspected.
    """
    clean_columns = []
    numeric_columns = dataframe.select_dtypes(include='number').columns

    for name in numeric_columns:
        flagged = calculate_outliers(dataframe, name)
        if not flagged.empty:
            print(f"Outliers in {name}: {len(flagged)}")
            continue
        clean_columns.append(name)

    # One summary line for every column that had nothing to report.
    if clean_columns:
        print(f"No outliers found in columns: {', '.join(clean_columns)}")

In [None]:
def print_outlier_data(dataframe, col):
    """Print the rows that ``calculate_outliers`` flags for column ``col``.

    If ``col`` is not a column of ``dataframe`` nothing is printed. If no row
    is flagged, a single "No outliers found" line is printed instead.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    col : label
        Name of the column to check.
    """
    # Unknown column: stay silent.
    if col not in dataframe.columns:
        return

    flagged = calculate_outliers(dataframe, col)
    if flagged.empty:
        print(f"No outliers found in {col}")
        return

    print(f"\nOutliers in {col}:")
    print(flagged)

In [None]:
def plot_feature_by_pos(dataframe, column):
    """Draw a bar chart of the mean of ``column`` for each value of ``'POS'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'POS'`` column (used as the grouping key) and ``column``.
    column : label
        Name of the column that is averaged within each POS group.
    """
    # One mean per POS group; the group labels become the x categories.
    pos_means = dataframe.groupby('POS')[column].mean()

    # seaborn draws on the current Axes and returns it, so the labels below
    # are applied to that same Axes.
    ax = sns.barplot(x=pos_means.index, y=pos_means.values, edgecolor='black')
    # ax.set_title(f'Mean {column} by POS')  # title currently disabled
    ax.set_xlabel('Part of Speech (POS)')
    ax.set_ylabel(f'Mean {column}')

    plt.show()

In [None]:
def print_mean_per_pos(dataframe, column):
    """Print the mean of ``column`` per ``'POS'`` group, rounded to 2 decimals.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'POS'`` column (used as the grouping key) and ``column``.
    column : label
        Name of the column that is averaged within each POS group.
    """
    per_pos = dataframe.groupby('POS')[column]
    print(per_pos.mean().round(2))

In [None]:
def calculate_correlation_and_pvalue(dataframe, simlex999, standard_deviation, alpha=0.05):
    """Compute the Spearman rank correlation between two columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    simlex999 : label
        Name of the first column. Despite the parameter name, any column
        label works; it is only used to index ``dataframe`` and is echoed
        back under ``'column_1'``.
    standard_deviation : label
        Name of the second column; echoed back under ``'column_2'``.
    alpha : float, default 0.05
        Significance level. The result is labelled ``'significant'`` when
        the p-value is strictly below ``alpha``. The default reproduces the
        previous hard-coded threshold.

    Returns
    -------
    dict
        ``'column_1'`` and ``'column_2'`` (the two labels), ``'correlation'``
        (the coefficient rounded to 5 decimals), ``'p_value'`` (unrounded)
        and ``'significance'`` (``'significant'`` or ``'not significant'``).
    """
    corr, p_value = spearmanr(dataframe[simlex999], dataframe[standard_deviation])

    # A NaN p-value compares False here and is reported as 'not significant'.
    significance = 'significant' if p_value < alpha else 'not significant'

    return {
        'column_1': simlex999,
        'column_2': standard_deviation,
        'correlation': round(corr, 5),
        'p_value': p_value,
        'significance': significance
    }

In [None]:
def plot_scatter_plot(dataframe, column1, column2, x_label, y_label, x_min, x_max, y_min, y_max):
    """Show a scatter plot of ``column2`` against ``column1`` with a reference line.

    A new 7x5 figure is created, the points are drawn with seaborn, and a red
    dashed line is added from ``(x_min, y_min)`` to ``(x_max, y_max)``. The
    axes are clipped to the given limits, the grid is switched off, and the
    figure is shown.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    column1, column2 : label
        Column names plotted on the x and y axis respectively.
    x_label, y_label : str
        Axis labels.
    x_min, x_max, y_min, y_max : float
        Axis limits; they also define the end points of the reference line.
    """
    fig, ax = plt.subplots(figsize=(7, 5))

    # Data points
    sns.scatterplot(
        data=dataframe,
        x=column1,
        y=column2,
        color='skyblue',
        edgecolor='black',
        ax=ax,
    )

    # Reference line running corner to corner of the visible area
    line_x = [x_min, x_max]
    line_y = [y_min, y_max]
    ax.plot(line_x, line_y, color='red', linestyle='--')

    # Limits and labels in one call
    ax.set(
        xlim=(x_min, x_max),
        ylim=(y_min, y_max),
        xlabel=x_label,
        ylabel=y_label,
    )

    ax.grid(False)

    plt.tight_layout()
    plt.show()