In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [None]:
def print_mean_median(dataframe):
    """Print the mean (rounded to 2 decimals) and median of each numeric column.

    Args:
        dataframe: pandas DataFrame; only columns selected by
            ``select_dtypes(include='number')`` are reported.

    Returns:
        None. One line per numeric column is written to stdout.
    """
    numeric_names = dataframe.select_dtypes(include='number').columns
    for name in numeric_names:
        series = dataframe[name]
        average = round(series.mean(), 2)
        # The median is reported unrounded.
        print(f"{name} - Mean: {average}, Median: {series.median()}")

In [None]:
def plot_boxplot(dataframe, column):
    """Draw a vertical boxplot of one column and display it.

    Note: ``sns.set_theme`` changes seaborn/matplotlib defaults globally, so
    the "whitegrid" style remains active for plots created afterwards.

    Args:
        dataframe: pandas DataFrame holding the data.
        column: name of the column to plot; also used as the y-axis label.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # The theme must be set before the axes exist for the style to apply.
    sns.set_theme(style="whitegrid")

    _, axis = plt.subplots(figsize=(4, 6))
    sns.boxplot(y=dataframe[column], color='skyblue', ax=axis)

    axis.set_ylabel(column)

    # Keep the y-axis readable: at most 10 major tick intervals.
    axis.yaxis.set_major_locator(ticker.MaxNLocator(nbins=10))

    plt.show()

In [None]:
def print_variance(dataframe):
    """Print the variance of each numeric column, rounded to 2 decimals.

    Args:
        dataframe: pandas DataFrame; only columns selected by
            ``select_dtypes(include='number')`` are reported.

    Returns:
        None. One line per numeric column is written to stdout.
    """
    numeric_names = dataframe.select_dtypes(include='number').columns
    for name in numeric_names:
        variance = round(dataframe[name].var(), 2)
        print(f"{name} - Variance: {variance}")

In [None]:
def plot_histogram(dataframe, column):
    """Draw a 10-bin histogram of one column on the current axes and display it.

    Args:
        dataframe: pandas DataFrame holding the data.
        column: name of the column to plot; also used as the x-axis label.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # Missing values cannot be binned, so they are excluded up front.
    values = dataframe[column].dropna()

    axis = plt.gca()
    axis.hist(values, bins=10, edgecolor='black', color='skyblue')

    axis.set_ylabel("Frequency")
    axis.set_xlabel(column)

    # Only place major x ticks on integer positions.
    axis.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

    # Explicitly switch the grid off. The previous ``grid(b=None)`` relied on
    # the ``b`` keyword, which Matplotlib renamed to ``visible`` (3.5) and
    # newer releases reject; where accepted, ``None`` toggles the grid rather
    # than disabling it.
    axis.grid(False)

    plt.show()

In [None]:
def calculate_outliers(dataframe, column):
    """Return the rows whose value in ``column`` lies outside the 1.5*IQR fences.

    A row is an outlier when its value is strictly below ``Q1 - 1.5*IQR`` or
    strictly above ``Q3 + 1.5*IQR``. Rows with a missing value in ``column``
    satisfy neither comparison and are therefore not returned.

    Args:
        dataframe: pandas DataFrame holding the data.
        column: name of the column to examine.

    Returns:
        DataFrame containing the outlier rows (all columns, original index).
    """
    values = dataframe[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    too_low = values.lt(first_quartile - fence_width)
    too_high = values.gt(third_quartile + fence_width)
    return dataframe[too_low | too_high]

In [None]:
def print_outlier_counts(dataframe):
    """Print the number of outlier rows for every numeric column.

    Columns with at least one outlier (as determined by
    ``calculate_outliers``) get their own line; columns without any are
    collected and listed together in a single closing line.

    Args:
        dataframe: pandas DataFrame; only columns selected by
            ``select_dtypes(include='number')`` are checked.

    Returns:
        None. Results are written to stdout.
    """
    clean_columns = []
    for name in dataframe.select_dtypes(include='number').columns:
        flagged = calculate_outliers(dataframe, name)
        if not flagged.empty:
            print(f"Outliers in {name}: {len(flagged)}")
            continue
        clean_columns.append(name)

    # Summarise the outlier-free columns once, after the per-column lines.
    if clean_columns:
        print(f"No outliers found in columns: {', '.join(clean_columns)}")

In [None]:
def print_outlier_data(dataframe, col):
    """Print the outlier rows of a single column.

    If ``col`` is not a column of ``dataframe`` the function prints nothing.

    Args:
        dataframe: pandas DataFrame holding the data.
        col: name of the column to examine.

    Returns:
        None. Either the outlier rows or a "no outliers" message is printed.
    """
    # Unknown column: deliberately a silent no-op.
    if col not in dataframe.columns:
        return

    flagged = calculate_outliers(dataframe, col)
    if flagged.empty:
        print(f"No outliers found in {col}")
    else:
        print(f"\nOutliers in {col}:")
        print(flagged)

In [None]:
def plot_feature_by_pos(dataframe, column):
    """Draw a bar chart of the mean of ``column`` per 'POS' group and display it.

    Args:
        dataframe: pandas DataFrame with a 'POS' column to group by.
        column: name of the column whose per-group mean is plotted.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    pos_means = dataframe.groupby('POS')[column].mean()

    # One bar per POS value; seaborn draws on (and returns) the current axes.
    axis = sns.barplot(x=pos_means.index, y=pos_means.values, edgecolor='black')
    axis.set_xlabel('Part of Speech (POS)')
    axis.set_ylabel(f'Mean {column}')
    plt.show()

In [None]:
def print_mean_per_pos(dataframe, column):
    """Print the mean of ``column`` for each 'POS' group, rounded to 2 decimals.

    Args:
        dataframe: pandas DataFrame with a 'POS' column to group by.
        column: name of the column to average.

    Returns:
        None. The resulting Series is written to stdout.
    """
    per_pos = dataframe.groupby('POS')[column]
    print(per_pos.mean().round(2))

In [None]:
def print_correlation_and_pvalue(dataframe, column1, column2, alpha=0.05):
    """Print Spearman's rank correlation between two columns and its p-value.

    Args:
        dataframe: pandas DataFrame holding the data.
        column1: name of the first column.
        column2: name of the second column.
        alpha: significance threshold; the correlation is reported as
            significant when the p-value is strictly below it. Defaults to
            0.05.

    Returns:
        None. The coefficient (rounded to 5 decimals), the p-value and a
        one-line verdict are written to stdout.
    """
    corr, p_value = spearmanr(dataframe[column1], dataframe[column2])
    print(f"Spearman's Rank correlation: {round(corr, 5)}")
    print(f"P-value: {p_value}")

    # A NaN result (e.g. missing values in the input) compares False against
    # any threshold, which would otherwise be reported as "not significant"
    # even though no test result exists.
    if pd.isna(corr) or pd.isna(p_value):
        print("The correlation could not be computed (result is NaN).\n")
    elif p_value < alpha:
        print("The correlation is statistically significant.\n")
    else:
        print("The correlation is not statistically significant.\n")

In [None]:
def plot_scatter_plot(dataframe, column1, column2):
    """Draw a scatter plot of two columns with a dashed y = x reference line.

    Both axes are set to the same range so the red diagonal runs corner to
    corner.

    Args:
        dataframe: pandas DataFrame holding the data.
        column1: name of the column plotted on the x-axis.
        column2: name of the column plotted on the y-axis.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(7, 5))

    sns.scatterplot(
        data=dataframe,
        x=column1,
        y=column2,
        color='skyblue',
        edgecolor='black',
        ax=axes,
    )

    # Take the widest span covered by either auto-scaled axis.
    x_low, x_high = axes.get_xlim()
    y_low, y_high = axes.get_ylim()
    low = min(x_low, y_low)
    high = max(x_high, y_high)

    # Identity line across the shared span.
    axes.plot([low, high], [low, high], color='red', linestyle='--')

    # Apply the shared span to both axes.
    axes.set_xlim(low, high)
    axes.set_ylim(low, high)

    axes.set_xlabel(column1)
    axes.set_ylabel(column2)

    axes.grid(False)

    plt.tight_layout()
    plt.show()