In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm

# Load the data
# The CSV location is kept in one named constant so it is easy to spot and change.
DATA_PATH = "data/featured_clinvar_result.csv"
df = pd.read_csv(DATA_PATH)

In [16]:
# Part 1: Descriptive Statistics
# Overview of the frame: dimensions first, then a preview, the column dtypes
# and the per-column null counts, each under its own banner.
print("=== Basic Dataset Information ===")
print(f"Dataset shape: {df.shape}")

overview_sections = (
    ("\n=== First few rows ===", df.head()),
    ("\n=== Data Types ===", df.dtypes),
    ("\n=== Missing Values ===", df.isnull().sum()),
)
for banner, content in overview_sections:
    print(banner)
    print(content)

# describe() is restricted to the numeric columns of the frame.
print("\n=== Summary Statistics for Numerical Variables ===")
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_summary = df[numeric_cols].describe()
print(numeric_summary)

# Check distribution of categorical variables.
# The banner is always printed; the counts only appear when the column exists.
categorical_sections = (
    ("\n=== Variant Type Distribution ===", 'type'),
    ("\n=== Label Distribution ===", 'label'),
)
for banner, column in categorical_sections:
    print(banner)
    if column in df.columns:
        print(df[column].value_counts())


=== Basic Dataset Information ===
Dataset shape: (1215, 23)

=== First few rows ===
   Unnamed: 0   position  alignment_score  mc_synonymous_variant  \
0           0  154021863               92                      0   
1           1  154021863               80                      0   
2           2  154022510              110                      0   
3           3  154022618              101                      0   
4           4  154024527              110                      0   

   mc_3_prime_UTR_variant  mc_5_prime_UTR_variant  mc_splice_donor_variant  \
0                       1                       0                        0   
1                       1                       0                        0   
2                       1                       0                        0   
3                       1                       0                        0   
4                       1                       0                        0   

   mc_splice_acceptor_variant  mc_nons

In [17]:

# Set up the plotting environment: one 2x2 figure, each panel addressed through
# its own Axes object instead of the implicit "current axes".
fig = plt.figure(figsize=(15, 10))
sns.set(style="whitegrid")
(ax_score, ax_splice), (ax_by_type, ax_by_label) = fig.subplots(2, 2)

has_score = 'alignment_score' in df.columns

# Panel 1: Distribution of alignment scores
if has_score:
    sns.histplot(df['alignment_score'], kde=True, ax=ax_score)
    ax_score.set_title('Distribution of Alignment Scores')
    ax_score.set_xlabel('Alignment Score')
    ax_score.set_ylabel('Frequency')

# Panel 2: Distribution of splice distances
if 'splice_distance' in df.columns:
    sns.histplot(df['splice_distance'], kde=True, ax=ax_splice)
    ax_splice.set_title('Distribution of Splice Distances')
    ax_splice.set_xlabel('Splice Distance')
    ax_splice.set_ylabel('Frequency')

# Panel 3: Boxplot comparing alignment scores across variant types
if 'type' in df.columns and has_score:
    sns.boxplot(x='type', y='alignment_score', data=df, ax=ax_by_type)
    ax_by_type.set_title('Alignment Score by Variant Type')
    # Tilt the category labels so longer names do not overlap.
    plt.setp(ax_by_type.get_xticklabels(), rotation=45)

# Panel 4: Boxplot comparing alignment scores across labels
if 'label' in df.columns and has_score:
    sns.boxplot(x='label', y='alignment_score', data=df, ax=ax_by_label)
    ax_by_label.set_title('Alignment Score by Label')
    plt.setp(ax_by_label.get_xticklabels(), rotation=45)

# The figure is written to disk and then closed, so nothing is rendered inline.
fig.tight_layout()
fig.savefig('basic_distributions.png')
plt.close(fig)

In [18]:

# Create correlation matrix for numerical variables
numeric_df = df.select_dtypes(include=[np.number])

# Columns named 'Unnamed: N' are row indices that were written into the CSV and
# read back as data. They are numeric, so they would otherwise show up in the
# heatmap as a variable and produce correlations that only reflect row order.
index_artifact_cols = [col for col in numeric_df.columns if str(col).startswith('Unnamed:')]
numeric_df = numeric_df.drop(columns=index_artifact_cols)

correlation_matrix = numeric_df.corr()

# Plot correlation heatmap; the figure is saved to disk and closed rather than shown inline.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

In [19]:

# Function to perform statistical tests between groups
def group_comparison(df, group_col, measure_col):
    """Test whether ``measure_col`` differs between the groups of ``group_col``.

    Missing group labels are ignored, missing measurements are dropped, and a
    group left with no measurements is excluded before the test is chosen.
    With exactly two remaining groups a t-test with ``equal_var=False`` is run;
    with more than two a one-way ANOVA is run. The result is also printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    group_col : str
        Column whose distinct values define the groups.
    measure_col : str
        Numeric column compared across the groups.

    Returns
    -------
    tuple
        ``(statistic, p_value)``, or ``(None, None)`` when fewer than two
        groups have data.
    """
    # unique() reports NaN/None as a label, but `column == NaN` never matches a
    # row. Counting it would turn a two-group column into three groups and feed
    # an empty sample to ANOVA, so missing labels are removed first.
    labels = df[group_col].dropna().unique()

    # Keep labels and samples aligned, skipping groups with no usable values.
    groups = []
    groups_data = []
    for label in labels:
        values = df.loc[df[group_col] == label, measure_col].dropna()
        if len(values) > 0:
            groups.append(label)
            groups_data.append(values)

    if len(groups) == 2:  # For binary comparisons, use t-test
        t_stat, p_value = stats.ttest_ind(groups_data[0], groups_data[1], equal_var=False)
        print(f"T-test comparing {measure_col} between {groups[0]} and {groups[1]}")
        print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
        return t_stat, p_value
    elif len(groups) > 2:  # For multiple groups, use ANOVA
        f_stat, p_value = stats.f_oneway(*groups_data)
        print(f"ANOVA test for {measure_col} across {group_col} groups")
        print(f"F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")
        return f_stat, p_value
    else:
        print(f"Not enough groups in {group_col} for comparison")
        return None, None

# Perform group comparisons if the relevant columns exist.
# Each entry is (grouping column, measured column); a pair is skipped when
# either column is missing from the frame.
print("\n=== Group Comparisons ===")
planned_comparisons = (
    ('type', 'alignment_score'),
    ('label', 'alignment_score'),
    ('label', 'splice_distance'),
)
for grouping, measure in planned_comparisons:
    if grouping in df.columns and measure in df.columns:
        group_comparison(df, grouping, measure)



=== Group Comparisons ===
ANOVA test for alignment_score across type groups
F-statistic: 103.1465, p-value: 0.0000
T-test comparing alignment_score between 0 and 1
t-statistic: 7.2401, p-value: 0.0000
T-test comparing splice_distance between 0 and 1
t-statistic: 5.4410, p-value: 0.0000


In [20]:
# Scatter plot of position vs alignment score
# The coordinate column in this dataset is named 'position'. The shorter 'pos'
# is still accepted as a fallback so frames using that name keep working.
position_col = next((col for col in ('position', 'pos') if col in df.columns), None)

if position_col is not None and 'alignment_score' in df.columns:
    # The figure is created only when there is something to draw, so a missing
    # column no longer leaves an empty figure open.
    plt.figure(figsize=(10, 6))
    plt.scatter(df[position_col], df['alignment_score'], alpha=0.5)
    plt.title('Position vs Alignment Score')
    plt.xlabel('Position')
    plt.ylabel('Alignment Score')
    plt.savefig('position_vs_score.png')
    plt.close()

<Figure size 1000x600 with 0 Axes>

In [21]:
# Relationship between splice distance and variant effect.
# Every column whose name starts with 'mc_' is treated as an effect column; the
# first four (at most) each get one panel of a 2x2 grid.
if 'splice_distance' in df.columns:
    variant_effect_cols = [name for name in df.columns if name.startswith('mc_')]

    if variant_effect_cols:
        fig = plt.figure(figsize=(12, 8))

        # Panels are numbered from 1, matching matplotlib's subplot indexing.
        for panel, effect_col in enumerate(variant_effect_cols[:4], start=1):
            ax = fig.add_subplot(2, 2, panel)
            sns.scatterplot(x='splice_distance', y=effect_col, data=df, ax=ax)
            ax.set_title(f'Splice Distance vs {effect_col}')

        # Saved to disk and closed, so the figure is not rendered inline.
        fig.tight_layout()
        fig.savefig('splice_distance_vs_effects.png')
        plt.close(fig)

In [22]:

# Final status message. This only prints; it does not check that the PNG files
# written by the earlier cells actually exist.
print("\nAnalysis complete! All visualizations have been saved as PNG files.")


Analysis complete! All visualizations have been saved as PNG files.
