In [4]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import pointbiserialr

# Every figure in this notebook is written under plots/. savefig() does not
# create missing directories, so make sure the folder exists before any
# plotting cell runs (otherwise a fresh checkout fails with FileNotFoundError).
Path("plots").mkdir(parents=True, exist_ok=True)

# Feature table analysed by the rest of the notebook.
df = pd.read_csv("data/featured_clinvar_result.csv")

In [5]:

# === Basic Dataset Summary ===
# Shape, first rows, per-column null counts and dtypes of the loaded frame.
print("=== Dataset Info ===")
print(f"Shape: {df.shape}\n")
print(df.head())
print("\nMissing values:", df.isna().sum(), sep="\n")
print("\nData types:", df.dtypes, sep="\n")

# === Descriptive statistics ===
# numeric_cols is reused by the correlation heatmap cell further down.
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("\n=== Descriptive Statistics ===", df[numeric_cols].describe(), sep="\n")

# === Categorical summaries ===
# Both columns are treated as optional: counts are printed only when present.
for column_name, heading in (
    ("type", "\nType distribution:"),
    ("label", "\nLabel distribution:"),
):
    if column_name in df.columns:
        print(heading)
        print(df[column_name].value_counts())


=== Dataset Info ===
Shape: (1215, 29)

    position  alignment_score  mc_synonymous_variant  mc_frameshift_variant  \
0  154021863               53                      0                      0   
1  154021863               54                      0                      0   
2  154022510              108                      0                      0   
3  154022618              106                      0                      0   
4  154024527              105                      0                      0   

   mc_3_prime_UTR_variant  mc_5_prime_UTR_variant  mc_splice_donor_variant  \
0                       1                       0                        0   
1                       1                       0                        0   
2                       1                       0                        0   
3                       1                       0                        0   
4                       1                       0                        0   

   mc_splice_acc

In [6]:
# === Visualization Section ===
# One 2x2 figure: histograms (with KDE) on the top row, box plots of
# alignment_score on the bottom row. Saved to disk and closed, not displayed.
sns.set(style="whitegrid")
fig = plt.figure(figsize=(15, 10))

# Plots 1-2: distribution of each continuous feature
histogram_panels = [
    ("alignment_score", "Alignment Score Distribution"),
    ("splice_distance", "Splice Distance Distribution"),
]
for slot, (feature, heading) in enumerate(histogram_panels, start=1):
    ax = fig.add_subplot(2, 2, slot)
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(heading)

# Plots 3-4: alignment score split by variant type and by label
boxplot_panels = [
    ("type", "Alignment Score by Variant Type"),
    ("label", "Alignment Score by Label"),
]
for slot, (grouping, heading) in enumerate(boxplot_panels, start=3):
    ax = fig.add_subplot(2, 2, slot)
    sns.boxplot(x=grouping, y='alignment_score', data=df, ax=ax)
    ax.set_title(heading)

fig.tight_layout()
fig.savefig('plots/distributions.png')
plt.close(fig)

In [7]:
# Correlation matrix of the numeric columns (numeric_cols comes from the
# dataset-summary cell), drawn as an annotated heatmap and saved to disk.
corr = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig('plots/correlation_heatmap.png')
plt.close(fig)


In [8]:
# === Statistical Tests ===
def group_comparison(df, group_col, target_col):
    """Test whether ``target_col`` differs between the groups of ``group_col``.

    Runs Welch's t-test (``equal_var=False``) when exactly two groups have
    data, or a one-way ANOVA when more than two do, and prints the statistic
    and p-value. Nothing is returned.

    Missing values are excluded before testing: a NaN group label is not
    treated as a group of its own, and NaN target values are dropped from
    each sample. Without this, a single missing label would turn a two-group
    t-test into an ANOVA with an empty sample, and a single missing target
    value would make the statistic NaN.

    Args:
        df: DataFrame containing both columns.
        group_col: Name of the column whose distinct values define the groups.
        target_col: Name of the numeric column compared across groups.
    """
    # Group order follows first appearance in the column, so the sign of the
    # t statistic matches what the unfiltered data would give.
    samples = []
    for level in df[group_col].dropna().unique():
        values = df.loc[df[group_col] == level, target_col].dropna()
        if len(values) > 0:
            samples.append(values)

    if len(samples) == 2:
        t_stat, p = stats.ttest_ind(samples[0], samples[1], equal_var=False)
        print(f"T-test ({group_col} vs {target_col}): t={t_stat:.3f}, p={p:.4f}")
    elif len(samples) > 2:
        f_stat, p = stats.f_oneway(*samples)
        print(f"ANOVA ({group_col} vs {target_col}): F={f_stat:.3f}, p={p:.4f}")
    else:
        # Previously this case produced no output at all.
        print(
            f"Skipping ({group_col} vs {target_col}): "
            f"need at least 2 groups with data, found {len(samples)}"
        )

# (grouping column, target column) pairs to test, in reporting order.
comparisons = (
    ('label', 'alignment_score'),
    ('label', 'splice_distance'),
    ('type', 'alignment_score'),
)

print("\n=== Statistical Group Comparisons ===")
for grouping, outcome in comparisons:
    group_comparison(df, grouping, outcome)



=== Statistical Group Comparisons ===
T-test (label vs alignment_score): t=7.263, p=0.0000
T-test (label vs splice_distance): t=5.441, p=0.0000
ANOVA (type vs alignment_score): F=104.581, p=0.0000


In [9]:
# Columns prefixed "mc_" are correlated one by one against the label column.
# effect_cols is reused by the splice-distance scatter cell further down.
effect_cols = [col for col in df.columns if col.startswith("mc_")]
print("\n=== Point-Biserial Correlation with Label ===")
for col in effect_cols:
    # Keep only rows where both values are present; a single NaN would
    # otherwise make r and p NaN for the whole column.
    pair = df[[col, 'label']].dropna()
    # Named r_value/p_value (not corr/pval) so the loop does not overwrite the
    # `corr` correlation-matrix DataFrame created by the heatmap cell.
    r_value, p_value = pointbiserialr(pair[col], pair['label'])
    print(f"{col}: r={r_value:.3f}, p={p_value:.4f}")



=== Point-Biserial Correlation with Label ===
mc_synonymous_variant: r=-0.557, p=0.0000
mc_frameshift_variant: r=0.660, p=0.0000
mc_3_prime_UTR_variant: r=-0.223, p=0.0000
mc_5_prime_UTR_variant: r=0.135, p=0.0000
mc_splice_donor_variant: r=0.079, p=0.0057
mc_splice_acceptor_variant: r=0.067, p=0.0196
mc_nonsense: r=0.271, p=0.0000
mc_intron_variant: r=-0.105, p=0.0003
mc_missense_variant: r=-0.055, p=0.0550
mc_stop_lost: r=0.079, p=0.0057


In [10]:
# === Scatter plots ===
# Alignment score against position; skipped when the frame has no
# 'position' column. The figure is saved to disk and closed, not displayed.
if 'position' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['position'], df['alignment_score'], alpha=0.4)
    ax.set_title('Position vs Alignment Score')
    ax.set_xlabel('Position')
    ax.set_ylabel('Alignment Score')
    fig.tight_layout()
    fig.savefig('plots/position_vs_score.png')
    plt.close(fig)

In [11]:
# Splice distance against the first four "mc_" columns, one panel each on a
# 2x2 grid. Panels are added one at a time, so fewer than four columns simply
# yields fewer panels.
fig = plt.figure(figsize=(12, 8))
for panel, col in enumerate(effect_cols[:4], start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.scatterplot(x='splice_distance', y=col, data=df, ax=ax)
    ax.set_title(f'Splice Distance vs {col}')

fig.tight_layout()
fig.savefig('plots/splice_distance_vs_effects.png')
plt.close(fig)


In [12]:


# Final status line printed once every cell above has run.
completion_message = "\nAll analysis and plots completed. Files saved in 'plots/' directory."
print(completion_message)



All analysis and plots completed. Files saved in 'plots/' directory.
