# SciX: Exploratory Data Analysis

This notebook explores the antibacterial polymer dataset and validates the Shannon entropy features.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.preprocessing import load_data, preprocess_data
from src.features import add_entropy_features, parse_blocks

# Apply the whitegrid style sheet to all figures drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style names are only present in newer
# Matplotlib releases (3.6+, to be confirmed against the pinned version).
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1. Load and Preprocess Data

In [None]:
# Read the raw dataset CSV through the project loader, then report its
# dimensions and column names.
raw_csv_path = '../Dataset final scix.xlsx - Dataset_Complete_modified.csv'
df = load_data(raw_csv_path)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

In [None]:
# Preview the top ten rows of the raw table
df.head(n=10)

In [None]:
# Report each column's dtype, then how many null entries each column holds
print("Data types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

In [None]:
# Run the project preprocessing step on the raw frame
df = preprocess_data(df)

# Summarise each MIC column after preprocessing: non-null count, range, centre
print("MIC distributions after preprocessing:")
for col in ('MIC_PAO1', 'MIC_SA', 'MIC_PAO1_PA'):
    mic_values = df[col]
    print(f"\n{col}:")
    print(f"  Count: {mic_values.count()}")
    print(f"  Min: {mic_values.min():.2f}, Max: {mic_values.max():.2f}")
    print(f"  Mean: {mic_values.mean():.2f}, Median: {mic_values.median():.2f}")

## 2. Explore Sequence Structures

In [None]:
# Tally how often each theoretical block sequence occurs
sequence_counts = df['block_sequence_theoretical'].value_counts()
print("Unique block sequences:")
print(sequence_counts)

In [None]:
# Bar chart: number of samples per block count, ordered by block count
block_count_freq = df['Number of blocks'].value_counts().sort_index()

plt.figure(figsize=(8, 5))
block_count_freq.plot.bar(color='steelblue', alpha=0.8)
plt.title('Distribution of Block Count')
plt.xlabel('Number of Blocks')
plt.ylabel('Count')
plt.xticks(rotation=0)  # keep the tick labels horizontal
plt.tight_layout()
plt.show()

In [None]:
# Run parse_blocks on the first ten distinct sequences and show input -> output
test_sequences = df['block_sequence_theoretical'].unique()[:10]
print("Block parsing examples:")
for seq in test_sequences:
    print(f"  {seq} -> {parse_blocks(seq)}")

## 3. Add and Explore Entropy Features

In [None]:
# Attach the entropy-derived columns via the project feature helper
df = add_entropy_features(df)

# Columns inspected throughout the rest of the notebook
entropy_cols = [
    'composition_entropy',
    'block_entropy',
    'sequence_entropy',
    'randomness_score',
]

# Summary statistics; the final expression is rendered as the cell output
print("Entropy feature statistics:")
df[entropy_cols].describe()

In [None]:
# One histogram per entropy feature, laid out on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

for ax, col in zip(axes.ravel(), entropy_cols):
    pretty_name = col.replace('_', ' ').title()
    ax.hist(df[col], bins=20, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set(
        xlabel=pretty_name,
        ylabel='Count',
        title=f'Distribution of {pretty_name}',
    )

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots of two entropy features, grouped by block count
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, human-readable label) for the left and right panel
boxplot_panels = [
    ('composition_entropy', 'Composition Entropy'),
    ('randomness_score', 'Randomness Score'),
]
for ax, (column, label) in zip(axes, boxplot_panels):
    df.boxplot(column=column, by='Number of blocks', ax=ax)
    ax.set_title(f'{label} by Block Count')
    ax.set_xlabel('Number of Blocks')
    ax.set_ylabel(label)

# Blank out the figure-level title so only the per-panel titles remain
plt.suptitle('')
plt.tight_layout()
plt.show()

## 4. MIC Distributions

In [None]:
# Histogram of each MIC target with its median marked by a dashed line.
# mic_cols and colors are reused by later cells.
mic_cols = ['MIC_PAO1', 'MIC_SA', 'MIC_PAO1_PA']
colors = ['#e74c3c', '#3498db', '#2ecc71']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col, color in zip(axes, mic_cols, colors):
    median_mic = df[col].median()
    ax.hist(df[col].dropna(), bins=20, color=color, alpha=0.7, edgecolor='black')
    ax.axvline(median_mic, color='red', linestyle='--', label=f'Median: {median_mic:.1f}')
    ax.set(xlabel='MIC Value', ylabel='Count', title=f'Distribution of {col}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the MIC targets, drawn as an annotated heatmap
mic_corr = df[mic_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    mic_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
)
plt.title('Correlation Between MIC Targets')
plt.tight_layout()
plt.show()

## 5. Feature Correlations

In [None]:
# Correlation heatmap across every analysed column.
# feature_cols is reused by later cells; the MIC targets come last.
feature_cols = (
    ['composition_A', 'composition_B1', 'composition_B2', 'composition_C']  # composition
    + ['Number of blocks', 'dpn', 'Dispersity', 'cLogP_predicted']  # other descriptor columns
    + ['composition_entropy', 'block_entropy', 'sequence_entropy', 'randomness_score']  # entropy
    + ['MIC_PAO1', 'MIC_SA', 'MIC_PAO1_PA']  # MIC targets
)

corr_matrix = df[feature_cols].corr()

plt.figure(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    annot_kws={'size': 8},  # small annotation font for the 15x15 grid
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every non-target feature with each MIC target:
# keep only the MIC columns, then drop the MIC rows (target-vs-target entries).
full_corr = df[feature_cols].corr()
mic_correlations = full_corr.loc[:, mic_cols].drop(index=mic_cols)

fig, ax = plt.subplots(figsize=(10, 8))
mic_correlations.plot.barh(ax=ax, width=0.8)
ax.axvline(0, color='black', linewidth=0.5)  # zero-correlation reference line
ax.set_xlabel('Correlation Coefficient')
ax.set_title('Feature Correlations with MIC Targets')
ax.legend(title='Target')
plt.tight_layout()
plt.show()

## 6. Entropy vs MIC Relationships

In [None]:
# Scatter plots: each entropy feature vs MIC_PAO1, coloured by block count.
#
# The figure uses constrained_layout instead of plt.tight_layout(): the shared
# colorbar below is attached to all four axes, and tight_layout does not
# account for such a colorbar (Matplotlib warns about incompatible Axes and
# the panels can end up underneath the colorbar).
fig, axes = plt.subplots(2, 2, figsize=(12, 10), constrained_layout=True)

# Pin one colour range for all panels so the single colorbar is valid for
# every scatter, even if a panel ends up masking some rows.
block_counts = df['Number of blocks']
color_min, color_max = block_counts.min(), block_counts.max()

for ax, ent_col in zip(axes.flat, entropy_cols):
    pretty_name = ent_col.replace('_', ' ').title()
    scatter = ax.scatter(df[ent_col], df['MIC_PAO1'],
                         c=block_counts, cmap='viridis',
                         vmin=color_min, vmax=color_max,
                         alpha=0.7, edgecolor='k', linewidth=0.5, s=60)
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('MIC_PAO1')
    ax.set_title(f'{pretty_name} vs MIC_PAO1')

# All panels share cmap and colour range, so the last mappable stands for all
fig.colorbar(scatter, ax=axes, label='Number of Blocks', shrink=0.6)
plt.show()

In [None]:
# For each MIC target, plot randomness score against MIC with one scatter
# series (and legend entry) per block count
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
block_counts = sorted(df['Number of blocks'].unique())

for ax, mic_col in zip(axes, mic_cols):
    for n_blocks in block_counts:
        in_group = df['Number of blocks'] == n_blocks
        ax.scatter(
            df.loc[in_group, 'randomness_score'],
            df.loc[in_group, mic_col],
            label=f'{n_blocks} blocks',
            alpha=0.7,
            s=60,
        )
    ax.set(
        xlabel='Randomness Score',
        ylabel=mic_col,
        title=f'Randomness Score vs {mic_col}',
    )
    ax.legend()

plt.tight_layout()
plt.show()

## 7. Composition Analysis

In [None]:
# One histogram per composition column, laid out on a 2x2 grid
comp_cols = ['composition_A', 'composition_B1', 'composition_B2', 'composition_C']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

for ax, col in zip(axes.ravel(), comp_cols):
    ax.hist(df[col], bins=20, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set(xlabel=col, ylabel='Count', title=f'Distribution of {col}')

plt.tight_layout()
plt.show()

In [None]:
# Predicted cLogP against each MIC target, one panel and one colour per target
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, mic_col, color in zip(axes, mic_cols, colors):
    ax.scatter(
        df['cLogP_predicted'],
        df[mic_col],
        c=color,
        s=60,
        alpha=0.6,
        edgecolor='k',
        linewidth=0.5,
    )
    ax.set(
        xlabel='cLogP (Predicted)',
        ylabel=mic_col,
        title=f'cLogP vs {mic_col}',
    )

plt.tight_layout()
plt.show()

## 8. Summary Statistics

In [None]:
# Full summary of the processed dataset.
# The feature count is derived from mic_cols rather than a hard-coded "- 3",
# so it stays correct if the list of MIC targets changes.
n_features = sum(name not in mic_cols for name in feature_cols)

print("Dataset Summary:")
print(f"  Total samples: {len(df)}")
print(f"  Features: {n_features} (excluding MIC targets)")
print(f"  Unique block sequences: {df['block_sequence_theoretical'].nunique()}")

print("\nSamples per block count:")
print(df['Number of blocks'].value_counts().sort_index())

print("\nEntropy feature summary:")
print(df[entropy_cols].describe())

In [None]:
# Save processed data
from pathlib import Path

output_path = Path('../data/processed/polymer_data_processed.csv')
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Processed data saved to data/processed/polymer_data_processed.csv")