# Setup

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Install nextflow
! pip install nextflow
! pip install scikit-learn

Project Structure

```sh
penguin_analysis/
├── main.nf
├── nextflow.config
├── data/
│   └── penguins_size.csv
├── bin/
│   ├── data_cleaning.py
│   ├── species_analysis.py
│   └── visualization.py
└── results/
```

Set up directories

In [2]:
%%bash
# Create the input, script and output directories (no error if they exist).
mkdir -p data bin results

Write the `data_cleaning.py` file

In [3]:
%%writefile bin/data_cleaning.py
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import argparse

def parse_args():
    """Parse the command line: a positional input CSV path and output CSV path."""
    cli = argparse.ArgumentParser(description='Clean Palmer Penguins dataset')
    positionals = (
        ('input_file', 'Input CSV file path'),
        ('output_file', 'Output CSV file path'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()

def load_data(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def clean_column_names(df):
    """Return ``df`` with column names lower-cased and spaces turned into underscores."""
    def _to_snake(label):
        return label.lower().replace(' ', '_')

    return df.rename(columns=_to_snake)

def remove_missing_values(df):
    """Return only the rows of ``df`` that have no missing value in any column."""
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows

def normalize_numeric_features(df):
    """Rescale the four measurement columns with ``StandardScaler``.

    The scaler is fitted on, and applied to, the same rows. The input frame is
    left untouched: the caller typically passes a filtered slice, and assigning
    into a slice can raise ``SettingWithCopyWarning`` or silently modify data
    the caller still holds.

    Args:
        df: DataFrame containing ``bill_length_mm``, ``bill_depth_mm``,
            ``flipper_length_mm`` and ``body_mass_g``.

    Returns:
        A new DataFrame with those four columns replaced by their scaled values;
        all other columns are carried over unchanged.
    """
    numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    normalized = df.copy()
    scaler = StandardScaler()
    normalized[numeric_cols] = scaler.fit_transform(normalized[numeric_cols])
    return normalized

def validate_data(df):
    """Drop rows with out-of-range measurements or unrecognised labels.

    Steps, applied in order:
      1. For each measurement column, keep rows between that column's 1st and
         99th percentile (inclusive). The percentiles are recomputed on the
         rows that survived the previous column's filter.
      2. Keep only the three known species names.
      3. If a ``sex`` column is present, drop rows whose value is neither
         male nor female (compared case-insensitively), such as the ``'.'``
         placeholder found in the raw data. Missing values are kept here;
         dropping them is the job of ``remove_missing_values``.

    Args:
        df: DataFrame with the four measurement columns and ``species``.

    Returns:
        A new, independent DataFrame containing only the rows that passed.
    """
    # Check for valid ranges
    numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    for col in numeric_cols:
        df = df[df[col].between(df[col].quantile(0.01), df[col].quantile(0.99))]

    # Validate species names
    valid_species = ['Adelie', 'Gentoo', 'Chinstrap']
    df = df[df['species'].isin(valid_species)]

    # Validate sex labels; placeholders such as '.' are not real categories
    if 'sex' in df.columns:
        sex_labels = df['sex'].astype(str).str.strip().str.lower()
        df = df[df['sex'].isna() | sex_labels.isin(['male', 'female'])]

    # Return a copy so later column assignments do not write into a slice
    return df.copy()

def add_derived_features(df):
    """Add ``bill_ratio`` and ``size_category`` columns.

    ``bill_ratio`` is ``bill_length_mm / bill_depth_mm``. ``size_category``
    splits ``body_mass_g`` into three quantile-based bins labelled ``small``,
    ``medium`` and ``large``.

    The input frame is not modified; the new columns are added to a copy.

    Args:
        df: DataFrame with ``bill_length_mm``, ``bill_depth_mm`` and
            ``body_mass_g`` columns.

    Returns:
        A new DataFrame with the two derived columns appended.
    """
    enriched = df.copy()

    # Add bill ratio feature
    enriched['bill_ratio'] = enriched['bill_length_mm'] / enriched['bill_depth_mm']

    # Add size category
    enriched['size_category'] = pd.qcut(
        enriched['body_mass_g'], q=3, labels=['small', 'medium', 'large']
    )

    return enriched

def main():
    """Run the cleaning pipeline from the command line.

    Reads the input CSV, normalises column names, drops incomplete rows,
    filters invalid rows, adds the derived features, standardises the
    measurement columns and writes the result to the output CSV.
    """
    args = parse_args()

    # Load and process data
    df = load_data(args.input_file)
    df = clean_column_names(df)
    df = remove_missing_values(df)
    df = validate_data(df)
    # Derive features BEFORE standardising: bill_ratio must be computed from
    # the physical measurements. A ratio of z-scores is meaningless and blows
    # up whenever the standardised depth is close to zero.
    df = add_derived_features(df)
    df = normalize_numeric_features(df)

    # Save cleaned data
    df.to_csv(args.output_file, index=False)
    print(f"Cleaned data saved to {args.output_file}")
    print(f"Shape of cleaned dataset: {df.shape}")

if __name__ == "__main__":
    main()

Writing bin/data_cleaning.py


Write `species_analysis.py`

In [4]:
%%writefile bin/species_analysis.py
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import argparse

def parse_args():
    """Parse the command line: input CSV, species name and output directory."""
    cli = argparse.ArgumentParser(description='Analyze penguin species')
    help_by_argument = {
        'input_file': 'Input CSV file path',
        'species': 'Species to analyze',
        'output_dir': 'Output directory for results',
    }
    # dicts keep insertion order, so the positional order is preserved
    for argument, help_text in help_by_argument.items():
        cli.add_argument(argument, help=help_text)
    return cli.parse_args()

def load_and_filter_data(file_path, species):
    """Read the CSV at ``file_path`` and keep the rows whose ``species`` column equals ``species``."""
    penguins = pd.read_csv(file_path)
    is_requested = penguins['species'] == species
    return penguins[is_requested]

def calculate_basic_stats(df):
    """Return mean, std, min and max (rows) for the four measurement columns."""
    measurements = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
    return measurements.agg(['mean', 'std', 'min', 'max'])

def analyze_sexual_dimorphism(df):
    """Compare males and females on each measurement column.

    For every measurement, this runs an independent two-sample t-test
    (``scipy.stats.ttest_ind``) and computes an effect size: the difference of
    the group means divided by the square root of the mean of the two group
    variances.

    Sex labels are matched case-insensitively and with surrounding whitespace
    removed, because the raw dataset stores them as ``'MALE'`` / ``'FEMALE'``.
    An exact match against ``'male'`` / ``'female'`` selects no rows and turns
    every statistic into NaN. Rows with any other label (missing, ``'.'``)
    belong to neither group.

    Args:
        df: DataFrame with a ``sex`` column and the four measurement columns.

    Returns:
        DataFrame with one column per measurement and the rows
        ``t_statistic``, ``p_value`` and ``effect_size``.
    """
    numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

    # Normalise the labels once; astype(str) maps missing values to 'nan',
    # which matches neither group.
    sex_labels = df['sex'].astype(str).str.strip().str.lower()
    males = df[sex_labels == 'male']
    females = df[sex_labels == 'female']

    dimorphism_stats = {}
    for col in numeric_cols:
        male_data = males[col]
        female_data = females[col]

        t_stat, p_value = stats.ttest_ind(male_data, female_data)
        pooled_sd = np.sqrt((male_data.var() + female_data.var()) / 2)
        effect_size = (male_data.mean() - female_data.mean()) / pooled_sd

        dimorphism_stats[col] = {
            't_statistic': t_stat,
            'p_value': p_value,
            'effect_size': effect_size
        }

    return pd.DataFrame(dimorphism_stats)

def create_morphological_plots(df, output_dir, species):
    """Write two PNG files for one species into ``output_dir``.

    ``<species>_distributions.png`` holds a 2x2 grid of boxplots (one panel per
    measurement, grouped by ``sex``); ``<species>_correlations.png`` holds an
    annotated heatmap of the correlations between the four measurements.
    """
    numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

    # Distribution plots: one boxplot panel per measurement
    box_fig, panels = plt.subplots(2, 2, figsize=(12, 10))
    box_fig.suptitle(f'Morphological Distributions - {species}')
    for panel, measurement in zip(panels.flat, numeric_cols):
        sns.boxplot(data=df, x='sex', y=measurement, ax=panel)
        panel.set_title(measurement)

    plt.tight_layout()
    distributions_path = f'{output_dir}/{species}_distributions.png'
    plt.savefig(distributions_path)
    plt.close()

    # Correlation matrix
    correlations = df[numeric_cols].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title(f'Feature Correlations - {species}')
    correlations_path = f'{output_dir}/{species}_correlations.png'
    plt.savefig(correlations_path)
    plt.close()

def main():
    """Run the per-species analysis from the command line.

    Loads the input CSV, keeps the rows of the requested species, computes the
    basic and dimorphism statistics, renders the plots, and writes everything
    into the output directory.
    """
    import os

    args = parse_args()

    # Make sure the destination exists before anything is written to it;
    # savefig / to_csv do not create missing directories.
    os.makedirs(args.output_dir, exist_ok=True)

    # Load and process data
    df = load_and_filter_data(args.input_file, args.species)

    # Calculate statistics
    basic_stats = calculate_basic_stats(df)
    dimorphism_stats = analyze_sexual_dimorphism(df)

    # Generate plots
    create_morphological_plots(df, args.output_dir, args.species)

    # Save results
    basic_stats.to_csv(f'{args.output_dir}/{args.species}_basic_stats.csv')
    dimorphism_stats.to_csv(f'{args.output_dir}/{args.species}_dimorphism_stats.csv')

    print(f"Analysis completed for {args.species}")
    print(f"Results saved in {args.output_dir}")

if __name__ == "__main__":
    main()

Writing bin/species_analysis.py


Write `visualization.py`

In [5]:
%%writefile bin/visualization.py
#!/usr/bin/env python3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import argparse

def parse_args():
    """Parse the command line: input CSV path and output directory."""
    cli = argparse.ArgumentParser(description='Create penguin visualizations')
    cli.add_argument(
        'input_file',
        help='Input CSV file path',
    )
    cli.add_argument(
        'output_dir',
        help='Output directory for visualizations',
    )
    parsed = cli.parse_args()
    return parsed

def create_species_comparison(df, output_dir):
    """Save a 2x2 grid of boxplots: one measurement per panel, by species, split by sex.

    The figure is written to ``<output_dir>/species_comparison_boxplots.png``.
    """
    plt.figure(figsize=(15, 10))
    panels = ('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g')
    for position, column in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        sns.boxplot(data=df, x='species', y=column, hue='sex')
        plt.xticks(rotation=45)
        plt.title(f'{column} by Species and Sex')

    plt.tight_layout()
    target = f'{output_dir}/species_comparison_boxplots.png'
    plt.savefig(target, dpi=300)
    plt.close()

def create_interactive_scatter(df, output_dir):
    """Write an interactive Plotly scatter of bill length vs. bill depth as HTML.

    Points are coloured by species, use a marker symbol per sex, are sized by
    body mass, and show flipper length on hover. The page is written to
    ``<output_dir>/interactive_scatter.html``.

    NOTE(review): marker sizes come straight from ``body_mass_g``; if the input
    holds standardized (possibly negative) values this presumably fails —
    confirm which CSV this script is meant to receive.
    """
    scatter_options = dict(
        x='bill_length_mm',
        y='bill_depth_mm',
        color='species',
        symbol='sex',
        size='body_mass_g',
        hover_data=['flipper_length_mm'],
        title='Bill Measurements by Species',
    )
    figure = px.scatter(df, **scatter_options)

    html_path = f'{output_dir}/interactive_scatter.html'
    figure.write_html(html_path)

def create_correlation_heatmap(df, output_dir):
    """Save an annotated heatmap of the correlations between the four measurements.

    The image is written to ``<output_dir>/correlation_heatmap.png``.
    """
    measurement_frame = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
    correlations = measurement_frame.corr()

    plt.figure(figsize=(10, 8))
    # center=0 anchors the middle of the colour map at zero correlation
    sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Penguin Measurements')
    heatmap_path = f'{output_dir}/correlation_heatmap.png'
    plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
    plt.close()

def create_density_plots(df, output_dir):
    """Save a 2x2 grid of kernel density plots, one curve per species in each panel.

    The figure is written to ``<output_dir>/density_plots.png``.
    """
    plt.figure(figsize=(15, 10))
    panels = ('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g')

    for position, column in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        # One density curve per species, in order of first appearance
        for species_name in df['species'].unique():
            species_rows = df[df['species'] == species_name]
            sns.kdeplot(data=species_rows, x=column, label=species_name)
        plt.title(f'{column} Distribution')
        plt.legend()

    plt.tight_layout()
    density_path = f'{output_dir}/density_plots.png'
    plt.savefig(density_path, dpi=300)
    plt.close()

def create_pair_plot(df, output_dir):
    """Save a seaborn pair plot coloured by species, with KDE plots on the diagonal.

    The figure is written to ``<output_dir>/pair_plot.png``.
    """
    pair_plot_options = {'hue': 'species', 'diag_kind': 'kde'}
    sns.pairplot(df, **pair_plot_options)
    pair_plot_path = f'{output_dir}/pair_plot.png'
    plt.savefig(pair_plot_path, dpi=300)
    plt.close()

def create_island_distribution(df, output_dir):
    """Save a stacked bar chart of species counts per island.

    The counts come from a cross-tabulation of ``island`` against ``species``.
    The chart is written to ``<output_dir>/island_distribution.png``.

    The axes are created explicitly and handed to ``DataFrame.plot``. Without
    an ``ax`` argument pandas opens a figure of its own, so a figure created
    beforehand with ``plt.figure(figsize=...)`` stays empty, its size is
    ignored, and it is never closed.

    Args:
        df: DataFrame with ``island`` and ``species`` columns.
        output_dir: Existing directory that receives the PNG.
    """
    species_by_island = pd.crosstab(df['island'], df['species'])

    fig, ax = plt.subplots(figsize=(10, 6))
    species_by_island.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title('Species Distribution by Island')
    ax.set_xlabel('Island')
    ax.set_ylabel('Count')
    ax.legend(title='Species')
    fig.tight_layout()
    fig.savefig(f'{output_dir}/island_distribution.png', dpi=300)
    plt.close(fig)

def main():
    """Render every visualization for the input CSV into the output directory."""
    args = parse_args()

    # The destination may not exist yet; create it (and any parents)
    destination = Path(args.output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    penguins = pd.read_csv(args.input_file)

    # Each renderer takes (DataFrame, output_dir) and writes its own file
    renderers = (
        create_species_comparison,
        create_interactive_scatter,
        create_correlation_heatmap,
        create_density_plots,
        create_pair_plot,
        create_island_distribution,
    )
    for render in renderers:
        render(penguins, args.output_dir)

    print(f"Visualizations saved to {args.output_dir}")

if __name__ == "__main__":
    main()

Writing bin/visualization.py


# EDA (without Nextflow)

In [11]:
# Load the raw penguin measurements and preview the first rows.
input_file = "data/penguins_size.csv"
df = pd.read_csv(input_file)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [13]:
# Check the values of the categorical features.
# Categorical columns are those with dtype 'object' or 'category'.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# Print the distinct values of each categorical column; this exposes
# unexpected labels and missing values.
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"Unique values in '{col}': {unique_values}")

Unique values in 'species': ['Adelie' 'Chinstrap' 'Gentoo']
Unique values in 'island': ['Torgersen' 'Biscoe' 'Dream']
Unique values in 'sex': ['MALE' 'FEMALE' nan '.']


# Write channel with one item

In [53]:
%%writefile bin/penguins.nf
#!/usr/bin/env nextflow

// Default inputs, resolved relative to the run rather than to one user's home
// directory: the data file is looked up under the directory the pipeline is
// launched from, the helper scripts next to this workflow file.
// Each value can be overridden on the command line
// (--data, --cleaning_script, --analysis_script).
params.data = "${launchDir}/data/penguins_size.csv"
params.cleaning_script = "${projectDir}/data_cleaning.py"
params.analysis_script = "${projectDir}/species_analysis.py"

// Clean the raw CSV with the cleaning script and publish the result into the
// launch directory's data/ folder.
// The cleaning script declares two positional arguments, <input_file> and
// <output_file>, so both are passed positionally and the output name matches
// the declared output below.
process clean_data {
    publishDir "${launchDir}/data/"

    input:
        path cleaning_script
        path raw_input

    output:
        path 'penguins_cleaned.csv'

    script:
    """
    python ${cleaning_script} ${raw_input} penguins_cleaned.csv
    """
}

// Run the per-species analysis and publish its four result files into the
// launch directory's results/ folder.
// The analysis script declares three positional arguments: <input_file>,
// <species> and <output_dir>. The output directory is the task's working
// directory ('.') so the files land where the output declarations expect them.
process species_analysis {
    publishDir "${launchDir}/results/"

    input:
        val species
        path analysis_script
        path cleaned_data

    output:
        path "${species}_basic_stats.csv"
        path "${species}_correlations.png"
        path "${species}_dimorphism_stats.csv"
        path "${species}_distributions.png"

    script:
        """
        python ${analysis_script} ${cleaned_data} ${species} .
        """
}


workflow {
    // One channel item per species; species_analysis is invoked with this
    // channel, so it runs once per item.
    // Channel.of replaces the deprecated Channel.from factory.
    species_channel = Channel.of('Adelie', 'Gentoo', 'Chinstrap')

    // Resolve the parameters to file objects up front so that a wrong path
    // fails at start-up instead of inside a task.
    raw_data = file(params.data, checkIfExists: true)
    cleaning_script = file(params.cleaning_script, checkIfExists: true)
    analysis_script = file(params.analysis_script, checkIfExists: true)

    // clean the data
    cleaned_data = clean_data(cleaning_script, raw_data)

    // run the analysis
    species_analysis(species_channel, analysis_script, cleaned_data)
}

Overwriting bin/penguins.nf


In [55]:
! nextflow run bin/penguins.nf

[33mNextflow 24.10.3 is available - Please consider updating your version to it[m

[1m[38;5;232m[48;5;43m N E X T F L O W [0;2m  ~  [mversion 24.04.2[m
[K
Launching[35m `bin/penguins.nf` [0;2m[[0;1;36mdeadly_davinci[0;2m] DSL2 - [36mrevision: [0;36mc80f39d07f[m
[K
[2m[[0;34m-        [0;2m] [0;2m[mclean_data       -[K
[2m[[0;34m-        [0;2m] [0;2m[mspecies_analysis -[K
[3A
[2mexecutor >  local (1)[m[K
[2m[[0;34m87/4acf72[0;2m] [0;2m[mclean_data      [2m |[m 0 of 1[K
[2m[[0;34m-        [0;2m] [0;2m[mspecies_analysis -[K
[4A
[2mexecutor >  local (1)[m[K
[2m[[0;34m87/4acf72[0;2m] [0;2m[mclean_data      [2m |[m 0 of 1[K
[2m[[0;34m-        [0;2m] [0;2m[mspecies_analysis -[K
[4A
[2mexecutor >  local (4)[m[K
[2m[[0;34m87/4acf72[0;2m] [0;2m[mclean_data          [2m |[m 1 of 1[32m ✔[m[K
[2m[[0;34mda/e898b6[0;2m] [0;2m[mspecies_analysis[33;2m ([0;33m3[2m)[m[2m |[m 0 of 3[K
[4A
[2mexecutor >  local (4)[