In [None]:
import pandas as pd
import numpy as np
from google.colab import files

In [None]:
uploaded = files.upload()
file_name = next(iter(uploaded))  # Gets the first uploaded filename
print(f"Uploaded: {file_name}")

# Load the dataset
df = pd.read_csv(file_name)
print(f"Shape: {df.shape}")
df.head()

Saving pseudomonas_aeruginosa.csv to pseudomonas_aeruginosa.csv
Uploaded: pseudomonas_aeruginosa.csv
Shape: (1000, 8)


Unnamed: 0,ID,Name,Sequence,Molecular_Weight,Isoelectric_Point,Protein_Length,Amino_Acid_Composition,Hydrophobicity
0,WP_369686368.1,ATP-binding cassette domain-containing protein...,MLELNFTQTLGSHTLTLNETLPASGITAIFGVSGAGKTSLINAISG...,5756.543,8.517644,56,"{'M': 1, 'L': 8, 'E': 2, 'N': 3, 'F': 2, 'T': ...",0.339286
1,WP_369686367.1,"aldehyde dehydrogenase family protein, partial...",MQSRDNGKPLAEARGLVMSAAGTARYFAAACELLDGELPTPRQPDR...,6617.5065,6.106918,62,"{'M': 2, 'Q': 2, 'S': 3, 'R': 6, 'D': 3, 'N': ...",-0.146774
2,WP_369686366.1,"hypothetical protein, partial [Pseudomonas aer...",GGEYLEIIEAARDIRVELDAHNYISNILTKLGIDRPSGLTRVMDLA...,9303.2892,4.533444,81,"{'G': 4, 'E': 8, 'Y': 3, 'L': 9, 'I': 6, 'A': ...",-0.406173
3,WP_369686365.1,"hypothetical protein, partial [Pseudomonas aer...",NAVVNQKRVPLAPNGDMLAPGQEKTLSFSGDITRIADIAYTTINDF...,6304.0708,9.989715,58,"{'N': 5, 'A': 5, 'V': 4, 'Q': 3, 'K': 5, 'R': ...",-0.591379
4,WP_369686364.1,homocysteine S-methyltransferase family protei...,MAGYLPQWLDAGAKLIGGCCRTTPQDIAALTVQR,3619.1997,7.810425,34,"{'M': 1, 'A': 5, 'G': 4, 'Y': 1, 'L': 4, 'P': ...",0.141176


In [None]:
# Basic info
print("=== Data Types ===")
print(df.dtypes)

print("\n=== Missing Values ===")
print(df.isnull().sum())

# Quick stats
df.describe(include='all')

=== Data Types ===
ID                         object
Name                       object
Sequence                   object
Molecular_Weight          float64
Isoelectric_Point         float64
Protein_Length              int64
Amino_Acid_Composition     object
Hydrophobicity            float64
dtype: object

=== Missing Values ===
ID                        0
Name                      0
Sequence                  0
Molecular_Weight          0
Isoelectric_Point         0
Protein_Length            0
Amino_Acid_Composition    0
Hydrophobicity            0
dtype: int64


Unnamed: 0,ID,Name,Sequence,Molecular_Weight,Isoelectric_Point,Protein_Length,Amino_Acid_Composition,Hydrophobicity
count,1000,1000,1000,1000.0,1000.0,1000.0,1000,1000.0
unique,1000,106,1000,,,,996,
top,WP_171885212.1,conjugal transfer nickase/helicase domain-cont...,GLAQHAWEWPRINAGYFAPTYAQIRDIFYPTMEEVAFDWGLRTKIN...,,,,"{'E': 8, 'V': 10, 'L': 13, 'I': 9, 'P': 7, 'T'...",
freq,1,234,1,,,,2,
mean,,,,40594.466956,7.138719,364.777,,-0.350712
std,,,,48668.398237,1.82928,444.1324,,0.243274
min,,,,2461.0237,4.050028,22.0,,-1.241379
25%,,,,14632.10605,5.466229,129.75,,-0.493217
50%,,,,27240.1569,6.708843,246.0,,-0.367021
75%,,,,48589.447675,8.746007,431.0,,-0.224514


In [None]:
fill_rules = {
    'Molecular_Weight': df['Molecular_Weight'].median(),
    'Isoelectric_Point': df['Isoelectric_Point'].median(),
    'Protein_Length': df['Protein_Length'].median(),
    'Hydrophobicity': df['Hydrophobicity'].mean(),
    'Amino_Acid_Composition': str({'A':0}),  # Default empty composition
    'Sequence': 'X',  # Unknown residue placeholder
    'Name': 'Uncharacterized protein'
}

df = df.fillna(fill_rules)

In [None]:
def cap_protein_outliers(series, iqr_multiplier=3):
    """Robust outlier capping for biological features"""
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = max(q1 - iqr_multiplier*iqr, 0)  # Physical limits
    upper = q3 + iqr_multiplier*iqr
    return series.clip(lower, upper)

# Apply to numeric columns
numeric_cols = ['Molecular_Weight', 'Isoelectric_Point', 'Protein_Length', 'Hydrophobicity']
for col in numeric_cols:
    df[col] = cap_protein_outliers(df[col])
    print(f"{col} capped between {df[col].min():.2f}-{df[col].max():.2f}")

Molecular_Weight capped between 2461.02-150461.47
Isoelectric_Point capped between 4.05-12.00
Protein_Length capped between 22.00-1334.75
Hydrophobicity capped between 0.00-0.58


In [None]:
valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
df['Sequence'] = df['Sequence'].str.upper().apply(
    lambda s: ''.join(c if c in valid_aas else 'X' for c in s)
)

# Add sequence length validation
df['Seq_Length'] = df['Sequence'].str.len()
discrepancy = df[df['Protein_Length'] != df['Seq_Length']]
print(f"{len(discrepancy)} sequences have length mismatches")

47 sequences have length mismatches


In [None]:
# Molecular weight sanity check
df['MW_Calc'] = df['Sequence'].apply(
    lambda s: sum(monoisotopic_weights[aa] for aa in s) - (len(s)-1)*18.01528)
mw_discrepancy = df[abs(df['Molecular_Weight'] - df['MW_Calc']) > 100]
print(f"{len(mw_discrepancy)} entries with MW discrepancies > 100Da")

# Isoelectric point range validation
df = df[(df['Isoelectric_Point'] >= 2.5) & (df['Isoelectric_Point'] <= 12)]

NameError: name 'monoisotopic_weights' is not defined

In [None]:
# Define a dictionary with monoisotopic weights of amino acids and water
monoisotopic_weights = {
    'A': 71.03711, 'R': 156.10111, 'N': 114.04293, 'D': 115.02694, 'C': 103.00919,
    'E': 129.04259, 'Q': 128.05858, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406,
    'L': 113.08406, 'K': 128.09496, 'M': 131.04049, 'F': 147.06841, 'P': 97.05276,
    'S': 87.03203, 'T': 101.04768, 'W': 186.07931, 'Y': 163.06333, 'V': 99.06841,
    'X': 0 # Placeholder for unknown residues
}

# Molecular weight sanity check
df['MW_Calc'] = df['Sequence'].apply(
    lambda s: sum(monoisotopic_weights[aa] for aa in s) - (len(s)-1)*18.01528)
mw_discrepancy = df[abs(df['Molecular_Weight'] - df['MW_Calc']) > 100]
print(f"{len(mw_discrepancy)} entries with MW discrepancies > 100Da")

# Isoelectric point range validation
df = df[(df['Isoelectric_Point'] >= 2.5) & (df['Isoelectric_Point'] <= 12)]

1000 entries with MW discrepancies > 100Da


In [None]:
# Save as CSV (for GitHub)
output_file = 'cleaned_data.csv'
df.to_csv(output_file, index=False)

# Download to local machine
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>