In [1]:
!pip install pandas-profiling  # Optional for EDA
import pandas as pd
import numpy as np
from google.colab import files

Collecting pandas-profiling
  Downloading pandas_profiling-3.2.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting joblib~=1.1.0 (from pandas-profiling)
  Downloading joblib-1.1.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting markupsafe~=2.1.1 (from pandas-profiling)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting visions==0.7.4 (from visions[type_image_path]==0.7.4->pandas-profiling)
  Downloading visions-0.7.4-py3-none-any.whl.metadata (5.9 kB)
Collecting htmlmin>=0.1.12 (from pandas-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik>=0.11.1 (from pandas-profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting tangled-up-in-unicode==0.2.0 (from pandas-profiling)
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl.metadata (4.8 kB)
Collecting multimethod>=1.4 (from p

In [2]:
# Upload from your local machine (opens the Colab file picker).
uploaded = files.upload()

# The result is indexed by filename; if nothing came back (for example the
# dialog was dismissed), next(iter(...)) would raise a bare StopIteration.
# Fail with a readable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a CSV file.")

file_name = next(iter(uploaded))  # Gets the first uploaded filename
print(f"Uploaded: {file_name}")

# Load the dataset
# NOTE(review): read_csv treats the literal text "NA" as a missing value by
# default. If a chain (or a two-residue sequence) can legitimately be "NA",
# it will be reported as null below -- confirm against the raw file and
# consider keep_default_na=False with an explicit na_values list.
df = pd.read_csv(file_name)
print(f"Shape: {df.shape}")
df.head()

Saving pdb_data_seq.csv to pdb_data_seq.csv
Uploaded: pdb_data_seq.csv
Shape: (467304, 5)


Unnamed: 0,structureId,chainId,sequence,residueCount,macromoleculeType
0,100D,A,CCGGCGCCGG,20,DNA/RNA Hybrid
1,100D,B,CCGGCGCCGG,20,DNA/RNA Hybrid
2,101D,A,CGCGAATTCGCG,24,DNA
3,101D,B,CGCGAATTCGCG,24,DNA
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154,Protein


In [3]:
# Column dtypes as inferred by pandas on load.
dtype_report = df.dtypes
print("=== Data Types ===")
print(dtype_report)

# Number of null entries in each column.
null_counts = df.isnull().sum()
print("\n=== Missing Values ===")
print(null_counts)

# Quick stats (include='all' summarises the non-numeric columns as well)
df.describe(include='all')

=== Data Types ===
structureId          object
chainId              object
sequence             object
residueCount          int64
macromoleculeType    object
dtype: object

=== Missing Values ===
structureId              0
chainId                 10
sequence                28
residueCount             0
macromoleculeType    34817
dtype: int64


Unnamed: 0,structureId,chainId,sequence,residueCount,macromoleculeType
count,467304,467294,467276,467304.0,432487
unique,140250,2836,104812,,13
top,3J3Q,A,PIVQNLQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEG...,,Protein
freq,1356,135187,5112,,345180
mean,,,,6173.42662,
std,,,,23645.559738,
min,,,,0.0,
25%,,,,456.0,
50%,,,,1131.0,
75%,,,,4342.0,


In [5]:
# 1. Handle Missing Values

# Filler text written into each text column wherever the value is missing,
# so that later string operations never encounter NaN.
text_placeholders = {
    'chainId': 'Unknown_Chain',
    'sequence': 'SEQ_NOT_AVAILABLE',
}
for column_name, filler in text_placeholders.items():
    df[column_name] = df[column_name].fillna(filler)


def infer_molecule_type(row):
    """Return the macromolecule type for one row, filling it in when missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys 'macromoleculeType' and 'sequence'.

    Returns
    -------
    The existing 'macromoleculeType' when it is not null. Otherwise 'Protein'
    if the row carries a real sequence, or 'Unknown' if it does not.
    """
    existing_type = row['macromoleculeType']
    if not pd.isna(existing_type):
        return existing_type

    sequence = row['sequence']
    # 'SEQ_NOT_AVAILABLE' is the filler this notebook writes into missing
    # sequences. It is a non-empty string, so it must be excluded explicitly;
    # otherwise every sequence-less row is labelled 'Protein' and the
    # 'Unknown' branch is never reached.
    has_real_sequence = (
        isinstance(sequence, str)
        and sequence.strip() != ''
        and sequence != 'SEQ_NOT_AVAILABLE'
    )
    if has_real_sequence:
        return 'Protein'  # Default assumption for sequences
    return 'Unknown'

# Fill in the macromolecule type row by row; rows that already have a type
# get that same value back from the helper.
df['macromoleculeType'] = df.apply(infer_molecule_type, axis='columns')

# 2. Handle zero values and outliers in residueCount
# A residueCount of 0 is turned into NaN so it is treated as missing.
df['residueCount'] = df['residueCount'].replace({0: np.nan})

# Cap outliers using IQR method: anything more than 1.5 * IQR outside the
# quartiles is pulled back to the corresponding bound.
Q1, Q3 = df['residueCount'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

df['residueCount'] = df['residueCount'].clip(lower_bound, upper_bound)

# 3. Validate Structure IDs
# -------------------------

# Flag every row whose (structureId, chainId) pair occurs more than once.
# keep=False marks all members of a repeated pair, not only the later ones.
key_columns = ['structureId', 'chainId']
dup_check = df.duplicated(subset=key_columns, keep=False)
duplicate_rows = dup_check.sum()
if duplicate_rows > 0:
    print(f"Found {duplicate_rows} duplicate structure-chain entries")
    # Option to deduplicate:
    # df = df.drop_duplicates(subset=['structureId', 'chainId'])

# 4. Final Verification
# ---------------------

# Each report is printed under its heading, in this order: remaining nulls,
# column dtypes, then summary statistics of residueCount.
print("\n=== After Cleaning ===")
verification_reports = (
    ("Missing Values:", df.isnull().sum()),
    ("\nData Types:", df.dtypes),
    ("\nResidue Count Statistics:", df['residueCount'].describe()),
)
for heading, report in verification_reports:
    print(heading)
    print(report)


=== After Cleaning ===
Missing Values:
structureId           0
chainId               0
sequence              0
residueCount         25
macromoleculeType     0
dtype: int64

Data Types:
structureId           object
chainId               object
sequence              object
residueCount         float64
macromoleculeType     object
dtype: object

Residue Count Statistics:
count    467279.000000
mean       2931.891615
std        3467.028029
min           2.000000
25%         456.000000
50%        1131.000000
75%        4342.000000
max       10171.000000
Name: residueCount, dtype: float64


In [6]:
# Replace the remaining missing residue counts with the column median
# (median() skips NaN by default, so it is computed from the known values).
residue_median = df['residueCount'].median()
df['residueCount'] = df['residueCount'].fillna(residue_median)

In [7]:
# Remove rows that are exact copies of an earlier row, reporting the frame
# shape on either side so the effect is visible.
shape_before = df.shape
print(f"Before: {shape_before}")
df = df.drop_duplicates()
shape_after = df.shape
print(f"After: {shape_after}")


Before: (467304, 5)
After: (467304, 5)


In [10]:
# Add sequence length column
# Rows whose sequence is missing, or holds the 'SEQ_NOT_AVAILABLE' filler
# written during cleaning, have no real sequence. Give them length 0 rather
# than the length of the filler text (17), which would otherwise be used
# below to cap their residueCount.
has_sequence = df['sequence'].notna() & (df['sequence'] != 'SEQ_NOT_AVAILABLE')
df['seqLength'] = (
    df['sequence'].str.len()
    .where(has_sequence, 0)
    .fillna(0)          # non-string entries yield NaN from .str.len()
    .astype('int64')    # lengths are whole numbers
)

# Validate residueCount against actual sequence length
# NOTE(review): in the sample rows shown after loading, residueCount looks
# like a per-structure total (100D has two 10-residue chains and
# residueCount 20). If so, the row-wise minimum below replaces that total
# with the chain length -- confirm this is the intended meaning.
df['residueCount'] = np.where(
    df['seqLength'] > 0,
    df[['residueCount', 'seqLength']].min(axis=1),
    df['residueCount']
)

In [11]:
# Breakdown of molecule types (dropna=False keeps an entry for nulls, if any)
type_distribution = df['macromoleculeType'].value_counts(dropna=False)
print("\nMacromolecule Type Distribution:")
print(type_distribution)

# Option to fill remaining unknowns
missing_type_label = 'Unknown'
df['macromoleculeType'] = df['macromoleculeType'].fillna(missing_type_label)


Macromolecule Type Distribution:
macromoleculeType
Protein                       379997
Protein#RNA                    56226
Protein#DNA                    21303
DNA                             3784
Protein#DNA#RNA                 2712
RNA                             2389
Protein#RNA#DNA/RNA Hybrid       304
Protein#DNA#DNA/RNA Hybrid       159
DNA/RNA Hybrid                   141
DNA#RNA                          121
RNA#DNA/RNA Hybrid                74
Protein#DNA/RNA Hybrid            68
DNA#DNA/RNA Hybrid                26
Name: count, dtype: int64


In [14]:
# Save as CSV (index=False leaves the row index out of the file)
output_file = 'cleaned_data.csv'
df.to_csv(path_or_buf=output_file, index=False)

# Download to local machine
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>