In [1]:
import pandas as pd
import random
import pickle
from tqdm import tqdm

from google.colab import drive

# Mount Google Drive at /content/drive; the input and output paths used in this
# cell all live under that mount point.
drive.mount('/content/drive')

# === ✅ Configuration ===
# Input CSV of known-positive pairs (your input .csv).
positive_csv_path = "/content/drive/MyDrive/RNA-peptide/rna_protein_positive_pairs_full.csv"

# Derive both output paths from the input path with its trailing ".csv" removed.
# Only a genuine suffix is stripped: str.replace() would also rewrite a ".csv"
# occurring earlier in the path, and would return the path unchanged when the
# suffix is missing -- which would make the negatives CSV overwrite the input.
_CSV_SUFFIX = ".csv"
if positive_csv_path.endswith(_CSV_SUFFIX):
    _output_stem = positive_csv_path[:-len(_CSV_SUFFIX)]
else:
    _output_stem = positive_csv_path
output_csv_path = f"{_output_stem}_neg.csv"
output_pkl_path = f"{_output_stem}_neg.pkl"

# === ✅ Load positive data ===
positive_df = pd.read_csv(positive_csv_path)

# Set of known positive (protein_seq, rna_seq) combinations, used to reject
# candidate negatives. Zipping the two columns yields the same tuples as walking
# the frame with iterrows(), without building a Series for every row.
positive_pairs = set(zip(positive_df['protein_seq'], positive_df['rna_seq']))

# Unique proteins and RNAs, each kept together with its chain and pdb_id so a
# sampled sequence can be traced back to where it came from.
all_proteins = list(
    positive_df[['protein_seq', 'protein_chain', 'pdb_id']]
    .drop_duplicates()
    .to_records(index=False)
)
all_rnas = list(
    positive_df[['rna_seq', 'rna_chain', 'pdb_id']]
    .drop_duplicates()
    .to_records(index=False)
)

# Seed the module-level RNG before its first use so the shuffles below -- and
# any later draws from `random` in the same run -- are reproducible.
random.seed(42)

# Shuffle for random selection
random.shuffle(all_proteins)
random.shuffle(all_rnas)

# === ✅ Generate negative pairs ===
# Rejection sampling: draw one protein and one RNA independently at random and
# keep the combination unless it is a known positive or was already emitted.
# NOTE(review): only the (protein_seq, rna_seq) key is checked; prot_pdb and
# rna_pdb are never compared, so a protein and an RNA from the same pdb_id can
# be emitted as a "negative" -- confirm that this is intended.
negative_pairs = []
used_negative_keys = set()    # (protein_seq, rna_seq) keys already emitted as negatives
attempts = 0
needed = len(positive_df)     # one negative per positive row
skipped = 0                   # rejected draws: known positives and repeated negatives
max_attempts = 20 * needed    # cap so the loop ends even if too few distinct pairs exist

# The context manager closes the progress bar even if sampling raises.
with tqdm(total=needed, desc="🔁 Generating negative pairs") as pbar:
    while len(negative_pairs) < needed and attempts < max_attempts:
        attempts += 1
        prot_seq, prot_chain, prot_pdb = random.choice(all_proteins)
        rna_seq, rna_chain, rna_pdb = random.choice(all_rnas)

        pair_key = (prot_seq, rna_seq)
        if pair_key in positive_pairs or pair_key in used_negative_keys:
            skipped += 1
            continue

        # Track emitted negatives separately so `positive_pairs` keeps holding
        # only genuine positives.
        used_negative_keys.add(pair_key)
        negative_pairs.append({
            "pdb_id": f"{prot_pdb}_{rna_pdb}_neg",
            "protein_chain": prot_chain,
            "protein_seq": prot_seq,
            "rna_chain": rna_chain,
            "rna_seq": rna_seq
        })
        pbar.update(1)

# Make a shortfall visible instead of silently producing an unbalanced dataset.
if len(negative_pairs) < needed:
    print(
        f"⚠️ Only {len(negative_pairs)} of {needed} negative pairs were generated "
        f"before hitting the cap of {max_attempts} attempts."
    )

# === ✅ Convert to DataFrame and save ===
# Pin the column schema (same keys, same order as each negative-pair dict) so
# that an empty `negative_pairs` still produces a CSV with a header row;
# without it the file has no columns for pd.read_csv to parse later.
neg_columns = ["pdb_id", "protein_chain", "protein_seq", "rna_chain", "rna_seq"]
neg_df = pd.DataFrame(negative_pairs, columns=neg_columns)

# Save CSV (no index column, so the file reloads with just the five fields)
neg_df.to_csv(output_csv_path, index=False)

# Save Pickle: the raw list of dicts, not the DataFrame
with open(output_pkl_path, 'wb') as pkl_file:
    pickle.dump(negative_pairs, pkl_file)

# === ✅ Summary ===
# `skipped` counts every rejected draw: combinations that are known positives
# as well as combinations already emitted as a negative, hence the label.
print("\n📊 Summary")
print(f"Positive samples input      : {needed}")
print(f"Negative samples generated  : {len(neg_df)}")
print(f"Total sampling attempts     : {attempts}")
print(f"Pairs skipped (pos. or dup.): {skipped}")
print(f"Saved to:\n  - CSV: {output_csv_path}\n  - PKL: {output_pkl_path}")


Mounted at /content/drive


🔁 Generating negative pairs: 100%|██████████| 2241/2241 [00:00<00:00, 81978.73it/s]



📊 Summary
Positive samples input      : 2241
Negative samples generated  : 2241
Total sampling attempts     : 2289
Pairs skipped (was positive): 48
Saved to:
  - CSV: /content/drive/MyDrive/RNA-peptide/rna_protein_positive_pairs_full_neg.csv
  - PKL: /content/drive/MyDrive/RNA-peptide/rna_protein_positive_pairs_full_neg.pkl


In [2]:
# Load if not already
import pandas as pd

# Locations of the positive pairs and of the generated negative pairs.
pos_path = "/content/drive/MyDrive/RNA-peptide/rna_protein_positive_pairs_full.csv"
neg_path = "/content/drive/MyDrive/RNA-peptide/rna_protein_positive_pairs_full_neg.csv"

# Read each table and tag it with its class label (1 = positive, 0 = negative).
df_pos = pd.read_csv(pos_path).assign(label=1)
df_neg = pd.read_csv(neg_path).assign(label=0)

# Stack positives on top of negatives under a fresh 0..n-1 index, then write
# the merged table out without the index column.
combined_csv_path = "/content/drive/MyDrive/RNA-peptide/rna_protein_positive_negative_pairs_full_combined.csv"
combined_df = pd.concat([df_pos, df_neg], ignore_index=True)
combined_df.to_csv(combined_csv_path, index=False)
print("✅ Combined file saved to rna_protein_positive_negative_pairs_full_combined.csv")


✅ Combined file saved to rna_protein_positive_negative_pairs_full_combined.csv
