In [1]:
import pandas as pd

# Configuration: source workbook, destination workbook, and the columns that
# survive the cleaning step.
file_path = 'nsclc_epitope_table.xlsx'  # Replace with your file path
output_file_path = 'cleaned_dataset.xlsx'  # Replace with your desired output file path
required_columns = [
    "IEDB IRI",
    "Object Type",
    "Name",
    "Starting Position",
    "Ending Position",
    "Source Molecule",
    "Source Molecule IRI",
    "Source Organism",
]

# Read the raw table from the named sheet.
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Row 0 of the loaded frame holds the real column labels: promote it to the
# header and treat every row below it as data.
header_labels = df.iloc[0]
df_cleaned = df.iloc[1:].copy()
df_cleaned.columns = header_labels

# Narrow the frame down to the columns of interest.
df_cleaned = df_cleaned[required_columns]

# Record missing organisms with an explicit placeholder instead of NaN.
df_cleaned["Source Organism"] = df_cleaned["Source Organism"].fillna("not defined")

# Write the cleaned table (without the index) and report where it went.
df_cleaned.to_excel(output_file_path, index=False)
print(f"Cleaned dataset saved to {output_file_path}")


Cleaned dataset saved to cleaned_dataset.xlsx


In [None]:
import pandas as pd
from itertools import combinations
from difflib import SequenceMatcher

# Load the cleaned dataset
file_path = 'cleaned_dataset.xlsx'  # Replace with your cleaned dataset path
df = pd.read_excel(file_path)

# Remove column E.
# "E" is a spreadsheet column letter, not a header: pandas labels columns by
# the header row, so checking only `'E' in df.columns` never matches a sheet
# whose headers are descriptive names and the column was silently kept.
# A column literally labelled 'E' is still dropped (the previous behaviour);
# otherwise the fifth column -- the one a spreadsheet shows as E -- is removed
# by position, which also stays correct if header names are duplicated.
excel_column_e = 4  # zero-based position of spreadsheet column E
if 'E' in df.columns:
    df = df.drop(columns=['E'])
elif df.shape[1] > excel_column_e:
    kept_positions = [pos for pos in range(df.shape[1]) if pos != excel_column_e]
    df = df.iloc[:, kept_positions]

# Function to calculate sequence similarity
def calculate_similarity(seq1, seq2):
    """Return the similarity ratio of two sequences as a float from 0 to 1.

    The ratio is ``2 * M / T`` where ``M`` is the number of matching elements
    found by ``difflib.SequenceMatcher`` and ``T`` is the combined length of
    both sequences; 1.0 means identical and 0.0 means nothing in common.

    Args:
        seq1: First sequence (e.g. a string).
        seq2: Second sequence, compared against ``seq1``.

    Returns:
        The ``SequenceMatcher`` ratio for the pair.
    """
    # autojunk is disabled on purpose: when the second sequence has 200 or
    # more elements, difflib's automatic heuristic treats frequently occurring
    # elements as junk, which can drive the ratio of two almost identical long
    # sequences towards zero. Shorter sequences are unaffected by this flag.
    return SequenceMatcher(None, seq1, seq2, autojunk=False).ratio()

# Find all pairs of sequences and their similarity
name_sequences = df['Name'].dropna().unique()  # Drop NaN and ensure unique sequences
similarity_threshold = 0.8  # Define a threshold for partial similarity
similar_sequences = []

# Compare every unordered pair once and keep those at or above the threshold.
for seq1, seq2 in combinations(name_sequences, 2):
    similarity = calculate_similarity(seq1, seq2)
    if similarity >= similarity_threshold:
        similar_sequences.append((seq1, seq2, similarity))

# Convert results to a DataFrame
similarity_df = pd.DataFrame(similar_sequences, columns=['Sequence 1', 'Sequence 2', 'Similarity'])

# Statistics
total_sequences = len(name_sequences)
# Number of unordered pairs, n * (n - 1) / 2, computed directly instead of
# materialising every combination a second time just to count them.
total_pairs = total_sequences * (total_sequences - 1) // 2
similar_pairs_count = len(similarity_df)
# With fewer than two unique names there are no pairs at all; report 0%
# rather than dividing by zero.
similarity_percentage = (similar_pairs_count / total_pairs) * 100 if total_pairs else 0.0

# Print statistics
print(f"Total Sequences: {total_sequences}")
print(f"Total Possible Pairs: {total_pairs}")
print(f"Similar Pairs (Similarity >= {similarity_threshold}): {similar_pairs_count}")
print(f"Percentage of Similar Pairs: {similarity_percentage:.2f}%")

# Display the DataFrame of similar sequences.
# ace_tools is used when it can be imported; if it is not installed, fall back
# to a plain-text preview so the cell still completes instead of raising.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Similar Amino Acid Sequences", dataframe=similarity_df)
else:
    preview_rows = 20  # keep the fallback output short; the counts above give the totals
    print(similarity_df.head(preview_rows).to_string(index=False))
