<a href="https://colab.research.google.com/github/SrutiGoteti/IIIT-AI-ML-labs-and-projects/blob/main/AIML_Project_Module_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

In [None]:
# Upload Module2_ProjectFiles.zip file here by running this cell

uploaded = files.upload()
loc_ = list(uploaded.keys())[0]
!unzip -q {loc_}

In [None]:
!pip3 -q install biopython

import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.autonotebook import tqdm

In [None]:
# Load the per-strain metadata table; later cells read its "StrainID", "State"
# and "Nuc_muts" columns.
data = pd.read_csv("INDIA_685.csv")
# Preview the last five rows as the cell output.
data.tail()

In [None]:
# Count the records in the FASTA file. The handle is opened in a `with` block so
# it is closed as soon as the parser has been exhausted (the original left it open).
with open("sequences.fasta") as fasta_handle:
    counter = sum(1 for _ in SeqIO.parse(fasta_handle, 'fasta'))

print("Number of sequences is:", counter)

In [None]:
def calc_score(examplesequence):
    """Encode a nucleotide string as a base-4 integer (first base = lowest digit).

    Digits: A/a -> 0, G/g -> 1, T/t -> 2, C/c -> 3. The base at position ``p``
    contributes ``digit * 4**p`` to the score.

    Returns:
        (flag, score): ``flag`` is 1 if any character is not one of the eight
        recognised letters, else 0. When ``flag`` is 1 the score is not a valid
        encoding: an unrecognised character reuses the digit of the most recent
        recognised base (0 if there was none).
    """
    digit_of = {
        "A": 0, "a": 0,
        "G": 1, "g": 1,
        "T": 2, "t": 2,
        "C": 3, "c": 3,
    }

    flag = 0
    score = 0
    value = 0

    for position, base in enumerate(examplesequence):
        digit = digit_of.get(base)
        if digit is None:
            # Unknown symbol: mark the sequence invalid; `value` keeps its previous digit.
            flag = 1
        else:
            value = digit
        score += (4 ** position) * value

    return (flag, score)


In [None]:
# Build a k-mer count embedding for every sequence: a vector of length 4**k whose
# entry at index calc_score(kmer) is the number of times that k-mer occurs.
k=7
big_matrix = []
sequences_name = []

# Open the FASTA file in a `with` block so the handle is closed once all records
# have been read (the original never closed it).
with open("sequences.fasta") as fasta_handle:
    for fasta in tqdm(SeqIO.parse(fasta_handle, 'fasta')):

        embedding = np.zeros(4**k, dtype=int)
        sequence = str(fasta.seq)
        sequence_name = fasta.id
        sequences_name.append(sequence_name)

        # Slide a window of width k over the sequence; a sequence shorter than k
        # yields an empty range and therefore an all-zero embedding.
        for j in range(len(sequence)-k+1):
            kmer = sequence[j:j+k]
            flag,score = calc_score(kmer)
            # flag == 1 means the k-mer contains a character other than A/G/T/C; skip it.
            if(flag!=1):
                embedding[score]+=1

        embedding = embedding.tolist()
        big_matrix.append(embedding)

In [None]:
# Sanity-check the k-mer embedding: one row per FASTA record, and one column per
# possible k-mer (each row was created with length 4**k).
print("Number of sequences:", len(sequences_name))
print("Number of features for each sequence:", len(big_matrix[0]))

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) on the k-mer count matrix; fit() returns
# the fitted estimator itself, so construction and fitting are chained.
pca = PCA().fit(big_matrix)
# Project every sequence onto the principal axes.
pca_data = pca.transform(big_matrix)

In [None]:
import matplotlib.pyplot as plt

# Percentage of variance explained by each principal component, rounded to 0.1.
variance_pct = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
# Column names for *all* components; a later cell uses these to label the PCA coordinates.
labels_all = [f"PC{rank}" for rank in range(1, variance_pct.size + 1)]

# Only the first ten components are plotted.
per_var = variance_pct[:10]
component_ranks = range(1, len(per_var) + 1)
labels = [str(rank) for rank in component_ranks]
cumulative_pct = np.cumsum(per_var)

with plt.style.context('dark_background'):
    plt.figure(figsize=(15, 9))
    # Bars: per-component variance; line and markers: cumulative variance.
    plt.bar(component_ranks, per_var, tick_label=labels, color="aqua")
    plt.plot(component_ranks, cumulative_pct, color="red")
    plt.scatter(component_ranks, cumulative_pct, color="yellow")
    plt.xlabel("Number of Principal Components")
    plt.ylabel("Percentage of variance explained")

In [None]:
# Assemble the plotting table (strain id, first three PCs, state) in one shot.
# The original grew the frame with pd.concat inside a loop, which is quadratic and
# leaves object-typed columns; building it from whole columns avoids both.
pca_df = pd.DataFrame(pca_data, index=[*sequences_name], columns = labels_all)

# StrainID -> State lookup. drop_duplicates keeps the first row per StrainID, which
# matches the original "first match" behaviour; an unknown strain raises KeyError.
state_by_strain = data.drop_duplicates(subset="StrainID").set_index("StrainID")["State"]
strain_ids = list(pca_df.index)

final_df = pd.DataFrame({
    "StrainID": strain_ids,
    "PC1": pca_df["PC1"].to_numpy(),
    "PC2": pca_df["PC2"].to_numpy(),
    "PC3": pca_df["PC3"].to_numpy(),
    "State": state_by_strain.loc[strain_ids].to_numpy(),
})

final_df.head()

In [None]:
import plotly.graph_objs as go
import plotly.express as px

In [None]:
# Scatter of the first two principal components, coloured by state.
for_x = final_df.PC1.tolist()
for_y = final_df.PC2.tolist()
for_states = final_df.State.tolist()
for_hover = final_df.StrainID.tolist()

# `for_hover` was computed but never passed in the original, so points could not be
# identified; it is now the hover title. `labels` replaces the generic "x"/"y"/"color"
# names Plotly assigns to bare array inputs.
fig = px.scatter(x=for_x, y=for_y, color=for_states, hover_name=for_hover,
                title="Principal Component Axis",
                labels={"x": "PC1", "y": "PC2", "color": "State"},
                color_discrete_map={
                "Stateless": "grey",
                "Tamil Nadu": "green",
                "Telangana": "red",
                "Maharashtra": "orange",
                "Gujarat": "magenta",
                "Delhi": "purple",
                "West Bengal": "aqua",
                "Haryana": "pink",
                "Odisha": "blue",
                "Madhya Pradesh": "brown",
                "Karnataka": "black"
                })

fig.show(renderer = "colab")

In [None]:
from sklearn.manifold import TSNE

import inspect

# Use these parameters
random_state = 0
n_components = 2
perplexity = 30
n_iter = 10000

####### Enter code below #######

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and 1.7 removed the
# old name, so pick whichever keyword the installed version accepts.
iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

# Use the `n_components` variable defined above (the original hard-coded 2).
model = TSNE(n_components=n_components, random_state=random_state, perplexity=perplexity,
             **{iter_kwarg: n_iter})
tsne_data = model.fit_transform(np.array(big_matrix))

################################

In [None]:
# Assemble the t-SNE plotting table in one shot. Rows of `tsne_data` are in the same
# order as `sequences_name`, because both follow the row order of `big_matrix`.
# The original rebuilt a PCA frame only to iterate its index and grew the result with
# pd.concat in a loop (quadratic, object-typed columns).
strain_ids = list(sequences_name)
tsne_coords = np.asarray(tsne_data)

# StrainID -> State lookup. drop_duplicates keeps the first row per StrainID, which
# matches the original "first match" behaviour; an unknown strain raises KeyError.
state_by_strain = data.drop_duplicates(subset="StrainID").set_index("StrainID")["State"]

# A length mismatch between ids and coordinates raises ValueError here instead of
# silently mis-pairing rows.
final_df = pd.DataFrame({
    "StrainID": strain_ids,
    "X": tsne_coords[:, 0],
    "Y": tsne_coords[:, 1],
    "State": state_by_strain.loc[strain_ids].to_numpy(),
})

final_df.tail()

In [None]:
# Scatter of the 2-D t-SNE embedding, coloured by state.
for_x = final_df.X.tolist()
for_y = final_df.Y.tolist()
for_states = final_df.State.tolist()
for_hover = final_df.StrainID.tolist()

# `for_hover` was computed but never passed in the original; it is now the hover title.
# `labels` replaces the generic "x"/"y"/"color" names Plotly gives bare array inputs.
fig = px.scatter(x=for_x, y=for_y, color=for_states, hover_name=for_hover,
                title="t-SNE", height=600, width=1000,
                labels={"x": "t-SNE 1", "y": "t-SNE 2", "color": "State"},
                color_discrete_map={
                "Stateless": "grey",
                "Tamil Nadu": "green",
                "Telangana": "red",
                "Maharashtra": "orange",
                "Gujarat": "magenta",
                "Delhi": "purple",
                "West Bengal": "aqua",
                "Haryana": "pink",
                "Odisha": "blue",
                "Madhya Pradesh": "brown",
                "Karnataka": "black"})

fig.show(renderer = 'colab')

Answers to these questions:

Does making the 3 observations become easier in this plot? Can you make any other new observations from this plot?

Make sure you tweak the parameters to find out if a certain different setting is allowing you to get a more interpretable plot.

Observation 1: Odisha (Blue) scattered from the main cluster

Yes, this is even more obvious now. Odisha (dark blue) forms a clear band separated on the far right side of the plot — strongly suggesting a different mutational profile compared to the main body of samples.

Observation 2: Secondary cluster of Telangana (Red), Delhi (Purple), and Tamil Nadu (Green)

This secondary cluster is much clearer in the upper portion of the plot:

You can see a concentrated cluster containing mostly red (Telangana), purple (Delhi), and some green (Tamil Nadu) — this supports the hypothesis that these states shared a common viral strain lineage.

Their separation from other states is better captured than in the PCA plot.

Observation 3: Dense Gujarat (Pink) cluster within main cluster

Yes, Gujarat (pink/magenta) points are still densely packed, especially in the lower half of the central and lower-right regions of the plot.

All three PCA-based observations are even easier to make with t-SNE, thanks to its better local clustering and spread.

Gujarat’s magenta dots are widely spread across the plot, with no strong cluster. This may suggest:

Multiple sources of viral introduction

High genomic diversity

Karnataka (black dots at bottom-left)
There is a small, tight cluster of black points near X ≈ -15, Y ≈ 0, isolated from the rest of the data. This suggests a distinct viral lineage in that region.

Telangana samples form two or more separate bands, suggesting the presence of distinct sub-lineages circulating in the state.

Yes, tweaking parameters like perplexity and random_state can improve cluster visibility:

Lower perplexity sharpens small clusters.

Higher perplexity smooths larger clusters.

In [None]:
## First we need to get the list of all unique mutations observed in our dataset.

# Each "Nuc_muts" cell is parsed as a bracketed, comma-separated list of quoted tokens:
# strip the outer brackets, split on commas, then strip the surrounding quote characters.
# A dict is used as an insertion-ordered set so membership tests are O(1) while the
# first-seen order of the original list-based version is preserved.
unique_mutations = {}
for raw_muts in data["Nuc_muts"]:

    for token in raw_muts[1:-1].split(","):
        mut = token.strip()[1:-1]
        # Tokens shorter than two characters (e.g. from an empty list) are ignored.
        if(len(mut)>=2):
            unique_mutations.setdefault(mut, None)

mutations = list(unique_mutations)

print(len(mutations))

In [None]:
# Build the binary sample x mutation matrix: df.loc[strain, j] == 1 iff the strain
# carries mutations[j]. The result has the same layout as before (StrainID index,
# integer column labels 0..len(mutations)-1), but each sample's mutation string is
# parsed once instead of once per mutation, and the frame is created in one go
# rather than grown with pd.concat inside a loop.
samplelist = data["StrainID"].tolist()

# Column position of every known mutation.
mutation_column = {mutation: col for col, mutation in enumerate(mutations)}

presence = np.zeros((len(samplelist), len(mutations)), dtype=int)

for row, raw_muts in enumerate(data["Nuc_muts"]):
    # Same parsing as the cell that collected `mutations`: drop the outer brackets,
    # split on commas, then strip the quote characters around each token.
    for token in raw_muts[1:-1].split(","):
        mut = token.strip()[1:-1]
        col = mutation_column.get(mut)
        if len(mut) >= 2 and col is not None:
            presence[row, col] = 1

# NOTE(review): assumes StrainID values are unique, as the original dict-based
# construction implicitly did.
df = pd.DataFrame(presence, index=samplelist)

df.tail()

In [None]:
# Fit a full PCA on the binary mutation matrix (fit() returns the fitted estimator),
# then project every sample onto the principal axes.
pca = PCA().fit(df)
pca_data = pca.transform(df)

In [None]:
# Percentage of variance explained by each principal component, rounded to 0.1.
variance_pct = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
# Column names for *all* components; the next cell uses these to label the PCA coordinates.
labels_all = [f"PC{rank}" for rank in range(1, variance_pct.size + 1)]

# Only the first ten components are plotted.
per_var = variance_pct[:10]
component_ranks = range(1, len(per_var) + 1)
labels = [str(rank) for rank in component_ranks]
cumulative_pct = np.cumsum(per_var)

with plt.style.context('dark_background'):
    plt.figure(figsize=(15, 9))
    # Bars: per-component variance; line and markers: cumulative variance.
    plt.bar(component_ranks, per_var, tick_label=labels, color="aqua")
    plt.plot(component_ranks, cumulative_pct, color="red")
    plt.scatter(component_ranks, cumulative_pct, color="yellow")
    plt.xlabel("Number of Principal Components")
    plt.ylabel("Percentage of variance explained")

In [None]:
# Assemble the plotting table (strain id, first two PCs, state) in one shot instead
# of growing it with pd.concat inside a loop (quadratic, object-typed columns).
pca_df = pd.DataFrame(pca_data, index=[*samplelist], columns = labels_all)

# StrainID -> State lookup. drop_duplicates keeps the first row per StrainID, which
# matches the original "first match" behaviour; an unknown strain raises KeyError.
state_by_strain = data.drop_duplicates(subset="StrainID").set_index("StrainID")["State"]
strain_ids = list(pca_df.index)

final_df = pd.DataFrame({
    "StrainID": strain_ids,
    "PC1": pca_df["PC1"].to_numpy(),
    "PC2": pca_df["PC2"].to_numpy(),
    "State": state_by_strain.loc[strain_ids].to_numpy(),
})

final_df.tail()

In [None]:
# Scatter of the first two principal components of the mutation matrix, coloured by state.
for_x = final_df.PC1.tolist()
for_y = final_df.PC2.tolist()
for_states = final_df.State.tolist()
for_hover = final_df.StrainID.tolist()

# `for_hover` was computed but never passed in the original; it is now the hover title.
# `labels` replaces the generic "x"/"y"/"color" names Plotly gives bare array inputs.
fig = px.scatter(x=for_x, y=for_y, color=for_states, hover_name=for_hover,
                 height=600, width=1200,
                 title="Principal Component Axis",
                 labels={"x": "PC1", "y": "PC2", "color": "State"},
                color_discrete_map={
                "Stateless": "grey",
                "Tamil Nadu": "green",
                "Telangana": "red",
                "Maharashtra": "orange",
                "Gujarat": "magenta",
                "Delhi": "purple",
                "West Bengal": "aqua",
                "Haryana": "pink",
                "Odisha": "blue",
                "Madhya Pradesh": "brown",
                "Karnataka": "black"})
fig.show(renderer = 'colab')

1. The magenta dots for Gujarat are grouped closely on the right. This means the virus in Gujarat had similar changes (mutations) in most cases.

2. Dots for Maharashtra (orange), Telangana (red), and Tamil Nadu (green) are close together in some places. This shows they may have some common mutations, but also some differences.

3. Some dots from Odisha, Delhi, and Karnataka are far away from the others. These could be rare or different virus types, or maybe came from people who traveled.


4. The plot helps us see how similar or different the virus is across states, and shows how it might have spread in the country.


In [None]:
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np
import pandas as pd

import inspect

# t-SNE hyper-parameters for the mutation-profile embedding.
random_state = 42
n_components = 2
perplexity = 30
n_iter = 1000

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and 1.7 removed the
# old name, so pick whichever keyword the installed version accepts.
iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

model = TSNE(n_components=n_components, random_state=random_state, perplexity=perplexity,
             init='pca', learning_rate='auto', **{iter_kwarg: n_iter})
tsne_data = model.fit_transform(df)

# Rows of `tsne_data` follow the row order of `df`, so df.index supplies the strain ids.
samplelist = df.index.tolist()

# StrainID -> State lookup. drop_duplicates keeps the first row per StrainID, which
# matches the original "first match" behaviour; an unknown strain raises KeyError.
state_by_strain = data.drop_duplicates(subset="StrainID").set_index("StrainID")["State"]

# Build the plotting table from whole columns instead of pd.concat in a loop.
final_df = pd.DataFrame({
    "StrainID": samplelist,
    "X": tsne_data[:, 0],
    "Y": tsne_data[:, 1],
    "State": state_by_strain.loc[samplelist].to_numpy(),
})

fig = px.scatter(
    final_df,
    x="X",
    y="Y",
    color="State",
    hover_name="StrainID",  # show the strain id on hover so individual points can be identified
    title="t-SNE on Mutation Profiles",
    height=600,
    width=1000,
    color_discrete_map={
        "Stateless": "grey",
        "Tamil Nadu": "green",
        "Telangana": "red",
        "Maharashtra": "orange",
        "Gujarat": "magenta",
        "Delhi": "purple",
        "West Bengal": "aqua",
        "Haryana": "pink",
        "Odisha": "blue",
        "Madhya Pradesh": "brown",
        "Karnataka": "black"
    }
)

fig.show(renderer="colab")


1. Gujarat (magenta) points form a tight group on the right side of the plot. This means most sequences from Gujarat have similar mutations.

2. Telangana (red), Maharashtra (orange), and Karnataka (black) are mixed in two nearby clusters, suggesting that some strains from these states are genetically similar.

3. Odisha (blue) and West Bengal (aqua) are placed together in a tight group, showing similar mutation profiles.

4. A few points from states like Delhi (purple), Tamil Nadu (green), and Madhya Pradesh (brown) are far away from the main clusters. These could be unique strains or outliers.

5. Overall, t-SNE was able to separate some states clearly and gave a better view of possible mutation clusters.


In [None]:
from sklearn.manifold import Isomap

# Isomap hyper-parameters for the mutation-profile embedding.
n_neighbors = 5
n_components = 2

model_iso = Isomap(n_neighbors=n_neighbors, n_components=n_components)
iso_data = model_iso.fit_transform(df)

# Rows of `iso_data` follow the row order of `df`, so df.index supplies the strain ids.
samplelist = df.index.tolist()

# StrainID -> State lookup. drop_duplicates keeps the first row per StrainID, which
# matches the original "first match" behaviour; an unknown strain raises KeyError.
state_by_strain = data.drop_duplicates(subset="StrainID").set_index("StrainID")["State"]

# Build the plotting table from whole columns instead of pd.concat in a loop.
final_df = pd.DataFrame({
    "StrainID": samplelist,
    "X": iso_data[:, 0],
    "Y": iso_data[:, 1],
    "State": state_by_strain.loc[samplelist].to_numpy(),
})

fig = px.scatter(
    final_df,
    x="X",
    y="Y",
    color="State",
    hover_name="StrainID",  # show the strain id on hover so individual points can be identified
    title="ISOMAP on Mutation Profiles",
    height=600,
    width=1000,
    # Same state -> colour mapping as the PCA and t-SNE figures, so the plots can be
    # compared directly (the original fell back to Plotly's default palette here).
    color_discrete_map={
        "Stateless": "grey",
        "Tamil Nadu": "green",
        "Telangana": "red",
        "Maharashtra": "orange",
        "Gujarat": "magenta",
        "Delhi": "purple",
        "West Bengal": "aqua",
        "Haryana": "pink",
        "Odisha": "blue",
        "Madhya Pradesh": "brown",
        "Karnataka": "black"
    }
)

fig.show(renderer="colab")
