<a href="https://colab.research.google.com/github/Xami-20/IBD_prediction/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Packages**

## **Required installation**

In [None]:
# Screed is a Python library for reading FASTA/FASTQ files.
# Files can be uncompressed, gzipped, or bz2-zipped.
!python -m pip install screed

## **Importing Packages**

In [None]:
import os
import screed # To read Sequence files
from itertools import product
import pandas as pd

# **Kmers**

## **Kmer extraction and encoding functions**

In [None]:
def get_K_mer(seq, k_size):
    """Return every overlapping window of length ``k_size`` taken from ``seq``.

    Windows are produced left to right, advancing one position at a time.
    When ``seq`` is shorter than ``k_size`` the result is an empty list.
    """
    last_start = len(seq) - k_size
    return [seq[start:start + k_size] for start in range(last_start + 1)]

def seq2binary(kmer):
    """Encode a nucleotide string as a string of bits, two bits per base.

    The mapping is case-insensitive: A -> "00", T -> "01", C -> "10", G -> "11".
    Any other character contributes nothing to the result, so the output can be
    shorter than ``2 * len(kmer)``.
    """
    two_bit_code = {"A": "00", "T": "01", "C": "10", "G": "11"}
    return "".join(two_bit_code.get(base.upper(), "") for base in kmer)

def binary2decimal(binary):
    """Convert a string of bits to its integer value.

    Parameters
    ----------
    binary : str
        Text made of "0" and "1" characters, e.g. the output of ``seq2binary``.

    Returns
    -------
    int
        The base-2 value of ``binary``, or ``-1`` when it cannot be parsed
        (for example an empty string or a non-string argument).
    """
    try:
        return int(binary, 2)
    # Catch only parsing failures; a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError):
        return -1

def seq_file2kmers(file, k_size):
    """Read a sequence file and return the encoded k-mers grouped by sample name.

    Parameters
    ----------
    file : str
        Path of a sequence file readable by ``screed.open``.
    k_size : int
        Length of the k-mers to extract from every record.

    Returns
    -------
    dict
        Maps a sample name to the list of decimal-encoded k-mers collected from
        all records that share that name. The sample name is the part of the
        record name before its first underscore when that underscore sits at
        index 2 or later; otherwise the full record name is used.

    Notes
    -----
    K-mers containing anything other than A/T/C/G (case-insensitive), such as
    the ambiguous base ``N``, are skipped. Encoding them would silently drop
    the unknown character and yield the value of a different, shorter k-mer
    (or ``-1`` when no valid base is left).
    """
    valid_bases = set("ATCG")
    Seq_kmers = {}
    for record in screed.open(file):
        name = record.name
        encoded = []
        for kmer in get_K_mer(record.sequence, k_size):
            # Skip empty or ambiguous k-mers instead of mis-encoding them.
            if not kmer or not valid_bases.issuperset(kmer.upper()):
                continue
            encoded.append(binary2decimal(seq2binary(kmer)))

        underscore = name.find("_")
        sample = name[:underscore] if underscore > 1 else name

        # Direct dict lookup; no need to materialise the key list per record.
        Seq_kmers.setdefault(sample, []).extend(encoded)
    return Seq_kmers


## **kmer Reference**

In [None]:
# Alphabet used to enumerate every possible k-mer.
Dbases = "ATCG"

# The reference files are written into this folder; create it on a fresh
# checkout so that open() below does not fail.
os.makedirs("./possible_kmers", exist_ok=True)

# For k = 4..9 write a CSV listing every possible k-mer next to its decimal code.
for kmer in range(4,10):
    path = "./possible_kmers/" + str(kmer) + "-mer_reference.csv"
    with open(path,"w") as file:
        file.write("{}-mers,Decimal Value".format(str(kmer)))
        for i in product(Dbases, repeat = kmer):
            mer = ''.join(i)
            decimal = str(binary2decimal(seq2binary(mer)))
            file.write("\n{},{}".format(str(mer),decimal))
    # Progress indicator: the k value that was just written.
    print('{}'.format(kmer))

## **Generating kmers vector from Reference**

In [None]:
def gen_kmer(kmer_val):
    """Write the per-sample k-mer count table for k-mers of length ``kmer_val``.

    Reads ``possible_kmers/<k>-mer_reference.csv`` (columns ``<k>-mers`` and
    ``Decimal Value``), processes every file in ``PRJEB13679_fa/`` in sorted
    order with ``seq_file2kmers`` and writes
    ``kmers/<k>-mers_decimal_PRJEB13679.csv``. The output has a ``sample``
    column followed by one column per reference k-mer; each row holds the
    number of occurrences of every reference k-mer in that sample.

    All folders are resolved relative to the project folder, i.e. the current
    working directory, or its parent when the current directory is already
    ``PRJEB13679_fa``. The working directory is the project folder when the
    function returns, also if an error occurs.

    Parameters
    ----------
    kmer_val : int
        K-mer length; selects the reference file and names the output file.

    Notes
    -----
    * Counts start from zero for every sample; they are not accumulated across
      samples.
    * One row is written for every sample name returned by ``seq_file2kmers``
      for a file, so an empty file writes no row.
    * Encoded values missing from the reference (such as the ``-1`` marker
      returned by ``binary2decimal``) are ignored.
    """
    data_dir_name = "PRJEB13679_fa"
    cwd = os.getcwd()
    # Tolerate being started from inside the data folder, on any platform.
    if os.path.basename(cwd) == data_dir_name:
        base_dir = os.path.dirname(cwd)
    else:
        base_dir = cwd

    data_dir = os.path.join(base_dir, data_dir_name)
    out_dir = os.path.join(base_dir, "kmers")
    out_path = os.path.join(out_dir, str(kmer_val) + "-mers_decimal_PRJEB13679.csv")
    ref_path = os.path.join(base_dir, "possible_kmers", str(kmer_val) + "-mer_reference.csv")

    ref = pd.read_csv(ref_path)
    ref_kmers = ref["{}-mers".format(kmer_val)].tolist()
    ref_values = ref['Decimal Value'].tolist()
    del ref

    os.makedirs(out_dir, exist_ok=True)
    try:
        # The context manager closes the output file even if a sample fails.
        with open(out_path, 'w') as output:
            header = "sample" + "".join(",{}".format(k) for k in ref_kmers)
            output.write(header + "\n")
            del ref_kmers

            for file_name in sorted(os.listdir(data_dir)):
                sample_kmers = seq_file2kmers(os.path.join(data_dir, file_name), kmer_val)
                for name, kmers in sample_kmers.items():
                    print(name)
                    # Fresh zeroed counters per sample, in reference order so
                    # the values line up with the header columns.
                    counts = dict.fromkeys(ref_values, 0)
                    for mer in kmers:
                        if mer in counts:
                            counts[mer] += 1
                    row = str(name) + "".join(",{}".format(c) for c in counts.values())
                    output.write(row + "\n")
                del sample_kmers
    finally:
        # Always finish in the project folder.
        os.chdir(base_dir)

In [None]:
# Build the k-mer count tables for k = 7, 8 and 9 (the range end is exclusive).
for i in range(7,10):
    gen_kmer(i)

1939.121455
1939.121456
1939.121457
1939.121461
1939.121463
1939.121464
1939.121465
1939.121466
1939.121467
1939.121468


# **Phenotypic features**

## **Generating feature files**

In [None]:
# Load the raw sample attributes and keep only the phenotype columns of interest.
# Attributes selected in an earlier, wider version but unused here:
# total_spots, total_size, run_total_spots, run_total_bases, elevation.
df = pd.read_csv('./pheno_features/1359_PRJEB13679_attributes.csv')
df = (
    df[[
        "library_name", "biopsy_location", "body_site", "sex", "race", "age",
        "diagnosis", "gastrointest_disord", "disease_extent", "disease_stat",
        "diseasesubtype", "inflammationstatus", "latitude", "longitude",
    ]]
    # The original "diagnosis" column is discarded; "gastrointest_disord"
    # takes over that name, and "library_name" becomes the sample identifier.
    .drop(columns=["diagnosis"])
    .rename(columns={'gastrointest_disord': 'diagnosis', "library_name": "sample"})
)
df.to_csv('./pheno_features/1359_features.csv', index=False)

# Labels file: sample identifier, diagnosis and sex only.
labels = df[["sample", "diagnosis", "sex"]]
labels.to_csv('./pheno_features/1359_labels.csv', index=False)

# **Adding Labels to Kmer files**

In [None]:
# Attach the diagnosis label to every k-mer count table found in ./kmers and
# save the result under ./kmers/labeled with the same file name.
labels = pd.read_csv('./pheno_features/1359_labels.csv')
labels = labels.sort_values("sample")

# Create the destination folder on a fresh checkout so that to_csv() below
# does not fail.
os.makedirs("./kmers/labeled", exist_ok=True)

for i in sorted(os.listdir("./kmers")):
    print(i)
    # Only the count tables are processed; other entries are just listed.
    if i.endswith("decimal_PRJEB13679.csv"):
        kmers = "./kmers/" + i
        df = pd.read_csv(kmers)
        # Rows are matched on the shared "sample" column.
        out = pd.merge(labels, df, on = "sample")
        del df
        # "sex" comes from the labels file and is not kept in the output.
        out = out.drop(columns=["sex"])
        save = "./kmers/labeled/" + i
        out.to_csv(save,index=False)