# PREPROCESSING OF DNA DATASET

## Import Library

In [9]:
from glob import glob

#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing 

#classification
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense


### Combine multiple csv file

In [21]:
data_file = sorted(glob('D:\DNA project\Dataset/*csv'))
data_file

['D:\\DNA project\\Dataset\\BladderCancer.csv',
 'D:\\DNA project\\Dataset\\BoneCancer.csv',
 'D:\\DNA project\\Dataset\\BrainCancer.csv',
 'D:\\DNA project\\Dataset\\ColorectalCancer.csv',
 'D:\\DNA project\\Dataset\\Head_and_neckCancer.csv']

In [22]:
data = pd.concat(pd.read_csv(datafile)
                for datafile in data_file)

In [26]:
data

Unnamed: 0,sequence,len,cancer
0,TGCTAACAGTCTTGCAGGTCTCCCGAG,27,Bladder Cancer
1,AGAGTCGGTCGGAGGCTCTGGCTG,24,Bladder Cancer
2,CACCGTCAGTGCCGTGTTCCAGG,23,Bladder Cancer
3,AGAGCTAAAGTCCAAGAGAGGATCCGAGAACG,32,Bladder Cancer
4,ACGCGGCCAGTGCAAGGCAT,20,Bladder Cancer
...,...,...,...
351,AAGACATCACGATGGATCACAGGTCTATCACACCTATTAACCACTC...,616,Head_and_neck Cancer
352,GGAGCTCTCCATGCATCTGGTATTTTCGTCTGGGGGGTGTGCACGC...,546,Head_and_neck Cancer
353,TCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTG...,557,Head_and_neck Cancer
354,ATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTG...,564,Head_and_neck Cancer


In [27]:
data.to_csv('DNAdata.csv')

In [4]:
data = pd.read_csv('DNAdata.csv')

In [5]:
data

Unnamed: 0.1,Unnamed: 0,sequence,len,cancer
0,0,TGCTAACAGTCTTGCAGGTCTCCCGAG,27,Bladder Cancer
1,1,AGAGTCGGTCGGAGGCTCTGGCTG,24,Bladder Cancer
2,2,CACCGTCAGTGCCGTGTTCCAGG,23,Bladder Cancer
3,3,AGAGCTAAAGTCCAAGAGAGGATCCGAGAACG,32,Bladder Cancer
4,4,ACGCGGCCAGTGCAAGGCAT,20,Bladder Cancer
...,...,...,...,...
753,351,AAGACATCACGATGGATCACAGGTCTATCACACCTATTAACCACTC...,616,Head_and_neck Cancer
754,352,GGAGCTCTCCATGCATCTGGTATTTTCGTCTGGGGGGTGTGCACGC...,546,Head_and_neck Cancer
755,353,TCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTG...,557,Head_and_neck Cancer
756,354,ATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTG...,564,Head_and_neck Cancer


In [5]:
data.drop(data.columns[[0]], axis=1, inplace=True)

In [6]:
data

Unnamed: 0,len,cancer,seq
0,27,Bladder Cancer,"['tgctaa', 'gctaac', 'ctaaca', 'taacag', 'aaca..."
1,24,Bladder Cancer,"['agagtc', 'gagtcg', 'agtcgg', 'gtcggt', 'tcgg..."
2,23,Bladder Cancer,"['caccgt', 'accgtc', 'ccgtca', 'cgtcag', 'gtca..."
3,32,Bladder Cancer,"['agagct', 'gagcta', 'agctaa', 'gctaaa', 'ctaa..."
4,20,Bladder Cancer,"['acgcgg', 'cgcggc', 'gcggcc', 'cggcca', 'ggcc..."
...,...,...,...
753,616,Head_and_neck Cancer,"['aagaca', 'agacat', 'gacatc', 'acatca', 'catc..."
754,546,Head_and_neck Cancer,"['ggagct', 'gagctc', 'agctct', 'gctctc', 'ctct..."
755,557,Head_and_neck Cancer,"['tcacgg', 'cacggg', 'acggga', 'cgggag', 'ggga..."
756,564,Head_and_neck Cancer,"['attaac', 'ttaacc', 'taacca', 'aaccac', 'acca..."


## K-Mer

In [8]:
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

In [9]:
data['seq'] = data.apply(lambda x: getKmers(x['sequence']), axis=1)



In [10]:
data = data.drop('sequence', axis=1)

In [11]:
data.head()

Unnamed: 0,len,cancer,seq
0,27,Bladder Cancer,"[tgctaa, gctaac, ctaaca, taacag, aacagt, acagt..."
1,24,Bladder Cancer,"[agagtc, gagtcg, agtcgg, gtcggt, tcggtc, cggtc..."
2,23,Bladder Cancer,"[caccgt, accgtc, ccgtca, cgtcag, gtcagt, tcagt..."
3,32,Bladder Cancer,"[agagct, gagcta, agctaa, gctaaa, ctaaag, taaag..."
4,20,Bladder Cancer,"[acgcgg, cgcggc, gcggcc, cggcca, ggccag, gccag..."


In [16]:
human_texts = list(data['seq'])
for item in range(len(human_texts)):
    human_texts[item] = ' '.join(human_texts[item])
   

In [12]:
data.to_csv('DNAdata_Kmer.csv')

## Label Encoding

In [3]:
data = pd.read_csv('DNAdata_Kmer.csv')

In [None]:
data.drop(data.columns[[0]], axis=1, inplace=True)

In [7]:
data

Unnamed: 0,len,cancer,seq
0,27,Bladder Cancer,"['tgctaa', 'gctaac', 'ctaaca', 'taacag', 'aaca..."
1,24,Bladder Cancer,"['agagtc', 'gagtcg', 'agtcgg', 'gtcggt', 'tcgg..."
2,23,Bladder Cancer,"['caccgt', 'accgtc', 'ccgtca', 'cgtcag', 'gtca..."
3,32,Bladder Cancer,"['agagct', 'gagcta', 'agctaa', 'gctaaa', 'ctaa..."
4,20,Bladder Cancer,"['acgcgg', 'cgcggc', 'gcggcc', 'cggcca', 'ggcc..."
...,...,...,...
753,616,Head_and_neck Cancer,"['aagaca', 'agacat', 'gacatc', 'acatca', 'catc..."
754,546,Head_and_neck Cancer,"['ggagct', 'gagctc', 'agctct', 'gctctc', 'ctct..."
755,557,Head_and_neck Cancer,"['tcacgg', 'cacggg', 'acggga', 'cgggag', 'ggga..."
756,564,Head_and_neck Cancer,"['attaac', 'ttaacc', 'taacca', 'aaccac', 'acca..."


In [10]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'cancer'. 
data['cancer']= label_encoder.fit_transform(data['cancer']) 
  
data['cancer'].unique() 

array([0, 1, 2, 3, 4])

In [11]:
data

Unnamed: 0,len,cancer,seq
0,27,0,"['tgctaa', 'gctaac', 'ctaaca', 'taacag', 'aaca..."
1,24,0,"['agagtc', 'gagtcg', 'agtcgg', 'gtcggt', 'tcgg..."
2,23,0,"['caccgt', 'accgtc', 'ccgtca', 'cgtcag', 'gtca..."
3,32,0,"['agagct', 'gagcta', 'agctaa', 'gctaaa', 'ctaa..."
4,20,0,"['acgcgg', 'cgcggc', 'gcggcc', 'cggcca', 'ggcc..."
...,...,...,...
753,616,4,"['aagaca', 'agacat', 'gacatc', 'acatca', 'catc..."
754,546,4,"['ggagct', 'gagctc', 'agctct', 'gctctc', 'ctct..."
755,557,4,"['tcacgg', 'cacggg', 'acggga', 'cgggag', 'ggga..."
756,564,4,"['attaac', 'ttaacc', 'taacca', 'aaccac', 'acca..."
