<a href="https://colab.research.google.com/github/Xami-20/IBD_prediction/blob/main/kmer-based_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Packages**

## **Installing Packages**

In [None]:
!python -m pip install numpy
!python -m pip install pandas
!python -m pip install sklearn
!python -m pip install matplotlib
!python -m pip install seaborn
!python -m pip install tensorflow

## **Importing Packages**

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # Python statistical data visualization library based on matplotlib
import tensorflow as tf

from sklearn.utils import resample # BootStrapping
from sklearn.model_selection import train_test_split # Splitting the dataset into 70% Training set and 30% Testing set
from sklearn.preprocessing import MinMaxScaler # Feature Scaling

from sklearn.ensemble import RandomForestClassifier as RF # Importing random forest classifier from sklearn
from sklearn.svm import SVC # Using SVC method of svm class to use linear Kernel Support Vector Machine Algorithm

from sklearn.metrics import classification_report # Creating Classification Report
from sklearn.metrics import confusion_matrix # Creating Confusion Matrix 
from sklearn.metrics import plot_confusion_matrix # plotting Confusion matrix

# **Data**

## **Getting Data Details**

In [None]:
def data_details(file): # file name = "[4-25]-mers_decimal_PRJEB13679.csv"
    """Print the shape and the column dtypes of one labeled k-mer table.

    `file` is a bare file name that is looked up inside "./kmers/labeled/".
    The "sample" column is dropped before anything is reported, so the printed
    shape does not count it. The function only prints; it returns nothing.
    """
    csv_path = "./kmers/labeled/" + file
    table = pd.read_csv(csv_path).drop(columns=["sample"])

    # (rows, columns) of the table without the sample-name column
    print(table.shape)

    # dtype of every remaining column, to spot the ones that would still have
    # to be transformed / encoded to a number
    print(table.dtypes)

## **Pre-Processing**

In [None]:
def data_preparation(file,n=None):
    """Load one labeled k-mer table and return scaled train/test arrays.

    Parameters
    ----------
    file : str
        Path of the CSV to read. It must contain a "sample" column (dropped)
        and a "diagnosis" column (used as the label). Every other column is
        treated as a feature.
    n : int or None, optional
        When given, the table is bootstrapped (sampled with replacement) to
        exactly `n` rows before splitting; `n` is also used as the seed of
        that resampling. When None, the rows are used as they are.

    Returns
    -------
    list
        [X_train, X_test, Y_train, Y_test] from a 70% / 30% split made with
        random_state=0. The features are min-max scaled by a scaler that is
        fitted on the training part only and then applied to the test part.
    """
    # Function-local imports keep this cell runnable on its own.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    df = pd.read_csv(file).drop(columns=["sample"])

    # Every diagnosis class is kept. Counts noted by the original author:
    #   CD 731, IC 73, UC 219, no 336
    # Restricting the table to CD vs. "no" was considered but is not applied.

    if n is not None:
        from sklearn.utils import resample
        # NOTE(review): bootstrapping before the split allows copies of the
        # same sample to land in both the training and the testing part, which
        # can inflate the test score. Left unchanged so that results stay
        # comparable with earlier runs - confirm whether this is intended.
        df = resample(df, random_state=n, n_samples=n, replace=True)

    # Select the label by name instead of by position: the result is the same
    # when "diagnosis" is the first remaining column, and a different column
    # order can no longer leak the label into the feature matrix.
    Y = df["diagnosis"].values
    X = df.drop(columns=["diagnosis"]).values

    # 70% training / 30% testing
    X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(
        X, Y, test_size = 0.3, random_state = 0)

    # Bring all features to the same range; the scaler only ever sees the
    # training part when it is fitted.
    scaler = MinMaxScaler()
    X_train_data = scaler.fit_transform(X_train_data)
    X_test_data = scaler.transform(X_test_data)
    return [X_train_data, X_test_data, Y_train_data, Y_test_data]

In [None]:
import os

# Quick look at the class balance of the first labeled table
# (first file name in alphabetical order).
files = os.listdir("./kmers/labeled/")
files.sort()
file = "./kmers/labeled/" + files[0]
df = pd.read_csv(file)

# Last expression of the cell, so the notebook displays it: the table
# aggregated with 'size' over the 'diagnosis' column.
df.pivot_table(columns=['diagnosis'], aggfunc='size')

# **Models**

## **RF**

In [None]:
def RandomForest_classifier(X_train_data, X_test_data, Y_train_data, Y_test_data,trees = 100):
    """Fit a random forest on the training split and evaluate it on both splits.

    Parameters
    ----------
    X_train_data, X_test_data
        Feature matrices of the training and the testing part.
    Y_train_data, Y_test_data
        Matching labels.
    trees : int, optional
        Number of estimators in the forest (default 100).

    Returns
    -------
    dict
        'Model'                 -> 'RF_<trees>-Trees'
        'Train Score'           -> classifier.score() on the training part, times 100
        'Test Score'            -> classifier.score() on the testing part, times 100
        'Classification Report' -> classification_report() for the test predictions
        'Confusion Matrix'      -> confusion_matrix() for the test predictions
        The report and the matrix are both computed for the fixed label order
        ['CD', 'IC', 'UC', 'no'].
    """
    # Only what is actually used is imported. The previous, unused import of
    # sklearn.metrics.plot_confusion_matrix made the whole function fail with
    # an ImportError on scikit-learn >= 1.2, where that helper no longer exists.
    from sklearn.ensemble import RandomForestClassifier as RF
    from sklearn.metrics import classification_report, confusion_matrix

    # Entropy-based splits, fixed seed so repeated runs give the same forest
    RF_classifier = RF(n_estimators = trees, criterion = "entropy", random_state = 0)
    RF_classifier.fit(X_train_data, Y_train_data)

    # Predictions for the held-out part feed the report and the matrix below
    prediction = RF_classifier.predict(X_test_data)

    trainscore = RF_classifier.score(X_train_data, Y_train_data)*100
    testscore = RF_classifier.score(X_test_data, Y_test_data)*100

    # Fixed label order keeps report rows and matrix axes identical across runs
    target_names = ['CD','IC','UC','no']
    report = classification_report(Y_test_data, prediction,labels=target_names)
    cm = confusion_matrix(Y_test_data, prediction,labels=target_names)

    model = 'RF_{}-Trees'.format(trees)
    return {'Model': model,'Train Score':trainscore, 'Test Score': testscore,
            'Classification Report':report,'Confusion Matrix':cm}


## **SVM**

In [None]:
def _fit_and_score_svc(kernel, model_name, X_train_data, X_test_data, Y_train_data, Y_test_data):
    """Fit an SVC with the given kernel and summarise it on both splits.

    Shared implementation behind svm_linear_model, svm_poly_model and
    svm_rbf_model, which differ only in the kernel and in the reported name.

    Returns a dict with the keys 'Model' (`model_name`), 'Train Score' and
    'Test Score' (classifier.score() times 100), 'Classification Report' and
    'Confusion Matrix' (both computed on the test predictions for the fixed
    label order ['CD', 'IC', 'UC', 'no']).
    """
    # Only what is actually used is imported. The previous, unused import of
    # sklearn.metrics.plot_confusion_matrix made every SVM function fail with
    # an ImportError on scikit-learn >= 1.2, where that helper no longer exists.
    from sklearn.svm import SVC
    from sklearn.metrics import classification_report, confusion_matrix

    classifier = SVC(kernel = kernel, random_state = 0)
    classifier.fit(X_train_data, Y_train_data)

    # Predictions for the held-out part feed the report and the matrix below
    prediction = classifier.predict(X_test_data)

    trainscore = classifier.score(X_train_data, Y_train_data)*100
    testscore = classifier.score(X_test_data, Y_test_data)*100

    # Fixed label order keeps report rows and matrix axes identical across runs
    target_names = ['CD','IC','UC','no']
    report = classification_report(Y_test_data, prediction,labels=target_names)
    cm = confusion_matrix(Y_test_data, prediction,labels=target_names)

    return {'Model': model_name,'Train Score':trainscore, 'Test Score': testscore,
            'Classification Report':report,'Confusion Matrix':cm}


def svm_linear_model(X_train_data, X_test_data, Y_train_data, Y_test_data):
    """Train and evaluate an SVC with a linear kernel; reported as 'SVM (Linear)'."""
    return _fit_and_score_svc('linear', 'SVM (Linear)',
                              X_train_data, X_test_data, Y_train_data, Y_test_data)


def svm_poly_model(X_train_data, X_test_data, Y_train_data, Y_test_data):
    """Train and evaluate an SVC with a polynomial kernel; reported as 'SVM (Poly)'."""
    return _fit_and_score_svc('poly', 'SVM (Poly)',
                              X_train_data, X_test_data, Y_train_data, Y_test_data)


def svm_rbf_model(X_train_data, X_test_data, Y_train_data, Y_test_data):
    """Train and evaluate an SVC with an RBF kernel; reported as 'SVM (RBF)'."""
    return _fit_and_score_svc('rbf', 'SVM (RBF)',
                              X_train_data, X_test_data, Y_train_data, Y_test_data)


## **Model Run**

In [None]:
# Train and score every model on each labeled k-mer table, once per bootstrap
# size, and write one results CSV per (table, size) pair into ./Results/.
data_files = sorted(os.listdir("./kmers/labeled/"))

# Bootstrap sizes to try; None means "use the table as is, without resampling".
# Defined once, outside the loop, because it does not depend on the file.
n_samples = [100, 200, 500, 1000, 2000, 5000, 10000, None]

# DataFrame.to_csv does not create missing directories, so make sure the target
# exists before any time is spent on training.
os.makedirs('./Results/', exist_ok=True)

for data_file in data_files:
    file = "./kmers/labeled/" + data_file
    print(data_file)

    # File name up to (not including) '_decimal'. partition() returns the whole
    # name when the marker is missing, whereas slicing with find() == -1 would
    # silently cut off the last character.
    table_name = data_file.partition('_decimal')[0]

    for n in n_samples:
        # data == [X_train, X_test, Y_train, Y_test]
        data = data_preparation(file,n)

        svm_lin = svm_linear_model(*data)
        poly = svm_poly_model(*data)
        rbf = svm_rbf_model(*data)

        rf50 = RandomForest_classifier(*data,trees=50)
        rf100 = RandomForest_classifier(*data,trees=100)
        rf150 = RandomForest_classifier(*data,trees=150)

        if n is not None:
            df_name = './Results/' + table_name + '_' + str(n) + '-Resampling_Models\'Results'  + '.csv'
        else:
            df_name = './Results/' + table_name + '_All_Samples_Models\'Results'  + '.csv'

        # One row per model, in a fixed order
        df = pd.DataFrame([svm_lin,poly,rbf,rf50,rf100,rf150])
        df.to_csv(df_name,index=False)
    print("\nNext,\n")