In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the input files
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import glob
import os
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.utils import resample

In [None]:
# Function
def DataDescription(data):
    """Print the number and percentage of samples in each class.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per sample and a 'Class' column holding the label.

    Returns
    -------
    None
        The summary table (columns 'Count' and 'Percentage', indexed by
        'Class') is printed, not returned.
    """
    # Count rows per class directly. Counting the non-null entries of the
    # last feature column instead would under-report a class whenever that
    # column has missing values, and would fail with an IndexError when
    # 'Class' is the only column.
    stat = data.groupby('Class').size().to_frame('Count')
    stat['Percentage'] = round(stat['Count'] / stat['Count'].sum() * 100, 3)
    print(stat)

def Normalization(data): # Normalized data
    """Scale the feature columns with a RobustScaler and attach the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a 'Class' column holding the label.

    Returns
    -------
    pandas.DataFrame
        The scaled features (same column names and index as `data`) followed
        by a final 'Target' column copied from ``data['Class']``.

    Notes
    -----
    The scaler is fitted on every row that is passed in.
    NOTE(review): callers that split into train/test *after* calling this
    function let the test rows influence the scaling statistics.
    """
    # Select the features by name, not by position: the label is read by name
    # below, so relying on 'Class' being the last column would silently scale
    # the label (and drop a real feature) if the column order ever changed.
    features = data.drop(columns='Class')
    RS = preprocessing.RobustScaler()
    scaled_df = pd.DataFrame(RS.fit_transform(features), columns=features.columns, index=data.index)
    # Assign positionally: rows of scaled_df are in the same order as `data`,
    # and this avoids index alignment on frames with duplicated index labels
    # (e.g. after resampling with replacement).
    scaled_df['Target'] = data['Class'].to_numpy()
    return scaled_df

def ModelFitPrediction (X_train, X_test, y_train, y_test, model ): # For evaluating model performance
    """Fit `model` on the training split and print its test-set performance.

    Prints, in order: the test-set size, the accuracy, the per-class
    classification report and a per-sample table with the columns
    'Patient', 'Class', 'Predicted' and 'Predicted_Prob'.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables for the training and test samples.
    y_train, y_test : pandas.Series
        Labels for the training and test samples; the index of `y_test` is
        shown as the 'Patient' column.
    model : estimator
        Classifier exposing ``fit``, ``predict``, ``score``, ``predict_proba``
        and, once fitted, ``classes_``.

    Returns
    -------
    estimator
        The same `model` object, fitted in place on the training split.
    """
    print('Number of test set: ', X_test.shape[0])

    model.fit(X_train, y_train) # step 2: fit
    y_pred = model.predict(X_test) # step 3: predict
    # step 4: accuracy score. The built-in round() works whether score()
    # returns a NumPy scalar or a plain Python float (which has no .round()).
    print('Accuracy score:', round(model.score(X_test, y_test), 3))

    # Take the class labels from the fitted model instead of hard-coding them,
    # and pass them as `labels` so the report still works when a class is
    # absent from the test split.
    classes = list(model.classes_)
    print(metrics.classification_report(y_test, y_pred, labels=classes,
                                        target_names=[str(c) for c in classes]))

    # Probability assigned to the class that was actually predicted for each
    # sample. Always reading column 1 would report the probability of the
    # second class regardless of the prediction shown next to it.
    proba = np.asarray(model.predict_proba(X_test))
    predicted_col = pd.Index(classes).get_indexer(y_pred)
    y_pred_prob = proba[np.arange(len(y_pred)), predicted_col]

    # show classification result
    result = pd.DataFrame({
        'Patient': y_test.index.to_numpy(),
        'Class': y_test.to_numpy(),
        'Predicted': y_pred,
        'Predicted_Prob': y_pred_prob,
    })
    print(result)
    return model

def _report_cv_accuracy(model, X, y, n_splits, random_state):
    """Print and return the stratified k-fold accuracy scores of `model` on (X, y)."""
    cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Scores: ', np.round(scores,4))
    print('Accuracy: %.3f (%.3f)'% (np.mean(scores),np.std(scores)))
    return scores


def FoldCrossValidation(scaled_df, model, n_splits=10, random_state=123): # For describing model generalization
    """Cross-validate `model` before and after importance-based feature selection.

    Parameters
    ----------
    scaled_df : pandas.DataFrame
        Feature columns followed by a final 'Target' column with the labels.
    model : estimator
        Classifier that exposes ``feature_importances_`` once fitted.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int, default 123
        Seed for the shuffled fold assignment.

    Returns
    -------
    pandas.Series
        Names of the features whose importance is greater than 0, ordered by
        decreasing importance.

    Notes
    -----
    NOTE(review): the importances come from a model fitted outside the
    cross-validation folds, so the "after feature selection" accuracy has
    already seen the held-out samples and is likely optimistic.
    """
    X = scaled_df.iloc[:, :-1]
    y = scaled_df['Target']

    print('Performance of classification model before feature selection')
    print('Number of all features ', X.shape)
    _report_cv_accuracy(model, X, y, n_splits, random_state)

    # Feature selection by using Random Forest.
    # cross_val_score works on clones and never fits `model` itself, so the
    # importances depend on how the caller left `model`. If it is unfitted
    # (attribute unavailable) or was fitted on a different number of columns,
    # fit a clone on this table rather than failing or pairing importances
    # with the wrong feature names. The caller's model is left untouched.
    importance = getattr(model, 'feature_importances_', None)
    if importance is None or len(importance) != X.shape[1]:
        importance = clone(model).fit(X, y).feature_importances_

    importance_score = pd.DataFrame({'miRNA': X.columns, 'Score': importance})
    importance_score = importance_score.sort_values(by='Score', ascending=False)
    miRNA1 = importance_score.loc[importance_score['Score'] > 0, 'miRNA']

    print("\n",'*'*30)
    print('Performance of classification model after feature selection')
    X_selected = scaled_df.loc[:, miRNA1]
    print('Number of Selected features ', X_selected.shape)
    _report_cv_accuracy(model, X_selected, y, n_splits, random_state)

    return miRNA1

In [None]:
# Folder on the mounted Google Drive that holds the input CSV files.
# Keep the trailing '/': file names are appended by plain string concatenation.
base ='/content/drive/MyDrive/ML3_Noiseq_biomarker/'

In [None]:
# 1. Read raw data
# (alternative input used previously: base + 'merge set 2and4.csv')
df = pd.read_csv(base + 'miRNA biomarker q0.9.csv').T
# After transposing, the first row holds the column names: use it as the
# header and keep only the remaining rows as data.
df = df.iloc[1:, :].rename(columns=df.iloc[0])

# Sample annotations: keep the first three columns, strip spaces from 'Type',
# copy it into 'Class' and index the table by sample name.
label_df = (
    pd.read_csv(base + 'Factors set2and4 (3group).csv')
    .iloc[:, :3]
    .assign(Type=lambda frame: frame['Type'].str.replace(' ', ''))
    .assign(Class=lambda frame: frame['Type'])
    .set_index('Sample Name')
)

# Attach the class label as the last column; missing entries become 0.
rawData = pd.concat([df, label_df['Class']], axis=1).fillna(0)
data = rawData

# 2. Show data description
print('\n', '2. Data Description')
DataDescription(data=data)
scaled_df = Normalization(data=data)
print('\n')




 2. Data Description
        Count  Percentage
Class                    
AIS         8      11.111
HSIL       59      81.944
Normal      5       6.944




In [None]:
# 3. Train and test the model
# Split into 67 % training / 33 % test data.
# - random_state makes the split (and the forest below) reproducible on a
#   fresh "Restart & Run All".
# - stratify keeps the class proportions in both splits, so the small AIS and
#   Normal groups are guaranteed to appear in the test set.
print('\n', '3. Performance of Model Classification')
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df.iloc[:, :-1], scaled_df['Target'],
    test_size=0.33,
    stratify=scaled_df['Target'],
    random_state=7,
)
model = RandomForestClassifier(random_state=7) # step 1: choose model/estimator for classification
model = ModelFitPrediction (X_train, X_test, y_train, y_test, model )




 3. Performance of Model Classification
Number of test set:  24
Accuracy score: 0.708
              precision    recall  f1-score   support

         AIS       0.00      0.00      0.00         2
        HSIL       0.81      0.85      0.83        20
      Normal       0.00      0.00      0.00         2

    accuracy                           0.71        24
   macro avg       0.27      0.28      0.28        24
weighted avg       0.67      0.71      0.69        24

   Patient   Class Predicted Predicted_Prob
0   DC048V    HSIL       AIS           0.47
1     037A    HSIL      HSIL           0.87
2     049C     AIS      HSIL           0.57
3     047C    HSIL      HSIL           0.94
4   DC054C    HSIL      HSIL           0.87
5     040A    HSIL      HSIL           0.75
6     055B    HSIL      HSIL           0.87
7     052C    HSIL      HSIL            0.9
8     044B  Normal      HSIL           0.46
9   DC054V    HSIL      HSIL           0.85
10    050A    HSIL      HSIL            0.9
11  

In [None]:
# 4. Show model generalization
# Cross-validates `model` on the whole scaled table and keeps, in `miRNA`,
# the names of the features whose importance score is greater than 0.
print('\n', '4. Model generalization')
miRNA = FoldCrossValidation(scaled_df, model)




 4. Model generalization
Performance of classification model before feature selection
Number of all features  (72, 7)
Scores:  [0.75   0.625  0.7143 0.8571 0.8571 0.8571 0.8571 0.8571 0.7143 0.7143]
Accuracy: 0.780 (0.082)

 ******************************
Performance of classification model after feature selection
Number of Selected features  (72, 7)
Scores:  [0.75   0.625  0.8571 0.8571 1.     0.8571 0.8571 0.8571 0.8571 0.7143]
Accuracy: 0.823 (0.097)


In [None]:
# 5. Generate model with Feature selection
print('\n', '5. Performance of Model Classification after Feature selection')
# Use separate names for the reduced feature tables and fit a clone of the
# model: overwriting X_train/X_test and refitting `model` in place would
# destroy the full-feature split and make `modelSelection` the very same
# object as `model`, so re-running the earlier cells would no longer
# reproduce their results.
X_train_selected = X_train.loc[:, miRNA]
X_test_selected = X_test.loc[:, miRNA]
modelSelection = ModelFitPrediction (X_train_selected, X_test_selected, y_train, y_test, clone(model) )


 5. Performance of Model Classification after Feature selection
Number of test set:  24
Accuracy score: 0.75
              precision    recall  f1-score   support

         AIS       0.00      0.00      0.00         2
        HSIL       0.82      0.90      0.86        20
      Normal       0.00      0.00      0.00         2

    accuracy                           0.75        24
   macro avg       0.27      0.30      0.29        24
weighted avg       0.68      0.75      0.71        24

   Patient   Class Predicted Predicted_Prob
0   DC048V    HSIL      HSIL           0.52
1     037A    HSIL      HSIL           0.84
2     049C     AIS      HSIL           0.56
3     047C    HSIL      HSIL           0.94
4   DC054C    HSIL      HSIL           0.86
5     040A    HSIL      HSIL           0.85
6     055B    HSIL      HSIL           0.88
7     052C    HSIL      HSIL           0.92
8     044B  Normal      HSIL           0.63
9   DC054V    HSIL      HSIL           0.89
10    050A    HSIL      H

In [None]:
# 6. Upsample the AIS and Normal groups to 3 times their original size
# NOTE(review): the rows are resampled with replacement *before* the
# train/test split below, so copies of the same sample can end up in both the
# training and the test set; the reported scores are therefore optimistic.
# DataFrame.append was removed in pandas 2.0, so the pieces are collected in a
# list and concatenated once.
upsampled_parts = []
for c in ['AIS', 'Normal']:
    sampling = data[data['Class'] == c]
    up_sampling = resample(sampling, replace=True, n_samples=sampling.shape[0]*3, random_state=123)
    upsampled_parts.append(up_sampling)
up_df = pd.concat(upsampled_parts)

hsil  = data[data['Class']== 'HSIL']

new_data = pd.concat([hsil, up_df])

# 7. Show data description
DataDescription(data=new_data)
scaled_df = Normalization(data=new_data)
print('\n')


# 8. Train and test the model
# Split into 67 % training / 33 % test data; seeded and stratified so the
# result is reproducible and every class is present in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df.iloc[:, :-1], scaled_df['Target'],
    test_size=0.33,
    stratify=scaled_df['Target'],
    random_state=7,
)
model = RandomForestClassifier(random_state=7) # step 1: choose model/estimator for classification
model = ModelFitPrediction (X_train, X_test, y_train, y_test, model )

# 9. Show model generalization
print('\n', 'Model generalization')
miRNA = FoldCrossValidation(scaled_df, model)

# 10. Generate model with Feature selection
# Separate names and a cloned estimator keep the full-feature split and the
# full-feature `model` intact (refitting `model` in place would make
# `modelSelection` the same object as `model`).
print('\n', 'Model with Feature selection')
X_train_selected = X_train.loc[:, miRNA]
X_test_selected = X_test.loc[:, miRNA]
modelSelection = ModelFitPrediction (X_train_selected, X_test_selected, y_train, y_test, clone(model) )

        Count  Percentage
Class                    
AIS        24      24.490
HSIL       59      60.204
Normal     15      15.306


Number of test set:  33
Accuracy score: 0.848
              precision    recall  f1-score   support

         AIS       1.00      0.60      0.75        10
        HSIL       0.77      1.00      0.87        17
      Normal       1.00      0.83      0.91         6

    accuracy                           0.85        33
   macro avg       0.92      0.81      0.84        33
weighted avg       0.88      0.85      0.84        33

   Patient   Class Predicted Predicted_Prob
0     041B    HSIL      HSIL            0.9
1     049A     AIS       AIS           0.21
2   DC049E     AIS      HSIL            0.9
3     050C  Normal      HSIL            1.0
4     044A  Normal    Normal            0.0
5     048D    HSIL      HSIL           0.55
6     046B    HSIL      HSIL           0.97
7     052A    HSIL      HSIL            1.0
8     044C  Normal    Normal           0.13
9

In [None]:
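# Optional: uncomment the two lines below to save the selected miRNA names
# as a CSV file in the `base` folder.
# output = base + 'Importance_Feature.csv'
# miRNA.to_csv(output)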