In [None]:
# Mount Google Drive (Colab only) so the input CSV files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import glob
import os
import warnings

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.utils import resample

warnings.filterwarnings("ignore")

In [None]:
# Function
def DataDescription(data):
    """Print the number and percentage of samples in each class.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per sample and a 'Class' column holding the label.

    Returns
    -------
    None
        The summary table (columns 'Count' and 'Percentage', indexed by
        'Class') is printed, not returned.
    """
    # size() counts rows per class, so NaNs in the feature columns cannot
    # distort the counts and a frame holding only 'Class' still works.
    counts = data.groupby('Class').size()
    stat = pd.DataFrame({'Count': counts})
    stat['Percentage'] = (stat['Count'] / stat['Count'].sum() * 100).round(3)
    print(stat)

def Normalization(data): # Normalized data
    """Scale every feature column with RobustScaler and attach the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample; must contain a 'Class' column. All other columns
        are treated as numeric features.

    Returns
    -------
    pandas.DataFrame
        Scaled features (same column names and index as the input) followed
        by a final 'Target' column holding the original 'Class' values.

    Notes
    -----
    The scaler is fitted on all rows passed in, so calling this before a
    train/test split lets test-set statistics influence the scaling.
    """
    # Select the features by name instead of assuming 'Class' is the last column.
    features = data.drop(columns=['Class'])
    scaler = preprocessing.RobustScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(features),
                             columns=features.columns, index=features.index)
    # Assign positionally: the index may hold duplicates (e.g. after upsampling),
    # and rows are in the same order as `data`.
    scaled_df['Target'] = data['Class'].to_numpy()
    return scaled_df

def ModelFitPrediction(X_train, X_test, y_train, y_test, model): # For evaluating model performance
    """Fit `model` on the training split and print its performance on the test split.

    Prints the test-set size, the accuracy, a per-class classification
    report and a per-sample table of true label, predicted label and the
    probability the model assigned to the predicted label.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices for the training and test samples.
    y_train, y_test : pandas.Series
        Class labels; the index of `y_test` is shown in the 'Patient' column.
    model : estimator
        Object exposing fit, predict, predict_proba, score and, once fitted,
        classes_. It is fitted in place.

    Returns
    -------
    estimator
        The same `model` object, now fitted on the training split.
    """
    print('Number of test set: ', X_test.shape[0])

    model.fit(X_train, y_train) # step 2: fit
    y_pred = model.predict(X_test) # step 3: predict
    # step 4: accuracy score for classification (r2 for regression);
    # float() so rounding also works when score() returns a plain Python float.
    print('Accuracy score:', round(float(model.score(X_test, y_test)), 3))

    # Derive the class list from the data so the report does not fail when a
    # split happens to lack one of the classes.
    labels = np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
    print(metrics.classification_report(y_test, y_pred, labels=labels,
                                        target_names=[str(label) for label in labels],
                                        zero_division=0))

    # Probability of the class that was actually predicted for each sample;
    # the columns of predict_proba follow the order of model.classes_.
    proba = np.asarray(model.predict_proba(X_test))
    class_position = {label: pos for pos, label in enumerate(model.classes_)}
    predicted_cols = [class_position[label] for label in y_pred]
    y_pred_prob = proba[np.arange(len(y_pred)), predicted_cols]

    # show classification result
    result = y_test.reset_index()
    result.columns = ['Patient', 'Class']
    result['Predicted'] = y_pred
    result['Predicted_Prob'] = y_pred_prob
    print(result)
    return model

def _report_cv_accuracy(X, y, model):
    """Run 10-fold stratified cross-validation and print per-fold and mean accuracy."""
    cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('Scores: ', np.round(scores, 4))
    print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
    return scores


def FoldCrossValidation(scaled_df, model, threshold=0.0): # For describing model generalization
    """Cross-validate `model` on all features and on the importance-selected features.

    Parameters
    ----------
    scaled_df : pandas.DataFrame
        Feature columns plus a 'Target' column with the class labels.
    model : estimator
        An already fitted estimator exposing feature_importances_, fitted on
        exactly the feature columns of `scaled_df`. Cross-validation does not
        refit this object in place; it is only used as the template estimator
        and as the source of the importances.
    threshold : float, default 0.0
        Features whose importance is strictly greater than this are kept.

    Returns
    -------
    pandas.Series
        Names of the selected features ('miRNA'), ordered by decreasing
        importance and indexed by their original column position.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of feature
        columns (the model was fitted on a different feature set).
    """
    y = scaled_df['Target']

    print('Performance of classification model before feature selection')
    X = scaled_df.drop(columns=['Target'])
    print('Number of all features ', X.shape)
    _report_cv_accuracy(X, y, model)

    # Feature selection by using Random Forest
    importance = np.asarray(model.feature_importances_, dtype=float)
    if importance.shape[0] != X.shape[1]:
        # Without this check a mismatch would silently pair names with the wrong scores.
        raise ValueError(
            'model has %d feature importances but scaled_df has %d feature columns; '
            'fit the model on the same features first' % (importance.shape[0], X.shape[1]))
    importance_score = pd.DataFrame({'miRNA': X.columns, 'Score': importance})
    # mergesort keeps tied scores in their original column order.
    importance_score = importance_score.sort_values(by='Score', ascending=False, kind='mergesort')
    miRNA1 = importance_score.loc[importance_score['Score'] > threshold, 'miRNA']

    print("\n", '*'*30)
    print('Performance of classification model after feature selection')
    X = scaled_df.loc[:, miRNA1]
    print('Number of Selected features ', X.shape)
    _report_cv_accuracy(X, y, model)

    return miRNA1

In [None]:
# Folder on the mounted Drive that holds the input CSV files. File names are appended
# with plain string concatenation below, so the trailing slash is required.
base ='/content/drive/MyDrive/ML3_Noiseq_biomarker/'

In [None]:
# 1. Read Raw data
# The expression table is transposed so that each row is a sample; the first
# transposed row holds the feature (miRNA) names and becomes the column header.
# df = pd.read_csv(base+'merge set 2and4.csv').T
df = pd.read_csv(base+'miRNA biomarker q0.7.csv').T
df = df.rename(columns=df.iloc[0]).iloc[1:,:]

# Sample annotations: only the first three columns are used; spaces inside the
# 'Type' value are removed before it is copied into 'Class'.
label_df = pd.read_csv(base+'Factors set2and4 (3group).csv').iloc[:,:3]
label_df['Type'] = label_df['Type'].str.replace(' ', '', regex=False)
label_df['Class'] = label_df['Type']
label_df = label_df.set_index('Sample Name')

rawData = pd.concat([df,label_df['Class']], axis=1)
# A sample without a label must not be zero-filled: fillna(0) on the label column
# would silently create an extra class named 0. Report and drop such samples.
unlabeled = rawData['Class'].isna()
if unlabeled.any():
    print('Dropping samples without a class label:', list(rawData.index[unlabeled]))
# After the drop only feature columns can still contain NaN, so fillna(0) touches features only.
rawData = rawData.loc[~unlabeled].fillna(0)
data = rawData.copy()

# 2. Show data description
print('\n', '2. Data Description')
DataDescription(data=data)
scaled_df = Normalization(data=data)
print('\n')




 2. Data Description
        Count  Percentage
Class                    
AIS         8      11.111
HSIL       59      81.944
Normal      5       6.944




In [None]:
# 3. Train and Test model 
# Train and Test spliting with 0.67 and 0.33 respectively.
# The split is stratified because the classes are very unbalanced: an unstratified
# draw can leave a class out of the training or the test set. Fixed seeds make the
# split and the forest reproducible on a fresh run.
print('\n', '3. Performance of Model Classification')
X_train, X_test, y_train, y_test = train_test_split(scaled_df.iloc[:,:-1], scaled_df['Target'],
                                                    test_size=0.33,
                                                    stratify=scaled_df['Target'],
                                                    random_state=7)
model = RandomForestClassifier(random_state=7) # step 1: choose model/estimator for classification
model = ModelFitPrediction(X_train, X_test, y_train, y_test, model)




 3. Performance of Model Classification
Number of test set:  24
Accuracy score: 0.792
              precision    recall  f1-score   support

         AIS       0.00      0.00      0.00         2
        HSIL       0.79      1.00      0.88        19
      Normal       0.00      0.00      0.00         3

    accuracy                           0.79        24
   macro avg       0.26      0.33      0.29        24
weighted avg       0.63      0.79      0.70        24

   Patient   Class Predicted Predicted_Prob
0     048B    HSIL      HSIL           0.91
1     041A    HSIL      HSIL           0.86
2   DC049V     AIS      HSIL           0.94
3   DC055E    HSIL      HSIL           0.97
4     046A    HSIL      HSIL           0.98
5     044A  Normal      HSIL           0.56
6     050C  Normal      HSIL           0.78
7     048D    HSIL      HSIL           0.91
8     051A    HSIL      HSIL           0.83
9     041B    HSIL      HSIL           0.93
10    052A    HSIL      HSIL           0.98
11  

In [None]:
# 4. Show model generalization
# FoldCrossValidation reads model.feature_importances_, so `model` must be the estimator
# already fitted in step 3. The returned Series holds the names of the selected features.
print('\n', '4. Model generalization')
miRNA = FoldCrossValidation(scaled_df, model)




 4. Model generalization
Performance of classification model before feature selection
Number of all features  (72, 38)
Scores:  [0.75   0.75   1.     0.8571 0.8571 0.8571 0.8571 0.8571 0.8571 0.7143]
Accuracy: 0.836 (0.077)

 ******************************
Performance of classification model after feature selection
Number of Selected features  (72, 38)
Scores:  [0.75   0.75   0.8571 0.8571 0.8571 0.8571 0.8571 0.8571 0.8571 0.7143]
Accuracy: 0.821 (0.055)


In [None]:
# 5. Generate model with Feature selection
print('\n', '5. Performance of Model Classification after Feature selection')
X_train = X_train.loc[:,miRNA]
X_test = X_test.loc[:,miRNA]
# Fit an unfitted copy with the same hyper-parameters: refitting `model` itself would
# overwrite the full-feature fit, and `modelSelection` would be the very same object.
modelSelection = ModelFitPrediction(X_train, X_test, y_train, y_test, clone(model))


 5. Performance of Model Classification after Feature selection
Number of test set:  24
Accuracy score: 0.792
              precision    recall  f1-score   support

         AIS       0.00      0.00      0.00         2
        HSIL       0.79      1.00      0.88        19
      Normal       0.00      0.00      0.00         3

    accuracy                           0.79        24
   macro avg       0.26      0.33      0.29        24
weighted avg       0.63      0.79      0.70        24

   Patient   Class Predicted Predicted_Prob
0     048B    HSIL      HSIL           0.93
1     041A    HSIL      HSIL           0.88
2   DC049V     AIS      HSIL           0.88
3   DC055E    HSIL      HSIL           0.95
4     046A    HSIL      HSIL            1.0
5     044A  Normal      HSIL           0.69
6     050C  Normal      HSIL           0.83
7     048D    HSIL      HSIL           0.95
8     051A    HSIL      HSIL           0.93
9     041B    HSIL      HSIL           0.95
10    052A    HSIL      

In [None]:
# 6. 3 times Upsampling on AIS and Normal group
# NOTE(review): the minority classes are oversampled before the train/test split and
# before cross-validation, so copies of one patient can appear in both the training
# and the evaluation data. The scores printed below are therefore optimistic; to get
# an unbiased estimate, split first and resample only the training part.

# DataFrame.append was removed in pandas 2.0; collect the pieces and concatenate once.
minority_frames = []
for c in ['AIS', 'Normal']:
    sampling = data[data['Class'] == c]
    up_sampling = resample(sampling, replace=True, n_samples=sampling.shape[0]*3, random_state=123)
    minority_frames.append(up_sampling)
up_df = pd.concat(minority_frames)

hsil  = data[data['Class']== 'HSIL']

new_data = pd.concat([hsil, up_df])

# 7. Show data description
DataDescription(data=new_data)
scaled_df = Normalization(data=new_data)
print('\n')


# 8. Train and Test model 
# Train and Test spliting with 0.67 and 0.33 respectively (stratified and seeded so
# that every class is present in both parts and the run is reproducible).
X_train, X_test, y_train, y_test = train_test_split(scaled_df.iloc[:,:-1], scaled_df['Target'],
                                                    test_size=0.33,
                                                    stratify=scaled_df['Target'],
                                                    random_state=7)
model = RandomForestClassifier(random_state=7) # step 1: choose model/estimator for classification
model = ModelFitPrediction(X_train, X_test, y_train, y_test, model)

# 9. Show model generalization
print('\n', 'Model generalization')
miRNA = FoldCrossValidation(scaled_df, model)

# 10. Generate model with Feature selection
print('\n', 'Model with Feature selection')
X_train = X_train.loc[:,miRNA]
X_test = X_test.loc[:,miRNA]
# Fit an unfitted copy so the full-feature `model` is not overwritten by this refit.
modelSelection = ModelFitPrediction(X_train, X_test, y_train, y_test, clone(model))

        Count  Percentage
Class                    
AIS        24      24.490
HSIL       59      60.204
Normal     15      15.306


Number of test set:  33
Accuracy score: 0.879
              precision    recall  f1-score   support

         AIS       1.00      0.62      0.77         8
        HSIL       0.88      0.96      0.92        24
      Normal       0.50      1.00      0.67         1

    accuracy                           0.88        33
   macro avg       0.79      0.86      0.79        33
weighted avg       0.90      0.88      0.88        33

   Patient   Class Predicted Predicted_Prob
0     039A    HSIL      HSIL           0.77
1   DC049E     AIS      HSIL            0.7
2     048C    HSIL      HSIL           0.94
3     043D    HSIL    Normal           0.35
4     054C    HSIL      HSIL           0.62
5   DC055E    HSIL      HSIL           0.86
6   DC054E    HSIL      HSIL           0.72
7     047C    HSIL      HSIL           0.83
8   DC049C     AIS      HSIL           0.87
9

In [None]:
# output = base + 'Importance_Feature.csv'
# miRNA.to_csv(output)