In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

In [2]:
# Read the training labels (tab-separated, no header row) into a DataFrame
labels_df = pd.read_csv('label_755.txt', sep='\t', header=None)

# Name the columns: 'sample' for the sample ID and 'label' for the target variable
labels_df.columns = ['sample', 'label']

# Encode the response as a binary target: 'R' -> 1, 'NR' -> 0.
# Only the 'label' column is recoded, so sample IDs can never be rewritten, and
# Series.map avoids the object-dtype downcasting that a frame-wide
# DataFrame.replace(..., inplace=True) relies on.
label_codes = labels_df['label'].map({'R': 1, 'NR': 0})

# map() yields NaN for anything other than 'R'/'NR'; stop here rather than
# passing an unexpected label on to the classifier.
if label_codes.isna().any():
    unexpected = sorted(labels_df.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError("Unexpected label values in label_755.txt: %s" % unexpected)

labels_df['label'] = label_codes.astype(int)

  labels_df.replace({'R':1,'NR':0}, inplace=True)


In [3]:
# Display the label table to check the sample IDs and the 0/1 encoding of 'label'
labels_df

Unnamed: 0,sample,label
0,ERR12405911,1
1,ERR12405912,1
2,ERR12405914,1
3,ERR12405915,1
4,ERR12405922,1
...,...,...
750,SRR16168977,0
751,SRR16168981,0
752,SRR16168982,0
753,SRR16168983,0


In [4]:
# Load the training gene-abundance matrix (tab-separated, first column used as the
# row index), transpose it so that each row is one sample, and turn the sample IDs
# into a regular 'sample' column so the table can be joined with the labels.
gene_M = (
    pd.read_csv(
        'difgene_abundance_755_3522_train.txt',
        sep='\t',
        low_memory=False,
        index_col=0,
    )
    .T
    .reset_index()
    .rename(columns={'index': 'sample'})
)

In [5]:
# Display the transposed abundance table: one row per sample, one column per gene
gene_M

Unnamed: 0,sample,gene_2182,gene_2674,gene_4758,gene_4980,gene_12871,gene_21424,gene_26480,gene_28036,gene_30817,...,gene_4564884,gene_4589493,gene_4590121,gene_4594170,gene_4596919,gene_4606181,gene_4617108,gene_4626946,gene_4630324,gene_4658143
0,ERR12405911,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,6.441839e-08,2.641430e-08,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,ERR12405912,0.0,9.987480e-08,0.0,0.000000e+00,0.0,9.028681e-08,0.000000e+00,1.285298e-07,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,ERR12405914,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,6.353634e-08,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,ERR12405915,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,8.484399e-09,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,ERR12405922,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,SRR15373195,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,1.883819e-07,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000006,0.0,0.0,0.0,0.0,0.0
751,SRR15373196,0.0,0.000000e+00,0.0,5.821528e-08,0.0,0.000000e+00,1.024049e-08,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
752,SRR15373199,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,9.554159e-09,0.000000e+00,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
753,SRR15373200,0.0,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,4.408764e-06,2.819472e-08,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge the gene abundance DataFrame (gene_M) with the labels DataFrame (labels_df)
# on the 'sample' column so that features and labels share the same row order.
# validate='one_to_one' makes pandas raise if a sample ID is duplicated on either
# side, which would otherwise silently multiply rows.
df = pd.merge(gene_M, labels_df, on='sample', how='inner', validate='one_to_one')

# An inner join silently discards samples that have no label; fail loudly instead
# of training on a smaller cohort than the one that was loaded.
assert len(df) == len(gene_M), (
    "%d sample(s) in gene_M have no matching label" % (len(gene_M) - len(df))
)

# Target vector and feature matrix (every column except the ID and the label)
labels = df['label']
X = df.drop(['sample', 'label'], axis=1).values

In [7]:
# Dimensions of the training feature matrix: (number of samples, number of genes)
X.shape

(755, 3522)

In [8]:
# Tune 'n_estimators' with a coarse sweep (1, 11, ..., 491) using 10-fold
# cross-validation scored by ROC AUC.
param_test1 = {"n_estimators": range(1, 500, 10)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2345),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
{'n_estimators': 321}
best accuracy:0.737890


In [9]:
# Refine 'n_estimators' one step at a time over 311..330 using 10-fold
# cross-validation scored by ROC AUC.
param_test1 = {"n_estimators": range(311, 331, 1)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2345),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'n_estimators': 320}
best accuracy:0.738178


In [10]:
# Tune 'max_depth' over 1..49 with n_estimators fixed at 320, using 10-fold
# cross-validation scored by ROC AUC.
param_test1 = {"max_depth": range(1, 50, 1)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=320, random_state=2345),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 49 candidates, totalling 490 fits
{'max_depth': 38}
best accuracy:0.737298


In [11]:
# Tune 'min_samples_split' over 2..29 with n_estimators fixed at 320, using
# 10-fold cross-validation scored by ROC AUC.
param_test1 = {"min_samples_split": range(2, 30, 1)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=320, random_state=2345),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 28 candidates, totalling 280 fits
{'min_samples_split': 24}
best accuracy:0.740364


In [12]:
# Tune 'max_features' over 1..49 with n_estimators=320 and min_samples_split=24
# fixed, using 10-fold cross-validation scored by ROC AUC.
param_test1 = {"max_features": range(1, 50, 1)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=320,
        min_samples_split=24,
        random_state=2345,
    ),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 49 candidates, totalling 490 fits
{'max_features': 2}
best accuracy:0.749940


In [13]:
# Tune 'min_samples_leaf' over 1..49 with n_estimators=320, min_samples_split=24
# and max_features=2 fixed, using 10-fold cross-validation scored by ROC AUC.
param_test1 = {"min_samples_leaf": range(1, 50, 1)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=320,
        min_samples_split=24,
        max_features=2,
        random_state=2345,
    ),
    param_grid=param_test1,
    n_jobs=-1,
    scoring='roc_auc',
    cv=10,
    verbose=1,
)
gsearch1.fit(X, labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it must not be reported as accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 49 candidates, totalling 490 fits
{'min_samples_leaf': 1}
best accuracy:0.749940


In [14]:
# Build the final random forest and fit it on the full training matrix.
# n_estimators, min_samples_split and max_features are set explicitly;
# max_depth and min_samples_leaf are left at the library defaults.
rfc = RandomForestClassifier(
    n_estimators=320,
    min_samples_split=24,
    max_features=2,
    random_state=2345,
)
rfc.fit(X, labels)

In [15]:
# Load the gene-abundance matrix of the independent test samples (tab-separated,
# first column used as the row index), transpose it to one row per sample, and
# expose the sample IDs as a 'sample' column.
test_64 = (
    pd.read_csv(
        'difgene_abundance_64_3522_test.txt',
        sep='\t',
        low_memory=False,
        index_col=0,
    )
    .T
    .reset_index()
    .rename(columns={'index': 'sample'})
)
test_64

Unnamed: 0,sample,gene_2182,gene_2674,gene_4758,gene_4980,gene_12871,gene_21424,gene_26480,gene_28036,gene_30817,...,gene_4564884,gene_4589493,gene_4590121,gene_4594170,gene_4596919,gene_4606181,gene_4617108,gene_4626946,gene_4630324,gene_4658143
0,SRR6000870,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,7.880000e-07,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,SRR6000871,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,SRR6000893,2.640000e-07,8.690000e-08,3.000000e-07,0.000000e+00,0.000000e+00,0.0,3.260000e-07,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,SRR6000900,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,1.700000e-07,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,SRR6000901,0.000000e+00,0.000000e+00,0.000000e+00,2.780000e-09,0.000000e+00,0.0,6.540000e-07,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,ERR2162210,0.000000e+00,0.000000e+00,0.000000e+00,1.760000e-08,0.000000e+00,0.0,6.500000e-07,0.000000e+00,0.0,...,0.000000,0.000000,0.0,0.000006,0.000000,0.000000,0.000028,0.000000,0.0,0.000000
60,ERR2162213,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.860000e-08,0.0,0.000000e+00,6.700000e-09,0.0,...,0.000000,0.000000,0.0,0.000000,0.000004,0.000000,0.000000,0.000003,0.0,0.000003
61,ERR2162215,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,1.260000e-08,0.000000e+00,0.0,...,0.000004,0.000021,0.0,0.000050,0.000010,0.000006,0.000000,0.000004,0.0,0.000005
62,ERR2162218,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,7.630000e-09,0.000000e+00,0.0,...,0.000006,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000028,0.0,0.000000


In [16]:
# The model was fitted on a bare NumPy array, so predictions are only meaningful
# if the test columns are in exactly the same order as the training columns.
# Select the test features by the training column order (gene_M minus the ID
# column); a gene missing from the test table raises a KeyError instead of
# silently shifting features.
feature_cols = gene_M.columns.drop('sample')
X_test_64 = test_64[feature_cols].values

In [17]:
# Dimensions of the test feature matrix: (number of samples, number of genes)
X_test_64.shape

(64, 3522)

In [18]:
# Read the response labels for the test samples (tab-separated, no header row)
labels_test_64 = pd.read_csv('label_64.txt', sep='\t', header=None)

# Name the columns: 'sample' for the sample ID and 'label' for the target variable
labels_test_64.columns = ['sample', 'label']

# Encode the response as a binary target: 'R' -> 1, 'NR' -> 0.
# Only the 'label' column is recoded, so sample IDs can never be rewritten, and
# Series.map avoids the object-dtype downcasting that a frame-wide
# DataFrame.replace(..., inplace=True) relies on.
test_label_codes = labels_test_64['label'].map({'R': 1, 'NR': 0})

# map() yields NaN for anything other than 'R'/'NR'; stop here rather than
# scoring the model against an unexpected label.
if test_label_codes.isna().any():
    unexpected = sorted(
        labels_test_64.loc[test_label_codes.isna(), 'label'].astype(str).unique()
    )
    raise ValueError("Unexpected label values in label_64.txt: %s" % unexpected)

labels_test_64['label'] = test_label_codes.astype(int)

  labels_test_64.replace({'R':1,'NR':0}, inplace=True)


In [19]:
# Align the test labels to the row order of test_64 (the table the test feature
# matrix is built from) by joining on 'sample', as is done for the training data.
# Taking the labels in label-file order is only correct if both files list the
# samples in the same order.
lab_test_64 = test_64[['sample']].merge(
    labels_test_64, on='sample', how='left', validate='one_to_one'
)['label']

# A left join leaves NaN where a test sample has no label; fail loudly in that case
assert lab_test_64.notna().all(), "some test samples have no matching label"

labels_test_64

Unnamed: 0,sample,label
0,SRR6000870,1
1,SRR6000871,1
2,SRR6000893,1
3,SRR6000900,1
4,SRR6000901,1
...,...,...
59,ERR2162210,0
60,ERR2162213,0
61,ERR2162215,0
62,ERR2162218,0


In [20]:
# Score the fitted random forest on the independent test samples.
# Note: the hyper-parameters above were selected by cross-validated ROC AUC
# (scoring='roc_auc'), so this number is not directly comparable to those scores
# if score() reports accuracy, as the original comment on this cell states.
rfc.score(X_test_64, lab_test_64)

0.75

In [21]:
# Predicted class for each test sample, in the row order of X_test_64
# (labels were encoded as 1 = 'R', 0 = 'NR')
rfc.predict(X_test_64)

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0])