In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

In [2]:
# Read the label data from a text file (tab-separated) into a DataFrame
labels_df = pd.read_csv('label_755.txt',sep='\t',header=None)

# Rename the columns to 'sample' for the sample ID and 'label' for the target variable
labels_df.columns=['sample','label']

# Replace 'R' with 1 and 'NR' with 0 in the 'label' column for binary classification
labels_df.replace({'R':1,'NR':0}, inplace=True)

  labels_df.replace({'R':1,'NR':0}, inplace=True)


In [3]:
labels_df

Unnamed: 0,sample,label
0,ERR12405911,1
1,ERR12405912,1
2,ERR12405914,1
3,ERR12405915,1
4,ERR12405922,1
...,...,...
750,SRR16168977,0
751,SRR16168981,0
752,SRR16168982,0
753,SRR16168983,0


In [4]:
# Read the gene abundance data from a text file (space-separated) into a pandas DataFrame
gene_M=pd.read_csv('difgene_abundance_755_5761_train.txt',sep='\t',low_memory=False,index_col=0)
gene_M=gene_M.T
gene_M = gene_M.reset_index()
gene_M = gene_M.rename(columns={'index':'sample'})

In [5]:
gene_M

Unnamed: 0,sample,gene_2182,gene_2384,gene_2674,gene_3335,gene_3438,gene_3821,gene_4022,gene_4758,gene_4980,...,gene_4589493,gene_4590121,gene_4594170,gene_4596919,gene_4606181,gene_4617108,gene_4626946,gene_4630324,gene_4650733,gene_4658143
0,ERR12405911,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
1,ERR12405912,0.0,1.500134e-07,9.987480e-08,0.000002,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
2,ERR12405914,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
3,ERR12405915,0.0,0.000000e+00,0.000000e+00,0.000000,3.331724e-09,5.453582e-08,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
4,ERR12405922,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,7.344955e-09,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750,SRR15373195,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000006,0.0,0.0,0.0,0.0,0.000000,0.0
751,SRR15373196,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,3.344625e-08,0.0,5.821528e-08,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
752,SRR15373199,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,6.454162e-09,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000004,0.0
753,SRR15373200,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0


In [6]:
# Merge the gene abundance DataFrame (gene_M) with the labels DataFrame (labels_df) on the 'sample' column.
df=pd.merge(gene_M,labels_df,on='sample',how='inner')
labels =df['label']
X=df.drop(['sample','label'],axis=1).values

In [7]:
X.shape

(755, 5761)

In [8]:
# Optimized the 'n_estimators' parameter
param_test1 = {"n_estimators":range(1,600,100)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
{'n_estimators': 301}
best accuracy:0.748967


In [9]:
# Optimized the 'n_estimators' parameter
param_test1 = {"n_estimators":range(201,401,10)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'n_estimators': 261}
best accuracy:0.750597


In [10]:
# Optimized the 'n_estimators' parameter
param_test1 = {"n_estimators":range(251,271,1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'n_estimators': 265}
best accuracy:0.751008


In [11]:
# Optimized the 'max_depth' parameter
param_test1 = {"max_depth":range(1,100,5)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
#print(gsearch1.cv_results_)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'max_depth': 46}
best accuracy:0.751837


In [12]:
# Optimized the 'max_depth' parameter
param_test1 = {"max_depth":range(41,51,1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
#print(gsearch1.cv_results_)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
{'max_depth': 49}
best accuracy:0.752147


In [13]:
# Optimized the 'min_samples_split' parameter
param_test1 = {"min_samples_split":range(2,30,1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, max_depth=49, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 28 candidates, totalling 280 fits
{'min_samples_split': 2}
best accuracy:0.752147


In [14]:
# Optimized the 'max_features' parameter
param_test1 = {"max_features":range(1,100,5)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, max_depth=49, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'max_features': 71}
best accuracy:0.761651


In [15]:
# Optimized the 'max_features' parameter
param_test1 = {"max_features":range(61,81,1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, max_depth=49, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'max_features': 69}
best accuracy:0.762508


In [16]:
# Optimized the 'min_samples_leaf' parameter
param_test1 = {"min_samples_leaf":range(1,50,1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=265, max_depth=49, max_features=69, random_state=123),param_grid=param_test1, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
#print(gsearch1.cv_results_)
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 49 candidates, totalling 490 fits
{'min_samples_leaf': 1}
best accuracy:0.762508


In [17]:
# Create a random forest classifier
rfc = RandomForestClassifier(n_estimators=265, max_depth=49, max_features=69, random_state=123)
rfc.fit(X, labels)

In [18]:
# Read in the gene abundance matrix for the independent test samples
test_64=pd.read_csv('difgene_abundance_64_5761_test.txt',sep='\t',low_memory=False,index_col=0)
test_64=test_64.T
test_64 = test_64.reset_index()
test_64 = test_64.rename(columns={'index':'sample'})
test_64

Unnamed: 0,sample,gene_2182,gene_2384,gene_2674,gene_3335,gene_3438,gene_3821,gene_4022,gene_4758,gene_4980,...,gene_4589493,gene_4590121,gene_4594170,gene_4596919,gene_4606181,gene_4617108,gene_4626946,gene_4630324,gene_4650733,gene_4658143
0,SRR6000870,0.000000e+00,7.930000e-09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,SRR6000871,0.000000e+00,0.000000e+00,0.000000e+00,1.930000e-08,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,SRR6000893,2.640000e-07,0.000000e+00,8.690000e-08,3.820000e-06,0.000000e+00,0.000000e+00,0.000000e+00,3.000000e-07,0.000000e+00,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,SRR6000900,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.870000e-07,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4,SRR6000901,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.040000e-08,5.430000e-09,0.000000e+00,0.000000e+00,2.780000e-09,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,ERR2162210,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.760000e-08,...,0.000000,0.0,0.000006,0.000000,0.000000,0.000028,0.000000,0.0,0.000005,0.000000
60,ERR2162213,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.330000e-09,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.000000,0.000004,0.000000,0.000000,0.000003,0.0,0.000000,0.000003
61,ERR2162215,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.300000e-09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000021,0.0,0.000050,0.000010,0.000006,0.000000,0.000004,0.0,0.000000,0.000005
62,ERR2162218,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.690000e-08,0.000000e+00,0.000000e+00,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000028,0.0,0.000000,0.000000


In [19]:
X_test_64=test_64.drop(['sample'],axis=1).values

In [20]:
X_test_64.shape

(64, 5761)

In [21]:
# Read in the response labels for the test samples
labels_test_64 = pd.read_csv('label_64.txt',sep='\t',header=None)

# Rename the columns to 'sample' for the sample ID and 'label' for the target variable
labels_test_64.columns=['sample','label']

# Replace 'R' with 1 and 'NR' with 0 in the 'label' column for binary classification
labels_test_64.replace({'R':1,'NR':0}, inplace=True)

  labels_test_64.replace({'R':1,'NR':0}, inplace=True)


In [22]:
lab_test_64 =labels_test_64['label']
labels_test_64

Unnamed: 0,sample,label
0,SRR6000870,1
1,SRR6000871,1
2,SRR6000893,1
3,SRR6000900,1
4,SRR6000901,1
...,...,...
59,ERR2162210,0
60,ERR2162213,0
61,ERR2162215,0
62,ERR2162218,0


In [23]:
# Calculate the prediction accuracy of the test samples using the RF model
rfc.score(X_test_64, lab_test_64)

0.734375

In [24]:
# Predicted classification result for each sample
rfc.predict(X_test_64)

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])