In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

In [2]:
# Read the training response labels (tab-separated, no header row): one row per
# sample, holding the sample ID and its response class.
labels_df = pd.read_csv('label_755.txt',sep='\t',header=None)

# Name the columns: 'sample' is the sample ID, 'label' is the target variable
labels_df.columns=['sample','label']

# Encode the target for binary classification: 'R' -> 1, 'NR' -> 0.
# Only the 'label' column is mapped, so a sample ID can never be rewritten by
# accident, and the implicit object -> int downcast performed by
# DataFrame.replace (deprecated in recent pandas releases) is avoided.
label_codes = {'R':1,'NR':0}
unexpected_labels = set(labels_df['label'].unique()) - set(label_codes)
if unexpected_labels:
    # Series.map would silently turn unknown classes into NaN; fail loudly instead.
    raise ValueError("unexpected values in label column: %r"
                     % sorted(unexpected_labels, key=str))
labels_df['label'] = labels_df['label'].map(label_codes)

  labels_df.replace({'R':1,'NR':0}, inplace=True)


In [3]:
# Load the training gene-abundance matrix (tab-separated; the first column of the
# file becomes the row index), transpose it so that every row is one sample, and
# turn the resulting index into a regular 'sample' column to merge on later.
gene_M = (
    pd.read_csv(
        'difgene_abundance_755_55292_train.txt',
        sep='\t',
        low_memory=False,
        index_col=0,
    )
    .T
    .reset_index()
    .rename(columns={'index':'sample'})
)

In [4]:
# Attach the response label to each sample's abundance profile. The inner join on
# 'sample' keeps only the samples present in both gene_M and labels_df.
df = gene_M.merge(labels_df, how='inner', on='sample')

# Target vector, and the feature matrix as a plain array (ID and target columns removed)
labels = df['label']
X = df.drop(columns=['sample','label']).values
X.shape

(755, 55292)

In [5]:
# Tune the 'penalty' parameter (scikit-learn default: 'l2') with a 10-fold
# cross-validated grid search over 'l1' and 'l2', using the liblinear solver.
param_grid = {"penalty":['l1', 'l2']}
gsearch1 = GridSearchCV(LogisticRegression(solver='liblinear', random_state=105),param_grid, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, which is
# ROC AUC here, so it must not be reported as an accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
{'penalty': 'l2'}
best accuracy:0.754926


In [6]:
# Tune the 'solver' parameter (scikit-learn default: 'lbfgs') with a 10-fold
# cross-validated grid search, keeping the L2 penalty fixed.
param_grid = {"solver":['liblinear','sag','saga','lbfgs','newton-cg']}
gsearch1 = GridSearchCV(LogisticRegression(penalty='l2', random_state=105),param_grid, n_jobs= -1,
                        scoring='roc_auc',cv=10,verbose=1)
gsearch1.fit(X,labels)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, which is
# ROC AUC here, so it must not be reported as an accuracy.
print("best ROC AUC:%f" % gsearch1.best_score_)

Fitting 10 folds for each of 5 candidates, totalling 50 fits
{'solver': 'liblinear'}
best accuracy:0.754926


In [7]:
# Final classifier: logistic regression with the penalty and solver reported as best
# by the grid searches, fitted on all training samples.
lr_params = dict(penalty='l2', solver='liblinear', random_state=105)
lr = LogisticRegression(**lr_params)
lr.fit(X, labels)

In [8]:
# Load the gene-abundance matrix of the independent test samples and reshape it the
# same way as the training matrix: transpose so that every row is one sample, then
# expose the index as a regular 'sample' column.
test_64 = (
    pd.read_csv(
        'difgene_abundance_64_55292_test.txt',
        sep='\t',
        low_memory=False,
        index_col=0,
    )
    .T
    .reset_index()
    .rename(columns={'index':'sample'})
)
test_64

Unnamed: 0,sample,gene_3,gene_9,gene_31,gene_62,gene_70,gene_107,gene_132,gene_158,gene_173,...,gene_4662751,gene_4662822,gene_4664176,gene_4664554,gene_4665019,gene_4665056,gene_4665086,gene_4665104,gene_4666141,gene_4666219
0,SRR6000870,0.0,1.210000e-07,0.000000e+00,1.900000e-06,0.000000e+00,0.0,0.0,5.320000e-08,7.980000e-09,...,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
1,SRR6000871,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
2,SRR6000893,0.0,1.890000e-07,1.850000e-07,0.000000e+00,8.730000e-07,0.0,0.0,9.550000e-08,0.000000e+00,...,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
3,SRR6000900,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
4,SRR6000901,0.0,0.000000e+00,0.000000e+00,1.440000e-09,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,ERR2162210,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000008,0.000013,0.000000,0.0,0.00001,0.000000,0.000000,0.000000,0.000000
60,ERR2162213,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00007,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000017,0.000004
61,ERR2162215,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000000,0.000000,0.000007,0.0,0.00000,0.000000,0.000000,0.000021,0.000005
62,ERR2162218,0.0,0.000000e+00,0.000000e+00,1.090000e-07,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00,...,0.00000,0.000025,0.000000,0.000000,0.0,0.00000,0.000007,0.000011,0.000000,0.000019


In [9]:
# Build the test feature matrix from exactly the gene columns, in exactly the order,
# that the model was fitted on (the columns of `df` minus the ID and target columns).
# Simply dropping 'sample' and taking .values is only correct if the test file lists
# the same genes in the same order as the training file, which nothing guarantees.
feature_cols = df.columns.drop(['sample','label'])
missing_genes = feature_cols.difference(test_64.columns)
if len(missing_genes) > 0:
    raise ValueError("test matrix lacks %d training feature column(s), e.g. %r"
                     % (len(missing_genes), list(missing_genes[:5])))
X_test_64 = test_64[feature_cols].values

In [10]:
# Read the response labels of the test samples (tab-separated, no header row)
labels_test_64 = pd.read_csv('label_64.txt',sep='\t',header=None)

# Name the columns: 'sample' is the sample ID, 'label' is the target variable
labels_test_64.columns=['sample','label']

# Encode the target for binary classification: 'R' -> 1, 'NR' -> 0.
# Only the 'label' column is mapped, so a sample ID can never be rewritten by
# accident, and the implicit object -> int downcast performed by
# DataFrame.replace (deprecated in recent pandas releases) is avoided.
label_codes = {'R':1,'NR':0}
unexpected_labels = set(labels_test_64['label'].unique()) - set(label_codes)
if unexpected_labels:
    # Series.map would silently turn unknown classes into NaN; fail loudly instead.
    raise ValueError("unexpected values in label column: %r"
                     % sorted(unexpected_labels, key=str))
labels_test_64['label'] = labels_test_64['label'].map(label_codes)

  labels_test_64.replace({'R':1,'NR':0}, inplace=True)


In [11]:
# Take the test labels in the row order of test_64 (and therefore of X_test_64) by
# looking them up via sample ID. The training data is joined on 'sample'; pairing
# the test features and labels purely by row position would silently score against
# the wrong labels if the two files were ever ordered differently.
label_by_sample = labels_test_64.set_index('sample')['label']
unlabelled = test_64.loc[~test_64['sample'].isin(label_by_sample.index), 'sample']
if len(unlabelled) > 0:
    raise ValueError("no label for test sample(s): %r" % unlabelled.tolist())
# reset_index keeps lab_test_64 a positionally indexed Series, as before.
lab_test_64 = label_by_sample.reindex(test_64['sample']).reset_index(drop=True)
labels_test_64

Unnamed: 0,sample,label
0,SRR6000870,1
1,SRR6000871,1
2,SRR6000893,1
3,SRR6000900,1
4,SRR6000901,1
...,...,...
59,ERR2162210,0
60,ERR2162213,0
61,ERR2162215,0
62,ERR2162218,0


In [12]:
# Score the fitted logistic regression model on the independent test samples
# (features X_test_64 against their labels lab_test_64)
lr.score(X=X_test_64, y=lab_test_64)

0.453125

In [13]:
# Class predicted by the model for every test sample, in the row order of X_test_64.
# NOTE(review): the recorded output of this cell is all 1s, i.e. the model assigns a
# single class to every test sample. The features are passed to the model unscaled,
# which may be contributing - confirm before relying on this model.
lr.predict(X=X_test_64)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])