In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import pandas as pd

# Load data
data = 'data_set.csv'
df = pd.read_csv(data, header=0)
X = df.drop(['ID_REF', '!Sample_source_name_ch1', 'Accession', 'Tissue Group', 'Tissue Group ID', 'Gender', 'Individual', 'Tissue', 'Disease state'], axis=1)
y = df['Disease state']

# Impute missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)  # Convert back to DataFrame to keep column names

# Data balancing with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Feature selection using RFECV with GBM
gbm_model = GradientBoostingClassifier(random_state=42)
rfecv = RFECV(estimator=gbm_model, step=1, cv=StratifiedKFold(5), scoring='accuracy')
X_train_selected = rfecv.fit_transform(X_train, y_train)
X_test_selected = rfecv.transform(X_test)

# Feature Importances and Hub Genes
gbm_model.fit(X_train_selected, y_train)  # Ensure the GBM model is refit for feature importance extraction
feature_importances = pd.Series(gbm_model.feature_importances_, index=X.columns[rfecv.support_]).sort_values(ascending=False)

# Ensemble model using GBM as a base classifier
lr = LogisticRegression(max_iter=1000)
svm = SVC(probability=True)
ensemble = VotingClassifier(estimators=[('lr', lr), ('gbm', gbm_model), ('svm', svm)], voting='soft')
ensemble.fit(X_train_selected, y_train)

# Predict
y_pred = ensemble.predict(X_test_selected)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print results
print(f'Model Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Precision: {precision:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Top 10 Hub Genes by Feature Importance:')
print(feature_importances.head(10))


Model Accuracy: 0.8182
Recall: 0.8182
Precision: 0.8167
F1 Score: 0.8169
Top 10 Hub Genes by Feature Importance:
Age             0.174338
ILMN_1651799    0.064604
ILMN_1651699    0.045618
ILMN_1651358    0.029579
ILMN_1651557    0.025893
ILMN_1651719    0.020764
ILMN_1651433    0.020287
ILMN_1651339    0.020227
ILMN_1651817    0.019007
ILMN_1651715    0.016023
dtype: float64
