In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score
import pandas as pd

# Load data
data = 'data_set.csv'
df = pd.read_csv(data, header=0)
X = df.drop(['ID_REF', '!Sample_source_name_ch1', 'Accession', 'Tissue Group', 'Tissue Group ID', 'Gender', 'Individual', 'Tissue', 'Disease state'], axis=1)
y = df['Disease state']

# Encoding categorical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Impute missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)  # Convert back to DataFrame to keep column names

# Data balancing with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded) # Use the encoded target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Feature selection using RFECV with XGBoost
xgb_model = XGBClassifier(random_state=42)
rfecv = RFECV(estimator=xgb_model, step=1, cv=StratifiedKFold(10), scoring='accuracy', min_features_to_select=5)
X_train_selected = rfecv.fit_transform(X_train, y_train)
X_test_selected = rfecv.transform(X_test)

# Print selected features
selected_features = X.columns[rfecv.support_]
print(f"Selected Features: {selected_features}")

# Train XGBoost model on selected features
xgb_model.fit(X_train_selected, y_train)

# Get feature importances (hub genes)
feature_importances = pd.Series(xgb_model.feature_importances_, index=selected_features).sort_values(ascending=False)

# Print top hub genes
print('Top Hub Genes:')
print(feature_importances.head(10))

# Ensemble model using XGBoost as a base classifier
lr = LogisticRegression(max_iter=1000)
svm = SVC(probability=True)
ensemble = VotingClassifier(estimators=[('lr', lr), ('xgb', xgb_model), ('svm', svm)], voting='soft')
ensemble.fit(X_train_selected, y_train)

# Predict
y_pred = ensemble.predict(X_test_selected)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Model Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Selected Features: Index(['ILMN_1343291', 'ILMN_1343295', 'ILMN_1651232', 'ILMN_1651237',
       'ILMN_1651268', 'ILMN_1651281', 'ILMN_1651282', 'ILMN_1651288',
       'ILMN_1651292', 'ILMN_1651303', 'ILMN_1651325', 'ILMN_1651336',
       'ILMN_1651358', 'ILMN_1651364', 'ILMN_1651373', 'ILMN_1651378',
       'ILMN_1651395', 'ILMN_1651403', 'ILMN_1651430', 'ILMN_1651433',
       'ILMN_1651447', 'ILMN_1651464', 'ILMN_1651490', 'ILMN_1651496',
       'ILMN_1651498', 'ILMN_1651557', 'ILMN_1651567', 'ILMN_1651569',
       'ILMN_1651574', 'ILMN_1651576', 'ILMN_1651599', 'ILMN_1651606',
       'ILMN_1651656', 'ILMN_1651672', 'ILMN_1651681', 'ILMN_1651699',
       'ILMN_1651715', 'ILMN_1651719', 'ILMN_1651735', 'ILMN_1651776',
       'ILMN_1651789', 'ILMN_1651792', 'ILMN_1651799', 'ILMN_1651817',
       'ILMN_1651838', 'ILMN_1651848', 'ILMN_1651878', 'ILMN_1651958',
       'ILMN_1652128', 'ILMN_1652164', 'ILMN_1652181', 'ILMN_1652218',
       'ILMN_1652313', 'ILMN_1652333', 'ILMN_1652412', 'Ag

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Accuracy: 0.8081
Recall: 0.8081
F1 Score: 0.8078
