In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [8]:
# Load the dataset from an Excel file
df = pd.read_excel('/kaggle/input/med-cs-virus/resistance_dataset.xlsx')

# Rename the target column for clarity
df.rename({'2F5 Resistance Binary outcom': 'BinaryOutcome'}, axis=1, inplace=True)

In [9]:
# Display the first few rows of the dataframe
df_head = df.head(3)

In [10]:
# Create an array of sequence positions
sequence_positions = np.arange(1, 857)

# Find the mode values for each sequence position
mode_values = df[sequence_positions].mode().iloc[0]

# Replace '-' values with the mode for each sequence position
for column in sequence_positions:
    df[column].replace('-', mode_values[column], inplace=True)

# Identify columns with all '-' values and drop them
null_columns = mode_values[mode_values == '-'].index.to_list()
df.drop(columns=null_columns, inplace=True)

In [11]:
# Function to optimize an XGBoost classifier using GridSearchCV
def optimize_xgb_classifier(X, y):
    # Define the hyperparameters to optimize
    hyperparameters = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
    }

    # Create a GridSearchCV object
    xgb_model = XGBClassifier()
    grid_search = GridSearchCV(xgb_model, hyperparameters, scoring='accuracy', n_jobs=-1)

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X, y)

    # Return the best XGBClassifier model
    return grid_search.best_estimator_

In [12]:
# Create a copy of the dataframe for modeling
model_df = df.copy()

# Drop the 'Virus name' column as it's not needed for modeling
model_df.drop('Virus name', axis=1, inplace=True)

# Identify categorical columns based on available positions
categorical_columns = list(set(sequence_positions) - set(null_columns))

# Convert categorical columns to numerical using Label Encoding
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    model_df[column] = le.fit_transform(model_df[column])
    label_encoders[column] = le

# Split the data into features (X) and target (y)
X = model_df.drop('BinaryOutcome', axis=1)
y = model_df['BinaryOutcome']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optimize and train the XGBoost model
best_xgb_model = optimize_xgb_classifier(X_train, y_train)



In [13]:
# Evaluate the model on the test set
accuracy = best_xgb_model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.98125


In [14]:
# Find the most important feature based on its importance score
most_important_feature_index = best_xgb_model.feature_importances_.max()

# Print the index of the most important feature
print("Most important position with highest association is :", most_important_feature_index)

Most important position with highest association is : 0.076215856


In [17]:
# Find the indices of features sorted by importance
sorted_feature_indices = np.argsort(best_xgb_model.feature_importances_)[::-1]

# Sort the feature names and importance values based on sorted indices
sorted_feature_names = X.columns[sorted_feature_indices]
sorted_feature_importance = best_xgb_model.feature_importances_[sorted_feature_indices]

# Create a DataFrame to store the sorted feature names and importance values
sorted_feature_df = pd.DataFrame({
    'FeatureName': sorted_feature_names,
    'Importance': sorted_feature_importance
})

In [16]:
# Plot the feature importance scores using Plotly
fig = px.bar(
    x=sorted_feature_df['FeatureName'],
    y=sorted_feature_df['Importance'],
    labels={'x': 'Feature', 'y': 'Importance'},
    title='Feature Importance Scores',
)

# Customize the layout
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title="Feature",
    yaxis_title="Importance",
)

# Show the Plotly figure
fig.show()