In [2]:
%cd ..

import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from xgboost.plotting import plot_importance


import matplotlib.pyplot as plt

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

pd.set_option('display.max_columns', None)

/run/media/nazif/2F946E411BA61D49/thesis


In [3]:
def scale_columns(df, cols):
    """
    Standardize selected columns of a dataframe without touching the input.

    Args:
        df: The dataframe holding the columns to scale.
        cols: The column labels to pass through a freshly fitted ``StandardScaler``.

    Returns:
        A copy of ``df`` in which ``cols`` hold the scaler's output; all other
        columns are carried over unchanged.
    """
    # Fit on, and transform, only the requested columns of the original frame.
    standardized = StandardScaler().fit_transform(df[cols])

    # Write the scaled values into a copy so the caller's frame stays as it was.
    result = df.copy()
    result[cols] = standardized
    return result


def report_performance(model, X, y):
    """
    Score a fitted binary classifier on the given data.

    Args:
        model: A fitted classifier exposing ``predict`` and, ideally, ``predict_proba``.
        X: Feature matrix to predict on.
        y: True binary labels for ``X``.

    Returns:
        dict: ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score`` and ``ROC AUC``.
    """
    # Hard class predictions drive the threshold-based metrics.
    y_pred = model.predict(X)

    # ROC AUC measures how well the model *ranks* samples, so it needs continuous
    # scores. Feeding it 0/1 predictions collapses the curve to a single operating
    # point. Use the positive-class probability when the model can provide it and
    # fall back to the hard predictions otherwise.
    if hasattr(model, "predict_proba"):
        y_score = np.asarray(model.predict_proba(X))[:, 1]
    else:
        y_score = y_pred

    return {
        'Accuracy': accuracy_score(y, y_pred),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1-Score': f1_score(y, y_pred),
        'ROC AUC': roc_auc_score(y, y_score),
    }


def get_feature_importances(model):
    """
    Get feature importances of every column from a model.

    Column names are looked up, in order, from:
      1. ``model.get_booster().feature_names`` when the model has ``get_booster``,
         otherwise ``model.named_steps["preprocessor"].get_feature_names_out()``
         when the model has ``named_steps``;
      2. ``model.feature_names_in_``;
      3. positional placeholders ``f0``, ``f1``, ... (one per importance value).

    Args:
        model: The trained model.

    Returns:
        dict: A dictionary containing the feature importances for each column, with
        column names as keys, ordered from most to least important. Empty when the
        model has no ``feature_importances_`` attribute.
    """
    if not hasattr(model, "feature_importances_"):
        return {}

    importances = model.feature_importances_

    # Primary name source, depending on what kind of object we were handed.
    column_names = None
    if hasattr(model, "get_booster"):
        column_names = model.get_booster().feature_names
    elif hasattr(model, "named_steps"):
        column_names = model.named_steps["preprocessor"].get_feature_names_out()

    # The primary source can be missing or empty (e.g. a booster whose
    # ``feature_names`` is None). Zipping against that would either raise or
    # silently drop every importance, so fall back to other name sources.
    if column_names is None or len(column_names) == 0:
        column_names = getattr(model, "feature_names_in_", None)
    if column_names is None or len(column_names) == 0:
        column_names = [f"f{i}" for i in range(len(importances))]

    # Build the mapping first, then order it by importance, highest first.
    by_name = dict(zip(column_names, importances))
    return dict(sorted(by_name.items(), key=lambda item: item[1], reverse=True))


def compare_importances(model1_importances, model2_importances):
    # Convert dictionaries to DataFrames
    df1 = pd.DataFrame(model1_importances.items(),
                       columns=['Feature', 'Model 1'])
    df2 = pd.DataFrame(model2_importances.items(),
                       columns=['Feature', 'Model 2'])

    # Merge the DataFrames on the 'Feature' column
    merged_df = pd.merge(df1, df2, on='Feature', how='outer')

    # Calculate change and add indicators
    merged_df['Change'] = merged_df.apply(
        lambda row: '+' if row['Model 2'] > row['Model 1'] else ('-' if row['Model 2'] < row['Model 1'] else ''), axis=1)
    merged_df['Importance Change'] = merged_df['Model 2'] - \
        merged_df['Model 1']

    # Sort by Importance Change descending
    merged_df = merged_df.sort_values(by='Importance Change', ascending=False)

    return merged_df.style.apply(
        lambda row: [
            'color:black; background-color: #98ff98'
            if val == '+'
            else (
                'color:black; background-color: #8B0000' if val == '-' else ''
            )
            for val in row
        ],
        axis=1,
    )


def compare_metrics(model1_metrics, model2_metrics):
    # Convert dictionaries to DataFrames
    df1 = pd.DataFrame(model1_metrics.items(), columns=['Metric', 'Model 1'])
    df2 = pd.DataFrame(model2_metrics.items(), columns=['Metric', 'Model 2'])

    # Merge the DataFrames on the 'Metric' column
    merged_df = pd.merge(df1, df2, on='Metric', how='outer')

    # Calculate change and add indicators
    merged_df['Change'] = merged_df.apply(
        lambda row: '+' if row['Model 2'] > row['Model 1'] else ('-' if row['Model 2'] < row['Model 1'] else ''), axis=1)
    merged_df['Change In Value'] = merged_df['Model 2'] - merged_df['Model 1']

    return merged_df.style.apply(
        lambda row: [
            'color:black; background-color: #98ff98'
            if val == '+'
            else (
                'color:black; background-color: #8B0000' if val == '-' else ''
            )
            for val in row
        ],
        axis=1,
    )


def drop_column_and_score(X, y, test_size=0.2, random_state=42):
    """
    Measure how test-set scores change when each feature is left out in turn.

    A baseline ``XGBClassifier`` is trained on all columns, then one additional
    model is trained per column with that column removed. Every model is scored
    on the same held-out rows.

    Args:
        X: Feature dataframe.
        y: Binary target aligned with ``X``.
        test_size: Fraction of rows held out for scoring.
        random_state: Seed for the stratified train/test split.

    Returns:
        pandas.DataFrame: One row per model (``Initial Model`` followed by
        ``Dropped <column>`` for each column) and one column per metric
        (``Accuracy``, ``Precision``, ``Recall``, ``F1 Score``).
    """
    # List of metrics to calculate
    metrics = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1 Score': f1_score
    }

    # Split once. The stratified split depends only on the number of rows, ``y``
    # and the seed, so re-splitting after dropping a column would select exactly
    # the same rows; dropping the column from the already-split frames gives the
    # same data without repeating the work on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    def fit_and_score(train_features, test_features):
        # Train a default XGBoost model and score it on the shared test rows.
        classifier = XGBClassifier()
        classifier.fit(train_features, y_train)
        return calculate_metrics(
            metrics, y_test, classifier.predict(test_features))

    scores = {'Initial Model': fit_and_score(X_train, X_test)}

    # Drop each column one by one and train a new model without it
    for column in X.columns:
        scores[f'Dropped {column}'] = fit_and_score(
            X_train.drop(column, axis=1), X_test.drop(column, axis=1))

    return pd.DataFrame(scores).T


def calculate_metrics(metrics, y_true, y_pred):
    """
    Evaluate every metric function on one set of labels and predictions.

    Args:
        metrics: Mapping of metric name to a callable taking ``(y_true, y_pred)``.
        y_true: The true labels.
        y_pred: The predicted labels.

    Returns:
        dict: Metric name mapped to the value its callable returned, in the
        iteration order of ``metrics``.
    """
    return {name: func(y_true, y_pred) for name, func in metrics.items()}


def stylizer(df):
    # Define a function to apply color based on cell values
    def apply_color(row):
        color = []  # List to store color values for each cell in the row
        for i, cell in enumerate(row):
            # Check if the cell value is equal to the first cell value in the DataFrame
            if cell == df.iloc[0, i]:
                color.append('background-color: black')  # If equal, set background color to black
            else:
                # Calculate the difference between the cell value and the first cell value
                diff = cell - df.iloc[0, i]
                # Calculate the maximum difference in the DataFrame
                max_diff = df.values.max() - df.values.min()
                if diff > 0:
                    # If the difference is positive, calculate the intensity of green color based on the difference
                    intensity = min(1.0, 0.2 + 0.8 * (diff / max_diff))
                    color.append(f'background-color: rgba(0, 255, 0, {intensity:.2f})')  # Green with intensity
                else:
                    # If the difference is negative, calculate the intensity of red color based on the difference
                    intensity = min(1.0, 0.2 - 0.8 * (diff / max_diff))
                    color.append(f'background-color: rgba(255, 0, 0, {intensity:.2f})')  # Red with intensity
        return color

    # Apply the color function to each row of the DataFrame using the `.style.apply()` method
    return df.style.apply(apply_color, axis=1)




def find_most_correlated_features(data, threshold=0.7):
    """
    List feature pairs whose absolute correlation exceeds a threshold.

    Args:
        data: Dataframe whose columns are the features to compare.
        threshold: Minimum absolute correlation (exclusive) for a pair to be kept.

    Returns:
        pandas.Series: Absolute correlations above ``threshold``, sorted from
        highest to lowest and indexed by the pair of feature names.
    """
    abs_corr = data.corr().abs()

    # Keep only the cells strictly above the diagonal so each pair appears once
    # and self-correlations are excluded; everything else becomes NaN.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    unique_pairs = abs_corr.where(above_diagonal)

    # Flatten to one value per pair, rank them, then apply the threshold
    # (NaN cells never pass the comparison and drop out here).
    ranked = unique_pairs.unstack().sort_values(ascending=False)
    return ranked[ranked > threshold]


def get_average_correlations(df: pd.DataFrame) -> pd.Series:
    """
    Compute each column's mean absolute correlation with all columns.

    Args:
        df: Dataframe whose columns are the features to compare.

    Returns:
        pandas.Series: For every column, the mean of its absolute correlations
        taken over the full correlation matrix (the column's self-correlation
        is part of that mean).
    """
    abs_corr = df.corr().abs()
    return abs_corr.mean()


In [4]:
# Load the feature table. The relative path resolves against the working directory
# set by the `%cd ..` in the first cell.
df = pd.read_csv("results/5_features.csv")


In [5]:
# Columns kept for modelling: the input features plus the `label` column, which the
# train/test split cell later separates out as the target.
cols_to_keep = [
    "pred_energy",
    "pred_num_basepairs",
    "pred_seed_basepairs",
    "ta_log10",
    "sps_mean",
    "anchor_a",
    "6mer_seed",
    "match_8",
    "6mer_seed_1_mismatch",
    "compensatory_site",
    "supplementary_site",
    "supplementary_site_2",
    "empty_seed",
    "9_consecutive_match_anywhere",
    "mirna_conservation",


    "seed_8mer",
    "seed_7mer_a1",
    "seed_7mer_m8",
    "seed_compensatory",
    "seed_clash_2",
    "seed_clash_3",
    "seed_clash_4",
    "seed_clash_5",
    "mre_au_content",
    "au_content",
    "label"  # target column
]

# Restrict the frame to the selected columns, in the order listed above.
df = df[cols_to_keep]

In [6]:
# Preview the first rows to sanity-check the column selection.
df.head()

Unnamed: 0,pred_energy,pred_num_basepairs,pred_seed_basepairs,ta_log10,sps_mean,anchor_a,6mer_seed,match_8,6mer_seed_1_mismatch,compensatory_site,supplementary_site,supplementary_site_2,empty_seed,9_consecutive_match_anywhere,mirna_conservation,seed_8mer,seed_7mer_a1,seed_7mer_m8,seed_compensatory,seed_clash_2,seed_clash_3,seed_clash_4,seed_clash_5,mre_au_content,au_content,label
0,-27.7,20,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0,0.590909,0.670732,1
1,-26.2,17,6,3.393,-8.18,1,1,1,0,1,1,0,0,1,2.0,1,0,0,0,1,0,0,0,0.454545,0.634146,1
2,-23.6,19,6,3.393,-8.18,0,1,0,0,1,1,1,0,1,2.0,0,0,0,0,0,0,0,0,0.5,0.585366,1
3,-23.8,19,6,3.393,-8.18,1,1,0,0,1,1,1,0,1,2.0,0,1,0,0,0,0,0,0,0.636364,0.621951,1
4,-24.0,18,6,3.393,-8.18,0,1,1,0,0,0,0,0,1,2.0,0,0,1,0,0,0,0,0,0.5,0.585366,1


# scaling columns (disabled)

In [7]:
# # scaling columns
# cols_to_scale = ["pred_energy", "ta_log10", "sps_mean"]
# df = scale_columns(df, cols_to_scale)

In [8]:
# # scale midpoint using minmax
# minmax = MinMaxScaler(feature_range=(0, 1))
# df["midpoint"] = minmax.fit_transform(df["midpoint"].values.reshape(-1, 1))


# df.head()

In [9]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# The split is seeded and stratified on the target, so it is reproducible and keeps
# the class proportions in both parts.
y = df["label"]
X = df.drop(columns="label")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Baseline classifier: binary logistic objective, every other hyperparameter left at
# the library default. It is fitted on the training split only.
model = XGBClassifier(objective="binary:logistic")
model.fit(X_train, y_train)
# Test-set metrics as a dict (not displayed by this cell).
report = report_performance(model, X_test, y_test)
# Feature name -> importance, sorted descending; used by the bar-chart cell below.
importances = get_feature_importances(model)

In [12]:
# Raw importance values as an array.
# NOTE(review): presumably ordered like the columns of X_train — confirm against the XGBoost docs.
model.feature_importances_

array([0.5925099 , 0.02779814, 0.01254048, 0.04486499, 0.03184418,
       0.00669375, 0.        , 0.00801963, 0.00535922, 0.00789661,
       0.00517622, 0.00598456, 0.03774297, 0.00715259, 0.13452187,
       0.01029878, 0.00493244, 0.0097351 , 0.00119641, 0.0011681 ,
       0.00289158, 0.00177233, 0.00510955, 0.02800726, 0.00678322],
      dtype=float32)

In [11]:
# Leave-one-column-out comparison: trains one baseline model plus one model per
# column of X, so this cell takes much longer than the single fit above.
col_drop_results = drop_column_and_score(X, y)


KeyboardInterrupt: 

In [None]:
# Colour the leave-one-column-out scores against the baseline row and caption the table.
stylized_df = stylizer(col_drop_results).set_caption("Model")
stylized_df

In [None]:
# Pairwise correlation of all features, drawn as an annotated heatmap.
correlation_matrix = X.corr()

fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Feature pairs whose absolute correlation exceeds the default threshold of 0.7.
find_most_correlated_features(X)

In [None]:
# Plot feature importances.
# `importances` is sorted from most to least important, but barh draws its first
# bar at the bottom of the axis. Inverting the y-axis puts the most important
# feature at the top so the chart reads in ranking order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(list(importances.keys()), list(importances.values()))
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
plt.show()


In [None]:

# Mean absolute correlation of each feature, highest first, as a bar chart.
avgcorr = get_average_correlations(X).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(avgcorr.index, avgcorr.values)
ax.set_xlabel('Features')
ax.set_ylabel('Average Correlation')
ax.set_title('Average Correlations of Features (Descending Order)')
# Feature names are long; turn the tick labels vertical so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:

# # Define the hyperparameter grid for grid search
# param_grid = {
#     'learning_rate': [0.1, 0.01, 0.001],                # Default: 0.3
#     'n_estimators': [100, 300, 500],                    # Default: 100
#     'max_depth': [3, 6, 9],                            # Default: 6
#     'min_child_weight': [1, 3, 5],                     # Default: 1
#     'subsample': [0.8, 0.9, 1.0],                     # Default: 1.0
#     'colsample_bytree': [0.8, 0.9, 1.0],              # Default: 1.0
#     'reg_lambda': [1.0, 2.0, 3.0],                    # Default: 1.0
#     'reg_alpha': [0.0, 0.1, 0.5],                     # Default: 0.0
#     'gamma': [0, 0.1, 0.2],                           # Default: 0
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters and model
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# print("Best Hyperparameters:", best_params)
# print("Best Model:", best_model)

# # Evaluate the best model on the testing set
# accuracy = best_model.score(X_test, y_test)  # Replace X_test and y_test with your testing data
# print("Accuracy on Testing Set:", accuracy)

In [None]:
# model_after_cv = XGBClassifier(**grid_search.best_params_)

# model_after_cv.fit(X_train, y_train)
# score_after_cv = model_after_cv.score(X_test, y_test)
# print(f"accuracy after cv: {score_after_cv}")

In [None]:
# percentage_increase = (model_after_cv.score(X_test, y_test) - model.score(X_test, y_test)) / model.score(X_test, y_test) * 100

# print(f"The accuracy increased by {percentage_increase:.2f}% after CV")


In [None]:

# Save the trained model under results/ (relative to the working directory set by
# the `%cd ..` in the first cell).
# NOTE(review): the on-disk format may depend on the file extension and the installed
# XGBoost version — confirm that `.xgb` produces the intended format.
model.save_model('results/model.xgb')
