In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load and preprocess data
data = pd.read_csv("/kaggle/input/toxicity/data.csv")
X = data.drop('Class', axis=1) if 'Class' in data.columns else data.iloc[:, :-1]
y = data['Class'] if 'Class' in data.columns else data.iloc[:, -1]
y_binary = (y == 'NonToxic').astype(int)

In [3]:
# Shuffle and split
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X))
X_shuffled, y_shuffled = X.iloc[shuffle_idx].reset_index(drop=True), y_binary.iloc[shuffle_idx].reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X_shuffled, y_shuffled, test_size=0.2, random_state=42, stratify=y_shuffled)

In [4]:
# Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(penalty=None, max_iter=2000, solver='lbfgs'),
    'Ridge LR (L2)': LogisticRegression(penalty='l2', max_iter=2000, solver='lbfgs', C=1.0),
    'Lasso LR (L1)': LogisticRegression(penalty='l1', max_iter=2000, solver='saga', C=1.0),
    'Elastic Net LR': LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=2000, C=1.0),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'XGBoost': XGBClassifier(random_state=42, scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]), max_depth=3, n_estimators=100),
    'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=1000, random_state=42, early_stopping=True, solver='lbfgs')
}

In [6]:
# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, use_rfecv=False):
    # Optional RFECV for feature selection
    if use_rfecv:
        rfecv = RFECV(model, step=1, cv=5, scoring='accuracy', n_jobs=-1)
        rfecv.fit(X_train, y_train)
        X_train_selected = rfecv.transform(X_train)
        X_test_selected = rfecv.transform(X_test)
        model.fit(X_train_selected, y_train)
    else:
        X_train_selected, X_test_selected = X_train, X_test
        model.fit(X_train_selected, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train_selected)
    y_test_pred = model.predict(X_test_selected)
    
    # Probabilities
    if hasattr(model, 'predict_proba'):
        y_train_proba = model.predict_proba(X_train_selected)[:, 1]
        y_test_proba = model.predict_proba(X_test_selected)[:, 1]
    else:
        y_train_proba = model.decision_function(X_train_selected)
        y_test_proba = model.decision_function(X_test_selected)
    
    return {
        'train_acc': accuracy_score(y_train, y_train_pred),
        'test_acc': accuracy_score(y_test, y_test_pred),
        'train_auc': roc_auc_score(y_train, y_train_proba),
        'test_auc': roc_auc_score(y_test, y_test_proba),
        'precision': precision_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred),
        'f1': f1_score(y_test, y_test_pred)
    }



In [7]:
# Train and evaluate all models
results = []
for name, model in models.items():
    print(f"Training {name}...")
    # Use RFECV only for linear models (optional)
    use_rfecv = 'LR' in name or name == 'Logistic Regression'
    metrics = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, use_rfecv=False)
    metrics['model'] = name
    results.append(metrics)
    print(f"  Test Accuracy: {metrics['test_acc']:.4f}, Test AUC: {metrics['test_auc']:.4f}")

Training Logistic Regression...
  Test Accuracy: 0.5714, Test AUC: 0.5871
Training Ridge LR (L2)...
  Test Accuracy: 0.5714, Test AUC: 0.5644
Training Lasso LR (L1)...
  Test Accuracy: 0.6000, Test AUC: 0.5455
Training Elastic Net LR...
  Test Accuracy: 0.6000, Test AUC: 0.5417
Training Decision Tree...
  Test Accuracy: 0.6000, Test AUC: 0.5852
Training XGBoost...
  Test Accuracy: 0.5429, Test AUC: 0.5303
Training SVM (RBF)...
  Test Accuracy: 0.6857, Test AUC: 0.3826
Training KNeighbors...
  Test Accuracy: 0.6571, Test AUC: 0.7027
Training Neural Network...
  Test Accuracy: 0.6286, Test AUC: 0.5360


In [8]:
# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df[['model', 'train_acc', 'test_acc', 'train_auc', 'test_auc', 'precision', 'recall', 'f1']]
results_df = results_df.sort_values('test_acc', ascending=False).reset_index(drop=True)

print("\n=== MODEL COMPARISON ===")
print(results_df.to_string(index=False))


=== MODEL COMPARISON ===
              model  train_acc  test_acc  train_auc  test_auc  precision   recall       f1
          SVM (RBF)   0.727941  0.685714   0.010501  0.382576   0.685714 1.000000 0.813559
         KNeighbors   0.713235  0.657143   0.757021  0.702652   0.714286 0.833333 0.769231
     Neural Network   1.000000  0.628571   1.000000  0.535985   0.739130 0.708333 0.723404
      Lasso LR (L1)   0.992647  0.600000   0.999267  0.545455   0.708333 0.708333 0.708333
     Elastic Net LR   0.992647  0.600000   1.000000  0.541667   0.708333 0.708333 0.708333
      Decision Tree   1.000000  0.600000   1.000000  0.585227   0.750000 0.625000 0.681818
Logistic Regression   1.000000  0.571429   1.000000  0.587121   0.714286 0.625000 0.666667
      Ridge LR (L2)   1.000000  0.571429   1.000000  0.564394   0.695652 0.666667 0.680851
            XGBoost   1.000000  0.542857   1.000000  0.530303   0.642857 0.750000 0.692308


In [9]:
# ===== FEATURE IMPORTANCE ANALYSIS =====
from sklearn.inspection import permutation_importance

# Original study's important features
original_features = ['MDEC-23', 'MATS2v', 'ATSC8s', 'VE3_Dt', 'CrippenMR', 'SpMax7_Bhe',
                     'SpMin1_Bhs', 'C1SP2', 'GATS8e', 'GATS8s', 'SpMax5_Bhv', 'VE3_Dzi', 'VPC-4']

feature_names = X.columns.tolist()

def get_feature_importance(model, model_name, X_train, X_test, y_train, y_test):
    """Extract feature importance for different model types"""
    
    # Tree-based models: use built-in feature_importances_
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        method = "Built-in (Impurity-based)"
    
    # Linear models: use absolute coefficient values
    elif hasattr(model, 'coef_'):
        importances = np.abs(model.coef_[0])
        method = "Coefficients"
    
    # Other models: use permutation importance
    else:
        perm_importance = permutation_importance(
            model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
        )
        importances = perm_importance.importances_mean
        method = "Permutation"
    
    # Create DataFrame with feature importance
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    return importance_df, method

# Extract feature importance for each trained model
print("\n" + "="*80)
print("FEATURE IMPORTANCE COMPARISON WITH ORIGINAL STUDY")
print("="*80)
print(f"\nOriginal study identified {len(original_features)} important features using DTC:")
print(original_features)
print("\n" + "-"*80)


FEATURE IMPORTANCE COMPARISON WITH ORIGINAL STUDY

Original study identified 13 important features using DTC:
['MDEC-23', 'MATS2v', 'ATSC8s', 'VE3_Dt', 'CrippenMR', 'SpMax7_Bhe', 'SpMin1_Bhs', 'C1SP2', 'GATS8e', 'GATS8s', 'SpMax5_Bhv', 'VE3_Dzi', 'VPC-4']

--------------------------------------------------------------------------------


In [10]:

feature_comparison = {}

for name, model in models.items():
    print(f"\n### {name} ###")
    
    # Get feature importance
    importance_df, method = get_feature_importance(model, name, X_train_scaled, X_test_scaled, y_train, y_test)
    
    # Get top 13 features (same number as original study)
    top_13 = importance_df.head(13)
    top_13_features = top_13['feature'].tolist()
    
    # Calculate overlap with original study
    overlap = set(top_13_features) & set(original_features)
    overlap_count = len(overlap)
    overlap_pct = (overlap_count / len(original_features)) * 100
    
    print(f"Method: {method}")
    print(f"\nTop 13 Features:")
    print(top_13.to_string(index=False))
    print(f"\nOverlap with original study: {overlap_count}/{len(original_features)} ({overlap_pct:.1f}%)")
    if overlap:
        print(f"Matching features: {sorted(overlap)}")
    
    feature_comparison[name] = {
        'top_13': top_13_features,
        'overlap_count': overlap_count,
        'overlap_features': sorted(overlap),
        'method': method
    }


### Logistic Regression ###
Method: Coefficients

Top 13 Features:
        feature  importance
           JGI7    4.427035
       maxsssCH    4.049126
        nHBint3    3.663130
      topoShape    3.568342
         ALogp2    3.528732
         WTPT-2    3.461760
PetitjeanNumber    3.400879
          C3SP3    3.381391
      minHBint4    3.334651
      maxHBint4    3.254575
        minssNH    3.167231
        maxssNH    3.122925
        nF6Ring    3.050619

Overlap with original study: 0/13 (0.0%)

### Ridge LR (L2) ###
Method: Coefficients

Top 13 Features:
  feature  importance
   ALogp2    0.485300
 BCUTw-1l    0.450650
 maxsssCH    0.418850
    minsF    0.418486
    maxsF    0.407373
    nBase    0.396368
  minssNH    0.381735
   MATS1s    0.373555
  maxssNH    0.368521
maxHBint5    0.357965
  VE3_Dzs    0.346970
    C3SP3    0.345447
minHCsatu    0.336533

Overlap with original study: 0/13 (0.0%)

### Lasso LR (L1) ###
Method: Coefficients

Top 13 Features:
  feature  importance
  

In [11]:
# Summary comparison table
print("\n" + "="*80)
print("SUMMARY: OVERLAP WITH ORIGINAL STUDY")
print("="*80)
summary_df = pd.DataFrame({
    'Model': list(feature_comparison.keys()),
    'Overlap Count': [v['overlap_count'] for v in feature_comparison.values()],
    'Overlap %': [(v['overlap_count']/13)*100 for v in feature_comparison.values()],
    'Method': [v['method'] for v in feature_comparison.values()]
}).sort_values('Overlap Count', ascending=False)

print(summary_df.to_string(index=False))



SUMMARY: OVERLAP WITH ORIGINAL STUDY
              Model  Overlap Count  Overlap %                    Method
     Neural Network              1   7.692308               Permutation
Logistic Regression              0   0.000000              Coefficients
      Ridge LR (L2)              0   0.000000              Coefficients
      Lasso LR (L1)              0   0.000000              Coefficients
     Elastic Net LR              0   0.000000              Coefficients
      Decision Tree              0   0.000000 Built-in (Impurity-based)
            XGBoost              0   0.000000 Built-in (Impurity-based)
          SVM (RBF)              0   0.000000               Permutation
         KNeighbors              0   0.000000               Permutation


In [12]:

# Find features commonly selected across multiple models
print("\n" + "="*80)
print("FEATURES SELECTED BY MULTIPLE MODELS (in top 13)")
print("="*80)

all_top_features = []
for comp in feature_comparison.values():
    all_top_features.extend(comp['top_13'])

feature_counts = pd.Series(all_top_features).value_counts()
frequent_features = feature_counts[feature_counts >= 3]  # Selected by at least 3 models

if len(frequent_features) > 0:
    print(f"\nFeatures selected by 3+ models:")
    for feat, count in frequent_features.items():
        in_original = "✓" if feat in original_features else " "
        print(f"  [{in_original}] {feat}: {count}/{len(models)} models")
else:
    print("No features were consistently selected across 3+ models")



FEATURES SELECTED BY MULTIPLE MODELS (in top 13)

Features selected by 3+ models:
  [ ] minssNH: 4/9 models
  [ ] ALogp2: 4/9 models
  [ ] C3SP3: 4/9 models
  [ ] maxssNH: 4/9 models
  [ ] JGI7: 3/9 models
  [ ] maxsssCH: 3/9 models
  [ ] minHCsatu: 3/9 models
  [ ] MATS1s: 3/9 models
  [ ] minsF: 3/9 models
  [ ] nBase: 3/9 models
  [ ] maxHBint4: 3/9 models
  [ ] minHBint4: 3/9 models


In [13]:
# Save detailed comparison
comparison_results = []
for model_name, comp in feature_comparison.items():
    for i, feat in enumerate(comp['top_13'], 1):
        comparison_results.append({
            'model': model_name,
            'rank': i,
            'feature': feat,
            'in_original_study': feat in original_features
        })

comparison_df = pd.DataFrame(comparison_results)
comparison_df

Unnamed: 0,model,rank,feature,in_original_study
0,Logistic Regression,1,JGI7,False
1,Logistic Regression,2,maxsssCH,False
2,Logistic Regression,3,nHBint3,False
3,Logistic Regression,4,topoShape,False
4,Logistic Regression,5,ALogp2,False
...,...,...,...,...
112,Neural Network,9,maxsF,False
113,Neural Network,10,JGI4,False
114,Neural Network,11,ETA_BetaP_ns_d,False
115,Neural Network,12,ETA_Beta_ns_d,False
