In [191]:
import numpy as np
import pandas as pd

df =  pd.read_csv('dataset/brain_stroke.csv')
df.dropna(inplace=True)

file_prefix = "plane"  # Change this to any word you like


In [193]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Identify categorical data (change this based on your actual data)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Apply Label Encoding to each categorical column
for column in categorical_cols:
    # Ensure the column is of type object (string) or category
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = le.fit_transform(df[column])
        
df['Y'], unique = pd.factorize(df['Y'])


In [194]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,Y
0,1,67.0,0,1,1,1,1,228.69,36.6,1,0
1,1,80.0,0,1,1,1,0,105.92,32.5,2,0
2,0,49.0,0,0,1,1,1,171.23,34.4,3,0
3,0,79.0,1,0,1,2,0,174.12,24.0,2,0
4,1,81.0,0,0,1,1,1,186.21,29.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
4976,1,41.0,0,0,0,1,0,70.15,29.8,1,1
4977,1,40.0,0,0,1,1,1,191.15,31.1,3,1
4978,0,45.0,1,0,1,0,0,95.02,31.8,3,1
4979,1,40.0,0,0,1,1,0,83.94,30.0,3,1


In [195]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

# Assuming 'df' is your dataframe and 'Y' is your target column
X = df.drop('Y', axis=1)
y = df['Y']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a custom binning function that works on DataFrames
def custom_binning(df):
    return df.apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop'))

# Define transformations
transformations = {
    'No Scaling': FunctionTransformer(lambda x: x),
    'Standard': StandardScaler(),
    'MinMax': MinMaxScaler(),
    'Robust': RobustScaler(),
    'Yeo-Johnson': PowerTransformer(method='yeo-johnson', standardize=True),
    'Binning': FunctionTransformer(custom_binning),
    'Quantile': QuantileTransformer(output_distribution='normal')
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
}

results = []

# Test each combination of transformation and model
for trans_name, transformer in transformations.items():
    for model_name, model in models.items():
        # Create a pipeline
        pipeline = Pipeline([
            ('transformer', transformer),
            ('model', model)
        ])
        
        # Perform cross-validation
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
        mean_cv_score = np.mean(cv_scores)
        
        # Fit on the entire training set and evaluate on test set
        pipeline.fit(X_train, y_train)
        test_score = pipeline.score(X_test, y_test)
        
        results.append({
            'Transformation': trans_name,
            'Model': model_name,
            'CV Mean Accuracy': mean_cv_score,
            'Test Accuracy': test_score
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Extract the "No Scaling" results for comparison
no_scaling_results = results_df[results_df['Transformation'] == 'No Scaling']

# Compare each transformation to "No Scaling"
comparison_results = []
for trans_name in transformations.keys():
    if trans_name == 'No Scaling':
        continue
    trans_results = results_df[results_df['Transformation'] == trans_name]
    for model_name in models.keys():
        no_scaling_cv = no_scaling_results[no_scaling_results['Model'] == model_name]['CV Mean Accuracy'].values[0]
        trans_cv = trans_results[trans_results['Model'] == model_name]['CV Mean Accuracy'].values[0]
        no_scaling_test = no_scaling_results[no_scaling_results['Model'] == model_name]['Test Accuracy'].values[0]
        trans_test = trans_results[trans_results['Model'] == model_name]['Test Accuracy'].values[0]
        cv_improvement = trans_cv - no_scaling_cv
        test_improvement = trans_test - no_scaling_test
        comparison_results.append({
            'Transformation': trans_name,
            'Model': model_name,
            'No Scaling CV Accuracy': no_scaling_cv,
            'Transformation CV Accuracy': trans_cv,
            'CV Improvement': cv_improvement,
            'No Scaling Test Accuracy': no_scaling_test,
            'Transformation Test Accuracy': trans_test,
            'Test Improvement': test_improvement
        })

# Convert comparison results to a DataFrame
comparison_df = pd.DataFrame(comparison_results)

# Compute average improvement for each transformation
average_improvement = comparison_df.groupby('Transformation')[['CV Improvement', 'Test Improvement']].mean().reset_index()
average_improvement = average_improvement.sort_values('CV Improvement', ascending=False)

# Display the average improvement
print("Average Improvement by Transformation:")
print(average_improvement)

# Display the top 10 results based on CV Improvement
comparison_df = comparison_df.sort_values('CV Improvement', ascending=False)
print("\nTop 10 Model-Transformation Combinations (based on CV Improvement):")
print(comparison_df[['Transformation', 'Model', 'CV Improvement', 'Test Improvement']].head(10))

# Get the best transformation and model combination
best_combination = comparison_df.iloc[0]
print(f"\nBest Combination:")
print(f"Transformation: {best_combination['Transformation']}")
print(f"Model: {best_combination['Model']}")
print(f"No Scaling CV Accuracy: {best_combination['No Scaling CV Accuracy']:.4f}")
print(f"Transformation CV Accuracy: {best_combination['Transformation CV Accuracy']:.4f}")
print(f"CV Improvement: {best_combination['CV Improvement']:.4f}")
print(f"No Scaling Test Accuracy: {best_combination['No Scaling Test Accuracy']:.4f}")
print(f"Transformation Test Accuracy: {best_combination['Transformation Test Accuracy']:.4f}")
print(f"Test Improvement: {best_combination['Test Improvement']:.4f}")

[LightGBM] [Info] Number of positive: 3032, number of negative: 155
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 625
[LightGBM] [Info] Number of data points in the train set: 3187, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.951365 -> initscore=2.973553
[LightGBM] [Info] Start training from score 2.973553
[LightGBM] [Info] Number of positive: 3032, number of negative: 155
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 625
[LightGBM] [Info] Number of data points in the train set: 3187, number of used features: 10
[LightGBM] [Info] [binary:Boos

In [196]:
# New cell

import os


# Add rank column to average_improvement
average_improvement['Rank'] = average_improvement['CV Improvement'].rank(ascending=False, method='min')
average_improvement = average_improvement.sort_values('Rank')

# Create directory if it doesn't exist
os.makedirs('result/transformation', exist_ok=True)

# Save average_improvement as CSV
average_improvement.to_csv(f'result/transformation/classification/{file_prefix}_average.csv', index=False)

# Save comparison_df as CSV
comparison_df.to_csv(f'result/transformation/classification/{file_prefix}_compared.csv', index=False)

print(f"Files saved successfully in the 'result/transformation' directory with prefix '{file_prefix}'.")


Files saved successfully in the 'result/transformation' directory with prefix 'stroke'.
