In [78]:
import numpy as np
import pandas as pd

file_prefix = "plane"  # Change this to any word you like

df =  pd.read_csv(f'Dataset/{file_prefix}.csv')
df.dropna(inplace=True)

In [80]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Identify categorical data (change this based on your actual data)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Apply Label Encoding to each categorical column
for column in categorical_cols:
    # Ensure the column is of type object (string) or category
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        df[column] = le.fit_transform(df[column])
        


In [81]:
df

Unnamed: 0,Model Name,Engine Type,HP or lbs thr ea engine,Max speed Knots,Rcmnd cruise Knots,Stall Knots dirty,Fuel gal/lbs,All eng rate of climb,Eng out rate of climb,Takeoff over 50ft,Landing over 50ft,Empty weight lbs,Length ft/in,Wing span ft/in,Range N.M.,Y
0,0,1,11,5,91.0,46.0,36,147,900.0,1300.0,40,14,40,44,107,1300000.0
1,85,1,79,171,83.0,44.0,15,165,720.0,800.0,5,333,8,33,66,1230000.0
2,0,1,81,172,78.0,37.0,19,169,475.0,850.0,3,331,14,24,81,1600000.0
3,84,1,79,170,78.0,37.0,19,166,500.0,850.0,3,330,14,24,81,1300000.0
4,0,1,67,169,74.0,33.0,14,131,632.0,885.0,0,317,14,24,63,1250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,68,2,8,129,265.0,64.0,402,75,2300.0,1830.0,164,278,130,92,71,3000000.0
513,0,2,73,142,288.0,89.0,578,103,2280.0,3043.0,20,298,126,85,52,3584000.0
514,69,2,73,135,275.0,87.0,390,101,3230.0,3017.0,20,288,126,85,44,3200000.0
515,49,2,65,131,270.0,77.0,382,80,2940.0,2446.0,159,273,113,65,34,3240000.0


In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline

# Assuming 'df' is your dataframe and 'Y' is your target column
X = df.drop('Y', axis=1)
y = df['Y']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a custom binning function that works on DataFrames
def custom_binning(df):
    return df.apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop'))

# Define transformations
transformations = {
    'No Scaling': FunctionTransformer(lambda x: x),
    'Standard': StandardScaler(),
    'MinMax': MinMaxScaler(),
    'Robust': RobustScaler(),
    'Yeo-Johnson': PowerTransformer(method='yeo-johnson', standardize=True),
    'Binning': FunctionTransformer(custom_binning),
    'Quantile': QuantileTransformer(output_distribution='normal')
}

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
}

results = []

# Test each combination of transformation and model
for trans_name, transformer in transformations.items():
    for model_name, model in models.items():
        # Create a pipeline
        pipeline = Pipeline([
            ('transformer', transformer),
            ('model', model)
        ])
        
        # Perform cross-validation
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
        mean_cv_score = np.mean(cv_scores)
        
        # Fit on the entire training set and evaluate on test set
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        test_score = r2_score(y_test, y_pred)
        
        results.append({
            'Transformation': trans_name,
            'Model': model_name,
            'CV Mean R2': mean_cv_score,
            'Test R2': test_score
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Extract the "No Scaling" results for comparison
no_scaling_results = results_df[results_df['Transformation'] == 'No Scaling']

# Compare each transformation to "No Scaling"
comparison_results = []
for trans_name in transformations.keys():
    if trans_name == 'No Scaling':
        continue
    trans_results = results_df[results_df['Transformation'] == trans_name]
    for model_name in models.keys():
        no_scaling_cv = no_scaling_results[no_scaling_results['Model'] == model_name]['CV Mean R2'].values[0]
        trans_cv = trans_results[trans_results['Model'] == model_name]['CV Mean R2'].values[0]
        no_scaling_test = no_scaling_results[no_scaling_results['Model'] == model_name]['Test R2'].values[0]
        trans_test = trans_results[trans_results['Model'] == model_name]['Test R2'].values[0]
        cv_improvement = trans_cv - no_scaling_cv
        test_improvement = trans_test - no_scaling_test
        comparison_results.append({
            'Transformation': trans_name,
            'Model': model_name,
            'No Scaling CV R2': no_scaling_cv,
            'Transformation CV R2': trans_cv,
            'CV Improvement': cv_improvement,
            'No Scaling Test R2': no_scaling_test,
            'Transformation Test R2': trans_test,
            'Test Improvement': test_improvement
        })

# Convert comparison results to a DataFrame
comparison_df = pd.DataFrame(comparison_results)

# Compute average improvement for each transformation
average_improvement = comparison_df.groupby('Transformation')[['CV Improvement', 'Test Improvement']].mean().reset_index()
average_improvement = average_improvement.sort_values('CV Improvement', ascending=False)

# Display the average improvement
print("Average Improvement by Transformation:")
print(average_improvement)

# Display the top 10 results based on CV Improvement
comparison_df = comparison_df.sort_values('CV Improvement', ascending=False)
print("\nTop 10 Model-Transformation Combinations (based on CV Improvement):")
print(comparison_df[['Transformation', 'Model', 'CV Improvement', 'Test Improvement']].head(10))

# Get the best transformation and model combination
best_combination = comparison_df.iloc[0]
print(f"\nBest Combination:")
print(f"Transformation: {best_combination['Transformation']}")
print(f"Model: {best_combination['Model']}")
print(f"No Scaling CV R2: {best_combination['No Scaling CV R2']:.4f}")
print(f"Transformation CV R2: {best_combination['Transformation CV R2']:.4f}")
print(f"CV Improvement: {best_combination['CV Improvement']:.4f}")
print(f"No Scaling Test R2: {best_combination['No Scaling Test R2']:.4f}")
print(f"Transformation Test R2: {best_combination['Transformation Test R2']:.4f}")
print(f"Test Improvement: {best_combination['Test Improvement']:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 268, number of used features: 15
[LightGBM] [Info] Start training from score 2290827.238806
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 894
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 15
[LightGBM] [Info] Start training from score 2290755.762082
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 898
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 15
[LightGBM] [Info] Start 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 912
[LightGBM] [Info] Number of data points in the train set: 268, number of used features: 15
[LightGBM] [Info] Start training from score 2290827.238806
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 15
[LightGBM] [Info] Start training from score 2290755.762082
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 15
[LightGBM] [Info] Start 



In [83]:
# New cell

import os

# Add rank column to average_improvement
average_improvement['Rank'] = average_improvement['CV Improvement'].rank(ascending=False, method='min')
average_improvement = average_improvement.sort_values('Rank')

# Create directory if it doesn't exist
os.makedirs('result/transformation', exist_ok=True)

# Save average_improvement as CSV
average_improvement.to_csv(f'result/transformation/regression/{file_prefix}_average.csv', index=False)

# Save comparison_df as CSV
comparison_df.to_csv(f'result/transformation/regression/{file_prefix}_compared.csv', index=False)

print(f"Files saved successfully in the 'result/transformation' directory with prefix '{file_prefix}'.")


Files saved successfully in the 'result/transformation' directory with prefix 'plane'.
