In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
file_path = './merged_df.csv'  
df = pd.read_csv(file_path)

In [3]:
features = ['Greenhouse_Gas_Score', 'Air_Pollution_Score', 'Combined_Mpg', 'Engine_Displacement_L', 'Engine_Cylinders', 'Fuel', 'Drive']
target = 'Smartway'

#df['Smartway'] = df['Smartway'].map({'No': 0, 'Yes': 1, 'Elite': 2})
categorical_features = ['Fuel', 'Drive']
numerical_features = ['Greenhouse_Gas_Score', 'Air_Pollution_Score', 'Combined_Mpg', 'Engine_Displacement_L', 'Engine_Cylinders']

#df.dropna(subset=[target] + features, inplace=True)

In [4]:
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
transformer = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

log_model = Pipeline([
    ('preprocess', transformer),
    ('classifier', LogisticRegression(max_iter=1000))
])
log_model.fit(X_train, y_train)
log_preds = log_model.predict(X_test)

# Train Decision Tree Model
dt_model = Pipeline([
    ('preprocess', transformer),
    ('classifier', DecisionTreeClassifier(max_depth=5))
])
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)


In [6]:
print("Logistic Regression Performance:")
print(classification_report(y_test, log_preds))
print("Accuracy:", accuracy_score(y_test, log_preds))

print("\nDecision Tree Performance:")
print(classification_report(y_test, dt_preds))
print("Accuracy:", accuracy_score(y_test, dt_preds))


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      4183
           1       0.85      0.86      0.86      1157
           2       0.54      0.41      0.46        37

    accuracy                           0.94      5377
   macro avg       0.78      0.74      0.76      5377
weighted avg       0.94      0.94      0.94      5377

Accuracy: 0.9382555328249953

Decision Tree Performance:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4183
           1       0.96      0.92      0.94      1157
           2       0.93      0.38      0.54        37

    accuracy                           0.97      5377
   macro avg       0.96      0.76      0.82      5377
weighted avg       0.97      0.97      0.97      5377

Accuracy: 0.9739631764924679


Higher accuracy (97.36% vs. 93.66%)

✅ Better precision & recall for most classes

✅ Handles non-linearity better than Logistic Regression

In [7]:
import joblib

# Save the model
joblib.dump(dt_model, "smartway_decision_tree.pkl")


['smartway_decision_tree.pkl']