In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('ship_fuel_efficiency.csv')
df.head()

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,fuel_consumption,CO2_emissions,weather_conditions,engine_efficiency
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,3779.77,10625.76,Stormy,92.14
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,4461.44,12779.73,Moderate,92.98
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,1867.73,5353.01,Calm,87.61
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,2393.51,6506.52,Stormy,87.42
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,4267.19,11617.03,Calm,85.61


In [2]:
# Encoding categorical variables using Label Encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['ship_type'] = le.fit_transform(df['ship_type'])
df['route_id'] = le.fit_transform(df['route_id'])
df['month'] = le.fit_transform(df['month'])
df['fuel_type'] = le.fit_transform(df['fuel_type'])
df['weather_conditions'] = le.fit_transform(df['weather_conditions'])

In [3]:
# feature seperation
x = df[['ship_type', 'route_id', 'month', 'distance', 'fuel_type', 'weather_conditions', 'engine_efficiency']]
y = df['CO2_emissions']

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [4]:
# Train Linear Regression model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
y_pred_lr = lr_model.predict(x_test)

# model evaluation
from sklearn.metrics import mean_squared_error, r2_score
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression MSE:", mse_lr)
print("Linear Regression R2 Score:", r2_lr)

Linear Regression MSE: 40910820.884758905
Linear Regression R2 Score: 0.7749299628160684


In [5]:
# Random forest regressor
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 0)
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# model evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("Random Forest MSE:", mse_rf)
print("Random Forest R2 Score:", r2_rf)

Random Forest MSE: 16194172.396872057
Random Forest R2 Score: 0.9109080946140377


In [6]:
# gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 1, random_state = 0)
gb_model.fit(x_train, y_train)
y_pred_gb = gb_model.predict(x_test)

# model evaluation
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("Gradient Boosting MSE:", mse_gb)
print("Gradient Boosting R2 Score:", r2_gb)

Gradient Boosting MSE: 15280503.874621272
Gradient Boosting R2 Score: 0.915934623141931


In [7]:
# XGBoost Model
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 1, random_state = 0)
xgb_model.fit(x_train, y_train)
y_pred_xgb = xgb_model.predict(x_test)

# model evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost MSE:", mse_xgb)
print("XGBoost R2 Score:", r2_xgb)

XGBoost MSE: 14988418.322276665
XGBoost R2 Score: 0.9175415257829772


In [8]:
# LightGBM Model
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, verbosity=-1)
lgbm_model.fit(x_train, y_train)
y_pred_lgbm = lgbm_model.predict(x_test)

# model evaluation
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)
print("LightGBM MSE:", mse_lgbm)
print("LightGBM R2 Score:", r2_lgbm)

LightGBM MSE: 15089195.434482753
LightGBM R2 Score: 0.9169871025790184


In [10]:
# Cross validation for LightGBM
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lgbm_model, x_train, y_train, cv=5)
print("Cross-validation scores for LightGBM: ", scores)
print("Mean cross-validation score for LightGBM: ", scores.mean())

Cross-validation scores for LightGBM:  [0.93203048 0.9076252  0.90880021 0.90542138 0.89519516]
Mean cross-validation score for LightGBM:  0.9098144853786799


In [11]:
# Cross validation for XGBoost
scores = cross_val_score(xgb_model, x_train, y_train, cv=5)
print("Cross-validation scores for XGBoost: ", scores)
print("Mean cross-validation score for XGBoost: ", scores.mean())

Cross-validation scores for XGBoost:  [0.92825355 0.90805529 0.90848636 0.90081911 0.89415889]
Mean cross-validation score for XGBoost:  0.9079546397014887


In [12]:
# Cross validation for Gradient Boosting
scores = cross_val_score(gb_model, x_train, y_train, cv=5)
print("Cross-validation scores for Gradient Boosting: ", scores)
print("Mean cross-validation score for Gradient Boosting: ", scores.mean())


Cross-validation scores for Gradient Boosting:  [0.92918849 0.90708651 0.9081323  0.89888326 0.89337998]
Mean cross-validation score for Gradient Boosting:  0.9073341091712027


In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'subsample': [0.5, 0.7, 0.9, 1.0],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Instantiate the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=0)

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings to sample
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    random_state=0,
    n_jobs=-1  # Use all available processors
)

# Fit RandomizedSearchCV
random_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Use the best estimator to make predictions
best_gb_model = random_search.best_estimator_
y_pred_gb = best_gb_model.predict(x_test)

# Model evaluation
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("Tuned Gradient Boosting MSE:", mse_gb)
print("Tuned Gradient Boosting R2 Score:", r2_gb)

Best Parameters: {'subsample': 0.5, 'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 2, 'learning_rate': 0.05}
Tuned Gradient Boosting MSE: 14505068.767174607
Tuned Gradient Boosting R2 Score: 0.9202006633897757


In [16]:
import joblib

# Save the Gradient Boosting model, StandardScaler, and PCA
joblib.dump(best_gb_model, 'model_co2.joblib') 
joblib.dump(sc, 'scaler_co2.joblib')  
joblib.dump(pca, 'pca_co2.joblib')  

print("✅ Gradient Boosting model, StandardScaler, and PCA saved successfully!")

✅ Gradient Boosting model, StandardScaler, and PCA saved successfully!
