In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline


In [65]:
# Load the ship fuel-efficiency dataset from a path relative to the working directory
df = pd.read_csv('Data/ship_fuel_efficiency.csv')

In [66]:
# Inspect the column names available in the loaded dataset
df.columns

Index(['ship_id', 'ship_type', 'route_id', 'month', 'distance', 'fuel_type',
       'fuel_consumption', 'CO2_emissions', 'weather_conditions',
       'engine_efficiency', 'fuel_efficiency', 'fuel_usage'],
      dtype='object')

In [67]:
# Remove the columns that are not kept for the rest of the notebook
unused_columns = ['fuel_consumption', 'CO2_emissions', 'fuel_efficiency', 'engine_efficiency']
df = df.drop(columns=unused_columns)

In [68]:
# Preview the first rows of the remaining columns
df.head()

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,weather_conditions,fuel_usage
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,Stormy,3
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,Moderate,3
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,Calm,2
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,Stormy,2
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,Calm,3


In [69]:
#making weather conditions numerical
def weahther_cat(weather_conditions):
    """Map a weather-condition label to an ordinal rating.

    Parameters
    ----------
    weather_conditions : str
        Weather label; 'Calm', 'Moderate' and 'Stormy' are recognised.

    Returns
    -------
    int
        1 for 'Calm', 2 for 'Moderate', 3 for 'Stormy', and 0 for any
        other value (including missing values such as NaN).
    """
    # Exact-match lookup rather than `<=` comparisons: ordering strings
    # lexicographically would rate unknown labels by where they sort
    # (e.g. 'Breezy' -> 1, 'Foggy' -> 2) and would raise TypeError on a
    # non-string such as NaN, instead of falling through to 0.
    ratings = {'Calm': 1, 'Moderate': 2, 'Stormy': 3}
    return ratings.get(weather_conditions, 0)


# Attach the numeric rating as a new column, then preview the result
df['weather_rating'] = df['weather_conditions'].map(weahther_cat)
df.head()

Unnamed: 0,ship_id,ship_type,route_id,month,distance,fuel_type,weather_conditions,fuel_usage,weather_rating
0,NG001,Oil Service Boat,Warri-Bonny,January,132.26,HFO,Stormy,3,3
1,NG001,Oil Service Boat,Port Harcourt-Lagos,February,128.52,HFO,Moderate,3,2
2,NG001,Oil Service Boat,Port Harcourt-Lagos,March,67.3,HFO,Calm,2,1
3,NG001,Oil Service Boat,Port Harcourt-Lagos,April,71.68,Diesel,Stormy,2,3
4,NG001,Oil Service Boat,Lagos-Apapa,May,134.32,HFO,Calm,3,1


In [70]:
# The text label has been encoded as weather_rating, so remove the original column
df = df.drop(columns=['weather_conditions'])

In [71]:
# Separate the prediction target from the feature columns
target_column = 'fuel_usage'
y = df[target_column]
X = df.drop(columns=[target_column])

In [72]:
# Check dtypes and non-null counts of the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ship_id         1440 non-null   object 
 1   ship_type       1440 non-null   object 
 2   route_id        1440 non-null   object 
 3   month           1440 non-null   object 
 4   distance        1440 non-null   float64
 5   fuel_type       1440 non-null   object 
 6   fuel_usage      1440 non-null   int64  
 7   weather_rating  1440 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 90.1+ KB


In [73]:
# Column groups: string-valued columns vs. numeric columns
categorical_features = [
    'ship_id',
    'ship_type',
    'route_id',
    'month',
    'fuel_type',
]
numeric_features = ['distance', 'weather_rating']

In [74]:
# Preprocessing steps: a standard scaler and a one-hot encoder.
# handle_unknown='ignore' makes categories not seen during fit encode as all
# zeros at transform time instead of raising an error.
categoric_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

In [75]:
# Route each column group through its own transformer
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categoric_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [76]:
# Random forest regressor with a fixed random_state so training is repeatable
model = RandomForestRegressor(random_state=42)

In [77]:
# Chain preprocessing and the estimator so they are fitted and applied together
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
pipeline = Pipeline(steps=pipeline_steps)

In [78]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [79]:
# Fit the preprocessing steps and the model on the training split only
pipeline.fit(X_train, y_train)

In [84]:
# Single hand-built record to pass to the pipeline for prediction
new_data = pd.DataFrame({
    'ship_id': ['NG002'],
    'ship_type': ['Container Ship'],
    # Must match the dataset's spelling ('Warri-Bonny'): the one-hot encoder is
    # configured with handle_unknown='ignore', so a misspelled route would be
    # silently encoded as all zeros rather than raising.
    'route_id': ['Warri-Bonny'],
    'month': ['April'],
    'distance': [121.80],
    'fuel_type': ['Diesel'],
    'weather_rating': [1],
})

In [87]:
# Print the column labels of the new record and of the training features
# for a visual comparison (dtypes are not checked here)
for label, frame in (("New Data Column Names:", new_data), ("Expected Column Names:", X)):
    print(label, frame.columns)

New Data Column Names: Index(['ship_id', 'ship_type', 'route_id', 'month', 'distance', 'fuel_type',
       'weather_rating'],
      dtype='object')
Expected Column Names: Index(['ship_id', 'ship_type', 'route_id', 'month', 'distance', 'fuel_type',
       'weather_rating'],
      dtype='object')


In [89]:
# Predict fuel usage for the new record with the fitted pipeline.
# NOTE(review): the recorded "TypeError: Cannot convert numpy.ndarray to
# numpy.ndarray" does not look like a column-name problem (the printed names
# match); it is presumably a NumPy / compiled-extension version mismatch in the
# running kernel — confirm by restarting the kernel and running all cells, or by
# reinstalling mutually compatible numpy / pandas / scikit-learn versions.
fueling = pipeline.predict(new_data)

TypeError: Cannot convert numpy.ndarray to numpy.ndarray