In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib

# Load the dataset
df = pd.read_csv("measurements.csv")
print(df.head())

  distance consume  speed temp_inside  temp_outside specials gas_type  AC  \
0       28       5     26        21,5            12      NaN      E10   0   
1       12     4,2     30        21,5            13      NaN      E10   0   
2     11,2     5,5     38        21,5            15      NaN      E10   0   
3     12,9     3,9     36        21,5            14      NaN      E10   0   
4     18,5     4,5     46        21,5            15      NaN      E10   0   

   rain  sun refill liters refill gas  
0     0    0            45        E10  
1     0    0           NaN        NaN  
2     0    0           NaN        NaN  
3     0    0           NaN        NaN  
4     0    0           NaN        NaN  


In [17]:
# Drop unnecessary columns
df.drop(['refill gas', 'refill liters', 'specials'], axis=1, inplace=True)

In [18]:
# Iterate through all columns and replace commas with periods for numeric representations
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column is of object type (likely string)
        df[col] = df[col].str.replace(',', '.', regex=True)

In [19]:
# Convert numeric columns to float after replacing commas
for col in df.select_dtypes(include=['object']).columns:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass  # Handle columns that cannot be converted to float

In [20]:
# Fill missing values in 'temp_inside' column with mean
temp_inside_mean = df['temp_inside'].mean()
df['temp_inside'].fillna(temp_inside_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['temp_inside'].fillna(temp_inside_mean, inplace=True)


In [21]:
# Convert categorical variable 'gas_type' into dummy variables
dum1 = pd.get_dummies(df['gas_type'])
df = pd.concat([df, dum1], axis=1)
df.drop('gas_type', axis=1, inplace=True)

In [36]:
dum1

Unnamed: 0,E10,SP98
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False
...,...,...
383,False,True
384,False,True
385,False,True
386,False,True


In [22]:
# Split the dataset into features and target variables
x = df.drop('consume', axis=1).values
y = df['consume'].values

In [41]:
x.shape

(388, 9)

In [23]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [34]:
x_train

array([[12.3, 62, 21.5, ..., 0, True, False],
       [16.7, 44, 24.5, ..., 1, False, True],
       [15.4, 45, 22.0, ..., 0, True, False],
       ...,
       [16.0, 41, 22.0, ..., 0, True, False],
       [16.6, 50, 22.0, ..., 0, True, False],
       [18.8, 62, 21.929521276595743, ..., 0, False, True]], dtype=object)

In [25]:
# Create and train the random forest model
model = RandomForestRegressor()
model.fit(x_train, y_train)  # Correct the model fitting line

In [26]:
# Make predictions on the test set
y_pred = model.predict(x_test)
print(y_pred)

[5.428      5.233      5.379      4.563      4.154      5.399
 5.255      4.745      5.223      4.645      4.139      4.06466667
 5.524      5.55375    8.211      4.962      5.358      5.088
 5.011      4.58       4.301      5.189      4.429      4.735
 4.976      4.823      5.127      4.26       4.565      4.432
 5.10316667 4.945      4.354      4.229      5.389      5.11
 4.66519048 4.322      4.438      5.078      5.27       4.549
 4.57       5.009      5.49       4.364      8.367      4.287
 4.429      5.222      4.162      5.145      5.374      4.905
 4.981      3.99833333 5.108      5.494      4.939      4.517
 5.566      4.151      4.562      5.411      4.816      4.56
 8.249      3.928      4.006      5.094      4.457      4.58415476
 4.92016667 4.659      4.787      4.75633333 4.611      4.982
 5.38       4.685      5.079      5.0095     3.86       5.511
 5.714      5.235      5.121      5.044      4.667      3.852
 5.0105     4.225      4.8745     4.772      4.655      5.18
 

In [27]:
# Evaluate the model
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)

Mean Squared Error: 0.3821795521791004
Mean Absolute Error: 0.4446656898656903
Root Mean Squared Error: 0.6182067228517499


In [28]:
# Save the model
joblib.dump(model, 'fleet_fuel.pkl')

['fleet_fuel.pkl']

In [29]:
from sklearn.model_selection import KFold, cross_val_score

# Choose a model (replace with your desired algorithm)
model = RandomForestRegressor() 

# Set up k-fold cross-validation (e.g., 5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate cross-validated RMSE (you can also use MAE or other metrics)
rmse_scores = cross_val_score(model, x, y, cv=kf, scoring='neg_root_mean_squared_error')

# Convert negative RMSE scores to positive
rmse_scores = -rmse_scores 

# Print average RMSE and standard deviation
print("Average RMSE:", rmse_scores.mean())
print("Standard Deviation of RMSE:", rmse_scores.std())

Average RMSE: 0.6887571449768667
Standard Deviation of RMSE: 0.0919128258979233


In [30]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

Mean Absolute Error: 0.4446656898656903
Mean Squared Error: 0.3821795521791004
R² Score: 0.5436583449392025
