In [37]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [38]:
# Print the kernel's current working directory.
# `os` is already imported in the notebook's first cell, so it is not re-imported here.
print(os.getcwd())

c:\Users\sebla\.docker\Mlops_Electricity_Predictor\app


In [39]:
# Locate the data folder relative to the working directory instead of through a
# machine-specific absolute path: the kernel runs from `<project>/app`, and the
# dataset lives in the sibling `<project>/data` folder.
dossier = os.path.join(os.path.dirname(os.getcwd()), 'data')

# List the folder's contents to confirm the dataset file is present.
print(os.listdir(dossier))

['electricity_cost_dataset.csv']


In [40]:
# Full path of the dataset CSV inside the data folder.
nom_fichier = 'electricity_cost_dataset.csv'
fichier_csv = os.path.join(dossier, nom_fichier)

In [41]:
# Load the dataset into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=fichier_csv)
df.head(n=5)

Unnamed: 0,site area,structure type,water consumption,recycling rate,utilisation rate,air qality index,issue reolution time,resident count,electricity cost
0,1360,Mixed-use,2519.0,69,52,188,1,72,1420.0
1,4272,Mixed-use,2324.0,50,76,165,65,261,3298.0
2,3592,Mixed-use,2701.0,20,94,198,39,117,3115.0
3,966,Residential,1000.0,13,60,74,3,35,1575.0
4,4926,Residential,5990.0,23,65,32,57,185,4301.0


In [42]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(10000, 9)

In [43]:
# Count the missing values in each column and show the counts in ascending order.
valeurs_manquantes = df.isnull().sum()
valeurs_manquantes.sort_values()

site area               0
structure type          0
water consumption       0
recycling rate          0
utilisation rate        0
air qality index        0
issue reolution time    0
resident count          0
electricity cost        0
dtype: int64

In [44]:
# Duplicate detection: count the rows flagged by `DataFrame.duplicated()`.
nb_doublons = df.duplicated().sum()
print(f"Doublons : {nb_doublons}")

Doublons : 0


In [45]:
# Column renaming: switch the raw CSV headers to snake_case identifiers.
# The keys reproduce the raw headers exactly, including their misspellings
# ('air qality index', 'issue reolution time'); the new names correct them.
renommage_colonnes = {
    'site area': 'site_area',
    'structure type': 'structure_type',
    'water consumption': 'water_consumption',
    'recycling rate': 'recycling_rate',
    'utilisation rate': 'utilisation_rate',
    'air qality index': 'air_quality_index',
    'issue reolution time': 'issue_resolution_time',
    'resident count': 'resident_count',
    'electricity cost': 'electricity_cost',
}
df.rename(columns=renommage_colonnes, inplace=True)

In [46]:
# Preview the first rows again to check the column headers after renaming.
df.head()

Unnamed: 0,site_area,structure_type,water_consumption,recycling_rate,utilisation_rate,air_quality_index,issue_resolution_time,resident_count,electricity_cost
0,1360,Mixed-use,2519.0,69,52,188,1,72,1420.0
1,4272,Mixed-use,2324.0,50,76,165,65,261,3298.0
2,3592,Mixed-use,2701.0,20,94,198,39,117,3115.0
3,966,Residential,1000.0,13,60,74,3,35,1575.0
4,4926,Residential,5990.0,23,65,32,57,185,4301.0


In [47]:
# ML model — feature/target separation:
# the target is the electricity cost, and every other column is a feature.
y = df["electricity_cost"]
X = df.drop(columns=["electricity_cost"])

# Column groups for preprocessing: numeric dtypes vs. object dtype.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(include="object").columns)

In [48]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown="ignore" is passed to the encoder).
etape_numerique = ("num", StandardScaler(), num_cols)
etape_categorielle = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer([etape_numerique, etape_categorielle])

# Complete pipeline: preprocessing followed by a linear regression model.
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LinearRegression()),
])

In [49]:
# Data split: hold out 20 % of the rows for testing, with a fixed seed (42).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training: fit the preprocessing + regression pipeline on the training rows.
pipeline.fit(X_train, y_train)

In [50]:
# Evaluation: predict on the held-out rows, then score the predictions.
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both metrics rounded to two decimals, one per line.
print(f"✅ MSE : {mse:.2f}", f"✅ R² : {r2:.2f}", sep="\n")

✅ MSE : 97381.90
✅ R² : 0.92


In [53]:
# Project root, derived from the working directory instead of a machine-specific
# absolute path: the kernel runs from `<project>/app`, so the root is its parent.
racine_projet = os.path.dirname(os.getcwd())

# Destination of the serialized pipeline: `<project>/model/model.joblib`.
chemin_modele = os.path.join(racine_projet, 'model', 'model.joblib')

# joblib.dump does not create missing folders, so make sure the target folder
# exists first (no-op when it is already there).
os.makedirs(os.path.dirname(chemin_modele), exist_ok=True)

# Save the whole fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(pipeline, chemin_modele)

print(f"✅ Modèle sauvegardé dans : {chemin_modele}")

✅ Modèle sauvegardé dans : C:\Users\sebla\.docker\Mlops_Electricity_Predictor\model\model.joblib


In [54]:
# Sanity check: number of numeric columns, categorical columns, and their total.
nb_num, nb_cat = len(num_cols), len(cat_cols)
print(nb_num, nb_cat, nb_num + nb_cat)

7 1 8


In [None]:
# Take the first feature row as a sample input. The row mixes a string column
# with numeric ones, so it prints as an object-dtype Series.
example = X.iloc[0]
print(example)

site_area                     1360
structure_type           Mixed-use
water_consumption           2519.0
recycling_rate                  69
utilisation_rate                52
air_quality_index              188
issue_resolution_time            1
resident_count                  72
Name: 0, dtype: object
