In [2]:
# Thus, the objective is to build and train a ML model to estimate the energy consumed 
# by different 5G base stations taking into consideration the impact of various engineering configurations, 
# traffic conditions, and energy-saving methods.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [4]:
data=pd.read_csv("5G_energy_consumption_dataset.csv")
data.head(3)

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719


In [5]:
data.isnull().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

In [6]:
data.columns


Index(['Time', 'BS', 'Energy', 'load', 'ESMODE', 'TXpower'], dtype='object')

In [7]:
data['Time'].duplicated().sum()

92461

In [8]:
data1=data.drop(columns="Time")
data1.head(4)

Unnamed: 0,BS,Energy,load,ESMODE,TXpower
0,B_0,64.275037,0.487936,0.0,7.101719
1,B_0,55.904335,0.344468,0.0,7.101719
2,B_0,57.698057,0.193766,0.0,7.101719
3,B_0,55.156951,0.222383,0.0,7.101719


In [9]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BS       92629 non-null  object 
 1   Energy   92629 non-null  float64
 2   load     92629 non-null  float64
 3   ESMODE   92629 non-null  float64
 4   TXpower  92629 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.5+ MB


In [10]:
# we are predicting the energy consumption 

In [11]:
# getting the unique values in the Base station 
data1.BS.nunique()

923

In [None]:
# FEATURE ENGINEERING ON MY DATA 


In [13]:
# checking on the outiers in the ESMODE column the nergy saving mode ?

In [14]:
# select the relevant columns
data2=data1
data2.columns

Index(['BS', 'Energy', 'load', 'ESMODE', 'TXpower'], dtype='object')

In [15]:
# perform encoding on the Base station  column
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data2["BS"]=le.fit_transform(data2['BS'])

In [16]:
data2.BS.unique()

array([  0,   1, 117, 227, 336, 447, 558, 668, 779, 859,   2,  19,  30,
        51,  62,  73,  84,  95, 106, 118, 129, 140, 151, 162, 173, 184,
       195, 205, 216, 228, 239, 249, 260, 271, 282, 293, 303, 314, 325,
       337, 348, 359, 370, 381, 392, 403, 414, 425, 436, 448, 459, 470,
       481, 492, 503, 514, 525, 536, 547, 559, 570, 581, 592, 603, 614,
       624, 635, 646, 657, 669, 680, 691, 702, 713, 724, 735, 746, 757,
       768, 780, 791, 802, 811, 820, 825, 834, 840, 846, 854, 860, 866,
       868, 876, 882, 887, 892, 898, 905, 912,   3,   9,  11,  12,  13,
        14,  15,  16,  17,  18,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  63,  64,  65,  66,  67,  68,  69,
        70,  71,  72,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  96,  9

In [17]:
# defining my x and y
x=data2.drop(columns="Energy")
y=data2['Energy']

In [18]:
print("my X columns is ")
x.columns

my X columns is 


Index(['BS', 'load', 'ESMODE', 'TXpower'], dtype='object')

In [19]:
print("my y values are is ")
y

my y values are is 


0        64.275037
1        55.904335
2        57.698057
3        55.156951
4        56.053812
           ...    
92624    14.648729
92625    14.648729
92626    13.452915
92627    13.602392
92628    13.303438
Name: Energy, Length: 92629, dtype: float64

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(0,1))

In [22]:
x_scaled=scaler.fit_transform(x)

In [23]:
y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))

In [24]:
X_train, X_test, y_train, y_test=train_test_split(x_scaled,y_scaled,test_size=0.25)

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
model=LinearRegression()
model.fit(X_train,y_train)

In [27]:
model.score(X_test,y_test)

0.5571427107595865

In [None]:
# CHECKING THE PROBLEM OF UNDER FITTING OR OVERFITTING 

In [29]:
model.score(X_train,y_train)

0.5538161625295182

In [33]:
import warnings
warnings.filterwarnings("ignore")

In [31]:
ypredicted=model.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse = mean_squared_error(y_test, ypredicted)
mae = mean_absolute_error(y_test, ypredicted)
r2 = r2_score(y_test, ypredicted)
print(f"MSE: {mse}, MAE: {mae}, R2: {r2}")

MSE: 0.008635167040545807, MAE: 0.06921230575044632, R2: 0.5571427107595865


In [None]:
# RECOMMENDATIONS  I  HAVE TRIED TO USE OTHER TECHNIQUES LIKE SCALING BUT MY MODEL IS NOT STILL IMPROVING IN THE SCORE 
# 1. to apply other techniques like kfold or polynomial linear regression 
# MY RECOMMENDATION IS TO SOLVE THE PROBLEM USING OTHER MODDELS LIKE THE SVM OR THE NAIVE BAYES , or the Random forest AS IMPLEMNETED BELOW 

In [47]:
from sklearn.preprocessing import PolynomialFeatures
pol=PolynomialFeatures(degree=2)

In [71]:
x=pol.fit_transform(x)

In [73]:
y=data2.Energy.values.reshape(-1,1)

In [79]:
y=pol.fit_transform(y)

In [81]:
model2=LinearRegression()
model2.fit(x,y)

In [83]:
model2.score(x,y)

0.7145465829625218

In [None]:
#  as observed with the ploynomial technique the model has increased its accuracy 

In [None]:
#  applying kfold cross validation technique

In [87]:
data3=data2
data3.head(3)

Unnamed: 0,BS,Energy,load,ESMODE,TXpower
0,0,64.275037,0.487936,0.0,7.101719
1,0,55.904335,0.344468,0.0,7.101719
2,0,57.698057,0.193766,0.0,7.101719


In [91]:
from sklearn.model_selection import StratifiedKFold
kf=StratifiedKFold(n_splits=5)

In [99]:
x=data3.drop(columns="Energy")
y=data3.Energy

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.25)

In [109]:
from sklearn.model_selection import cross_val_score
model_scores=cross_val_score(LinearRegression(),x,y)
model_scores

array([0.534275  , 0.61494298, 0.59767298, 0.52906516, 0.41738933])

In [115]:
results=np.average(model_scores)
results

0.5386690909515265

In [None]:
# YOU Realise that when you appy KFOLD crossvalidation the model is perfoming avaregly so the best option is to try other models