In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Location of the raw 5G energy-consumption measurements, relative to the notebook
file_path = "./5G_energy_consumption_dataset.csv"

# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Energy,load,ESMODE,TXpower
count,92629.0,92629.0,92629.0,92629.0
mean,28.138997,0.244705,0.081361,6.765427
std,13.934645,0.234677,0.382317,0.309929
min,0.747384,0.0,0.0,5.381166
25%,18.236173,0.05737,0.0,6.427504
50%,24.06577,0.16555,0.0,6.875934
75%,35.724963,0.363766,0.0,6.875934
max,100.0,0.993957,4.0,8.375336


In [4]:
# Inspect each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


In [5]:
# Display the available columns. A bare expression in the middle of a cell is
# never rendered (only the last expression is), so print it explicitly.
print(df.columns.tolist())

# Convert 'Time' from 'YYYYMMDD HHMMSS' strings to datetimes.
# errors='coerce' turns every unparseable value into NaT instead of raising,
# so verify that nothing was coerced rather than losing timestamps silently.
df['Time'] = pd.to_datetime(df['Time'], format='%Y%m%d %H%M%S', errors='coerce')
unparsed_times = int(df['Time'].isna().sum())
assert unparsed_times == 0, f"{unparsed_times} 'Time' values could not be parsed"

# Confirm the new dtype of 'Time'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Time     92629 non-null  datetime64[ns]
 1   BS       92629 non-null  object        
 2   Energy   92629 non-null  float64       
 3   load     92629 non-null  float64       
 4   ESMODE   92629 non-null  float64       
 5   TXpower  92629 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 4.2+ MB


In [6]:
# Count the missing values (NaN/NaT) in each column
df.isnull().sum()

Time       0
BS         0
Energy     0
load       0
ESMODE     0
TXpower    0
dtype: int64

In [7]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(0)

In [None]:
# Build a ydata-profiling report to gain insight into the data and export it
# as an HTML file
profile_path = "Energy_consumption.html"
profile = ProfileReport(
    df,
    title="5G Energy-consumption Profile Report",
    explorative=True,
)
profile.to_file(profile_path)
print(f"\n Profiling report saved to:{profile_path}")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


 Profiling report saved to:Energy_consumption.html


In [9]:
# There are no missing values
# There are no duplicate values
# Handling outliers: outliers are evident from the disparity between the max
# values and the mean
# Create a function to handle outliers
def remove_outliers(df, column, factor=1.5):
    """Drop the rows whose `column` value lies outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiple of the inter-quartile range allowed beyond each quartile.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows satisfying
        ``Q1 - factor * IQR <= df[column] <= Q3 + factor * IQR``.
        Rows whose `column` value is NaN fail both comparisons and are dropped.
    """
    lower_quartile = df[column].quantile(0.25)
    upper_quartile = df[column].quantile(0.75)
    inter_quartile_range = upper_quartile - lower_quartile
    lower_bound = lower_quartile - factor * inter_quartile_range
    upper_bound = upper_quartile + factor * inter_quartile_range
    within_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    # Return an explicit copy so that assigning columns on the result later
    # does not trigger pandas' SettingWithCopyWarning.
    return df[within_fences].copy()

# Apply the IQR filter to each continuous column in turn; every pass works on
# the rows that survived the previous one
df_cleaned = df
for numeric_column in ('Energy', 'load', 'TXpower'):
    df_cleaned = remove_outliers(df_cleaned, numeric_column)

# Summary statistics (including the row count) after outlier removal
df_cleaned.describe()

Unnamed: 0,Time,Energy,load,ESMODE,TXpower
count,86112,86112.0,86112.0,86112.0,86112.0
mean,2023-01-04 00:31:19.347826176,26.429044,0.212981,0.084211,6.749631
min,2023-01-01 01:00:00,0.747384,0.0,0.0,5.949178
25%,2023-01-02 10:00:00,18.086697,0.05308,0.0,6.427504
50%,2023-01-03 20:00:00,23.168909,0.15074,0.0,6.875934
75%,2023-01-05 14:00:00,32.735426,0.322301,0.0,6.875934
max,2023-01-08 00:00:00,61.883408,0.78638,4.0,7.325859
std,,11.896045,0.194809,0.390193,0.274595


In [10]:
# Encode the categorical feature 'BS' (object dtype) as integer codes.
# NOTE: this is label encoding, not one-hot encoding. Every distinct 'BS'
# value becomes a single integer, which a linear model treats as a numeric
# magnitude.
label_encoder = LabelEncoder()
# Build a new frame with .assign() instead of writing into the filtered slice;
# this avoids pandas' SettingWithCopyWarning and keeps 'BS' in its position.
df_cleaned = df_cleaned.assign(BS=label_encoder.fit_transform(df_cleaned['BS']))

df_cleaned['BS']

1        0
2        0
3        0
4        0
20       0
        ..
92624    6
92625    6
92626    6
92627    6
92628    6
Name: BS, Length: 86112, dtype: int64

In [16]:
# Separate the target ('Energy') from the predictors, then hold out 20% of the
# rows for testing; the fixed random_state keeps the split reproducible
y = df_cleaned['Energy']
X = df_cleaned.loc[:, ['BS', 'load', 'ESMODE', 'TXpower']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split and display the fitted estimator
model = LinearRegression().fit(X_train, y_train)
model

In [17]:
# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: R^2, MSE, RMSE (square root of the MSE) and MAE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)

# MAE and RMSE are printed one per line; R^2 is the cell's displayed value
print(mae, rmse, sep="\n")
r2

6.321446037519323
8.335111296565145


0.5108426082064834

In [None]:
# How to improve the model's performance
# Handle outliers better instead of removing them: apply transformations
# Choose a more powerful model