<a href="https://colab.research.google.com/github/TraLauren/TraciaLaurenCV/blob/master/data_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
cold_data = pd.read_csv('/content/drive/MyDrive/cold_source_control_dataset.csv',sep=",")
cold_data.head()

Unnamed: 0,Timestamp,Server_Workload(%),Inlet_Temperature(°C),Outlet_Temperature(°C),Ambient_Temperature(°C),Cooling_Unit_Power_Consumption(kW),Chiller_Usage(%),AHU_Usage(%),Total_Energy_Cost($),Temperature_Deviation(°C),Cooling_Strategy_Action,Output
0,2025-01-01 00:00:00,100.0,24.726323,28.469324,21.424454,0.98,94.205859,45.462911,0.09,2.58,Reduce AHU,1
1,2025-01-01 01:00:00,87.358993,23.085262,29.854364,26.420271,0.95,85.776996,53.185858,0.1,2.42,Eco Mode,4
2,2025-01-01 02:00:00,19.3138,16.288791,23.381846,21.452172,0.43,28.148672,40.833469,0.06,2.55,Eco Mode,4
3,2025-01-01 03:00:00,41.40668,19.21728,24.323699,19.200412,0.61,49.300227,42.490922,0.08,4.8,Boost All,3
4,2025-01-01 04:00:00,49.514889,19.99174,23.205543,26.768503,0.69,49.656127,58.10089,0.1,2.77,Eco Mode,4


In [11]:
cold_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3498 entries, 0 to 3497
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Timestamp                           3498 non-null   object 
 1   Server_Workload(%)                  3498 non-null   float64
 2   Inlet_Temperature(°C)               3498 non-null   float64
 3   Outlet_Temperature(°C)              3498 non-null   float64
 4   Ambient_Temperature(°C)             3498 non-null   float64
 5   Cooling_Unit_Power_Consumption(kW)  3498 non-null   float64
 6   Chiller_Usage(%)                    3498 non-null   float64
 7   AHU_Usage(%)                        3498 non-null   float64
 8   Total_Energy_Cost($)                3498 non-null   float64
 9   Temperature_Deviation(°C)           3498 non-null   float64
 10  Cooling_Strategy_Action             3498 non-null   object 
 11  Output                              3498 no

# Preprocessing

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
missing_values = cold_data.isnull().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 Timestamp                             0
Server_Workload(%)                    0
Inlet_Temperature(°C)                 0
Outlet_Temperature(°C)                0
Ambient_Temperature(°C)               0
Cooling_Unit_Power_Consumption(kW)    0
Chiller_Usage(%)                      0
AHU_Usage(%)                          0
Total_Energy_Cost($)                  0
Temperature_Deviation(°C)             0
Cooling_Strategy_Action               0
Output                                0
dtype: int64


No missing data

## IQR

In [22]:
def detect_iqr_outliers(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return data[(data[column] < lower) | (data[column] > upper)]


In [23]:
numerical_cols = cold_data.select_dtypes(include=np.number)

iqr_outlier_count = 0
for col in numerical_cols:
    outliers = detect_iqr_outliers(cold_data, col)
    iqr_outlier_count += len(outliers)

print("Total IQR outliers detected:", iqr_outlier_count)


Total IQR outliers detected: 121


## Z-Score Method

In [27]:
from scipy.stats import zscore


In [29]:
numerical_cols = cold_data.select_dtypes(include=['int64', 'float64']).columns


In [30]:
print(numerical_cols)


Index(['Server_Workload(%)', 'Inlet_Temperature(°C)', 'Outlet_Temperature(°C)',
       'Ambient_Temperature(°C)', 'Cooling_Unit_Power_Consumption(kW)',
       'Chiller_Usage(%)', 'AHU_Usage(%)', 'Total_Energy_Cost($)',
       'Temperature_Deviation(°C)', 'Output'],
      dtype='object')


In [31]:
X = cold_data[numerical_cols].to_numpy(dtype=float)

mean = np.nanmean(X, axis=0)
std = np.nanstd(X, axis=0)

z_scores = np.abs((X - mean) / std)
z_outliers = np.nan_to_num(z_scores).any(axis=1) > 3

print("Total Z-score outliers detected:", z_outliers.sum())


Total Z-score outliers detected: 0
