In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the preprocessed dataset from CSV into `df`.
# The path is relative, so it is resolved against the kernel's current working
# directory when this cell runs.
df = pd.read_csv("../data/processed/dht11_preprocessed.csv")

In [2]:
# Preview the loaded frame: as the last expression of the cell it is rendered
# as the cell output.
df

Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
0,32.3,53.0,148.0,1,192.0,0,141.0,2024-04-06 22:39:11,1,205.0,1,242.28,186.67
1,32.3,53.0,148.0,1,184.0,0,141.0,2024-04-06 22:39:18,1,205.0,1,233.98,189.99
2,32.3,53.0,148.0,1,189.0,0,141.0,2024-04-06 22:39:24,1,205.0,1,246.43,212.40
3,32.3,53.0,147.0,1,189.0,0,141.0,2024-04-06 22:39:31,1,205.0,1,234.81,183.35
4,32.3,53.0,147.0,1,184.0,0,141.0,2024-04-06 22:39:38,1,205.0,1,237.30,221.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39294,30.8,39.0,31.0,1,120.0,0,175.0,2024-11-14 22:43:39,1,99.0,1,241.45,214.89
39295,30.8,39.0,31.0,1,153.0,0,176.0,2024-11-14 22:43:45,1,100.0,1,250.58,201.61
39296,30.8,39.0,31.0,1,115.0,0,175.0,2024-11-14 22:43:52,1,99.0,1,237.30,210.74
39297,30.8,39.0,31.0,1,121.0,0,175.0,2024-11-14 22:43:59,1,99.0,1,234.81,205.76


In [6]:
# Per-feature summary statistics of `df`. These are computed on the values as
# stored in `df` (not on a standardized copy), so they are in each column's
# own units.
#
# describe() returns one column per feature and one row per statistic, so the
# table is transposed to get one row per feature, trimmed to the four
# statistics of interest, and the feature names are moved out of the index
# into a regular 'Feature' column.
stat_features = (
    df.describe()
      .transpose()
      .loc[:, ['mean', 'min', 'max', 'std']]
      .reset_index()
      .rename(columns={'index': 'Feature'})
)

# Heading and table are emitted on consecutive lines of plain-text output
print(
    "Statistical Features (Mean, Min, Max, Standard Deviation):",
    stat_features,
    sep="\n",
)


Statistical Features (Mean, Min, Max, Standard Deviation):
          Feature        mean   min      max        std
0     temperature   31.350121 -18.6    40.60   6.485694
1        humidity   49.537927   0.0    83.00  15.506969
2      mq2_analog   55.102038   0.0  1019.00  53.237731
3     mq2_digital    0.979643   0.0     1.00   0.141219
4    sound_analog  133.182931   0.0   785.00  41.812949
5   sound_digital    0.146696   0.0     1.00   0.353807
6      mq9_analog  182.916614   0.0   638.00  45.419716
7     mq9_digital    0.940787   0.0     1.00   0.236025
8      mq8_analog  115.381155   0.0  1023.00  63.922832
9     mq8_digital    0.966997   0.0     1.00   0.178648
10   pm25_density  231.088810  -0.1   849.07  42.157674
11   pm10_density  188.149438  -0.1   808.40  38.679286


In [7]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize and correlate. The 'datatime' timestamp column is
# deliberately not part of this list.
cols_to_scale = [
    'temperature',
    'humidity',
    'mq2_analog',
    'mq2_digital',
    'sound_analog',
    'sound_digital',
    'mq9_analog',
    'mq9_digital',
    'mq8_analog',
    'mq8_digital',
    'pm25_density',
    'pm10_density',
]

# Standardize each selected column with StandardScaler (fitted on the full
# frame) and wrap the result in a new frame that keeps df's row index and the
# column names, leaving `df` itself untouched.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[cols_to_scale]),
    index=df.index,
    columns=cols_to_scale,
)

# Pairwise correlations between the standardized columns.
# Note: DataFrame.corr() defaults to Pearson correlation, which is unchanged by
# per-column standardization, so this matrix matches df[cols_to_scale].corr()
# up to floating-point error.
correlation_matrix = df_scaled.corr()

# Last expression of the cell: rendered as the cell output
correlation_matrix


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
temperature,1.0,0.427505,0.137798,-0.014639,0.02184,0.099146,0.035187,-0.046872,0.138455,0.010024,0.046443,0.026377
humidity,0.427505,1.0,-0.153174,0.099123,-0.130519,0.089505,-0.136753,-0.050964,-0.267821,0.033534,0.194565,-0.101059
mq2_analog,0.137798,-0.153174,1.0,-0.132095,0.55378,0.246226,0.427902,-0.127499,0.707836,-0.124795,-0.054048,0.078869
mq2_digital,-0.014639,0.099123,-0.132095,1.0,-0.022783,-0.255994,0.189962,0.352428,0.020902,0.771208,0.210066,0.026232
sound_analog,0.02184,-0.130519,0.55378,-0.022783,1.0,-0.393567,0.359485,-0.123499,0.166457,-0.064507,0.091738,0.17142
sound_digital,0.099146,0.089505,0.246226,-0.255994,-0.393567,1.0,-0.097201,-0.103191,0.385956,-0.17784,-0.151485,-0.174302
mq9_analog,0.035187,-0.136753,0.427902,0.189962,0.359485,-0.097201,1.0,-0.180214,0.576299,0.104303,0.039888,0.064292
mq9_digital,-0.046872,-0.050964,-0.127499,0.352428,-0.123499,-0.103191,-0.180214,1.0,-0.051757,0.268073,-0.093451,-0.144147
mq8_analog,0.138455,-0.267821,0.707836,0.020902,0.166457,0.385956,0.576299,-0.051757,1.0,0.105206,-0.13232,0.006951
mq8_digital,0.010024,0.033534,-0.124795,0.771208,-0.064507,-0.17784,0.104303,0.268073,0.105206,1.0,0.08806,-0.040723


In [8]:
# List every distinct pair of features whose correlation exceeds the threshold.
#
# NOTE(review): only positive correlations are selected here; strongly
# negative pairs are not reported. Compare on the absolute value instead if
# those should be included.
CORR_THRESHOLD = 0.3

# Position of each feature in the matrix, used below to pick one triangle of
# the (symmetric) matrix by location.
column_rank = {name: pos for pos, name in enumerate(correlation_matrix.columns)}

# Long form: one row per (row feature, column feature) cell of the matrix
high_corr = correlation_matrix.stack().reset_index()
high_corr.columns = ["Parameter 1", "Parameter 2", "Correlation"]

# Keep the off-diagonal cells above the threshold.
# - The filter is applied to the long form explicitly instead of masking the
#   matrix and relying on stack() to drop the resulting NaNs, so the outcome
#   does not depend on whether stack() discards missing values. NaN
#   correlations fail the comparison and are dropped here as well.
# - A feature paired with itself is excluded by label rather than by testing
#   for a value below 1.0, so two different features that are perfectly
#   correlated are still reported.
above_threshold = high_corr["Correlation"] > CORR_THRESHOLD
is_self_pair = high_corr["Parameter 1"] == high_corr["Parameter 2"]
high_corr = high_corr[above_threshold & ~is_self_pair].reset_index(drop=True)

# Every remaining pair appears twice (A-B and B-A). Keep the occurrence whose
# first feature comes earlier in the matrix's column order (the upper
# triangle), which is also the first occurrence in row-major order.
in_upper_triangle = (
    high_corr["Parameter 1"].map(column_rank)
    < high_corr["Parameter 2"].map(column_rank)
)
unique_corr = high_corr[in_upper_triangle]

# Print the unique pairs, correlation rounded to two decimals
print(f"Unique pairs with correlation greater than {CORR_THRESHOLD}:")
for first, second, value in unique_corr.itertuples(index=False, name=None):
    print(f"{first} and {second}: {value:.2f}")


Unique pairs with correlation greater than 0.3:
temperature and humidity: 0.43
mq2_analog and sound_analog: 0.55
mq2_analog and mq9_analog: 0.43
mq2_analog and mq8_analog: 0.71
mq2_digital and mq9_digital: 0.35
mq2_digital and mq8_digital: 0.77
sound_analog and mq9_analog: 0.36
sound_digital and mq8_analog: 0.39
mq9_analog and mq8_analog: 0.58
pm25_density and pm10_density: 0.41
