In [9]:
import pandas as pd

# Read the raw sensor CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/dht11_new.csv")

In [10]:
# Drop the 'id' column from the dataset.
# `df` is rebound to the result, so on a second run of this cell 'id' is
# already gone; errors='ignore' makes the drop a no-op in that case instead of
# raising KeyError, keeping the cell safe to re-run.
df = df.drop(columns=['id'], errors='ignore')

# Verify that the column has been dropped
df.head()


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
0,32.3,53.0,148.0,1,192.0,0,141.0,2024-04-06 22:39:11,1,205.0,1,242.28,186.67
1,32.3,53.0,148.0,1,184.0,0,141.0,2024-04-06 22:39:18,1,205.0,1,233.98,189.99
2,32.3,53.0,148.0,1,189.0,0,141.0,2024-04-06 22:39:24,1,205.0,1,246.43,212.4
3,32.3,53.0,147.0,1,189.0,0,141.0,2024-04-06 22:39:31,1,205.0,1,234.81,183.35
4,32.3,53.0,147.0,1,184.0,0,141.0,2024-04-06 22:39:38,1,205.0,1,237.3,221.53


In [11]:
# Tally missing entries column by column (isna flags NaN/None/NaT cells,
# summing the boolean mask yields one count per column)
null_values = df.isna().sum()

# Report the per-column totals
print("Null values per column:\n", null_values)


Null values per column:
 temperature      0
humidity         0
mq2_analog       0
mq2_digital      0
sound_analog     0
sound_digital    0
mq9_analog       0
datatime         0
mq9_digital      0
mq8_analog       0
mq8_digital      0
pm25_density     0
pm10_density     0
dtype: int64


In [15]:
# Number of non-null entries in each column
df.notna().sum()

temperature      40171
humidity         40171
mq2_analog       40171
mq2_digital      40171
sound_analog     40171
sound_digital    40171
mq9_analog       40171
datatime         40171
mq9_digital      40171
mq8_analog       40171
mq8_digital      40171
pm25_density     40171
pm10_density     40171
dtype: int64

In [14]:
# Distinct-value count per column (column-wise, missing values not counted)
unique_values = df.nunique(axis=0, dropna=True)

# Show the counts
print(unique_values)


temperature        101
humidity            58
mq2_analog         411
mq2_digital          2
sound_analog       383
sound_digital        2
mq9_analog         376
datatime         39299
mq9_digital          2
mq8_analog         512
mq8_digital          2
pm25_density       510
pm10_density       505
dtype: int64


In [17]:
# Collect every row whose 'datatime' value appears more than once.
# keep=False flags all members of a duplicate group (not just the repeats),
# so the first occurrence of each repeated timestamp is included as well.
# Note: despite its name, this variable holds the matching rows, not a number.
duplicate_datetime_count = df.loc[df.duplicated(subset=['datatime'], keep=False)]

# Report how many rows share a timestamp with at least one other row
print(f"Number of duplicate rows based on 'datatime': {duplicate_datetime_count.shape[0]}")

# Preview the first few of those rows
duplicate_datetime_count.head()


Number of duplicate rows based on 'datatime': 919


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
76,32.3,54.0,174.0,1,193.0,0,206.0,2024-04-06 22:48:17,1,268.0,1,240.62,187.5
77,32.3,54.0,170.0,1,191.0,0,198.0,2024-04-06 22:48:17,1,261.0,1,235.64,178.37
78,32.3,54.0,171.0,1,192.0,0,203.0,2024-04-06 22:48:17,1,265.0,1,237.3,182.52
79,32.3,54.0,171.0,1,192.0,0,201.0,2024-04-06 22:48:17,1,262.0,1,236.47,196.63
80,32.3,53.0,175.0,1,195.0,0,207.0,2024-04-06 22:48:17,1,269.0,1,237.3,197.46


In [21]:
# Number of distinct timestamps among the rows that share a timestamp
duplicate_datetime_count.loc[:, 'datatime'].nunique()

47

In [23]:

# Per-column minimum and maximum over every column, 'datatime' included
# (NOTE(review): 'datatime' is not parsed on load, so its Min/Max are
# presumably string comparisons — these match chronological order only for
# a zero-padded "YYYY-MM-DD HH:MM:SS" format; confirm).
min_values = df.min()
max_values = df.max()

# The mean is taken without 'datatime', so that row shows NaN under 'Mean'
mean_values = df.drop('datatime', axis=1).mean()

# Line the three Series up side by side, one row per column name
summary_stats = pd.DataFrame(dict(Min=min_values, Max=max_values, Mean=mean_values))

print(summary_stats)


                               Min                  Max        Mean
datatime       2024-04-06 22:39:11  2024-11-14 22:44:06         NaN
humidity                       0.0                 83.0   49.426776
mq2_analog                     0.0               1019.0   54.920938
mq2_digital                      0                    1    0.980010
mq8_analog                     0.0               1023.0  115.133629
mq8_digital                      0                    1    0.967638
mq9_analog                     0.0                638.0  182.800229
mq9_digital                      0                    1    0.941998
pm10_density                  -0.1                808.4  188.155494
pm25_density                  -0.1               849.07  230.942126
sound_analog                   0.0                785.0  133.376018
sound_digital                    0                    1    0.144233
temperature                  -18.6                 40.6   31.382054


In [25]:
# Leave out the timestamp and the *_digital flag columns so that only the
# remaining measurement columns are checked for zeros
df_without_datatime_digital = df.drop(
    columns=['datatime', 'mq2_digital', 'mq9_digital', 'mq8_digital', 'sound_digital']
)

# For each remaining column, count the rows whose value is exactly 0
zero_count = df_without_datatime_digital.eq(0).sum()

# Show the per-column totals
print("Number of records with value 0 for each parameter:\n", zero_count)


Number of records with value 0 for each parameter:
 temperature     1475
humidity        1475
mq2_analog      1639
sound_analog     429
mq9_analog        18
mq8_analog        38
pm25_density       0
pm10_density       0
dtype: int64


In [33]:
# Keep only the rows where BOTH temperature and humidity read exactly 0
temp_zero_rows = df.loc[df['temperature'].eq(0) & df['humidity'].eq(0)]

# Show the matching rows
temp_zero_rows


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
222,0.0,0.0,1014.0,1,708.0,1,91.0,2024-04-06 23:37:50,1,51.0,0,171.73,-0.10
227,0.0,0.0,395.0,0,323.0,0,375.0,2024-04-06 23:53:47,0,376.0,0,18.16,-0.10
1358,0.0,0.0,131.0,1,165.0,0,135.0,2024-04-19 12:41:37,1,161.0,1,155.95,190.82
1359,0.0,0.0,142.0,1,169.0,0,133.0,2024-04-19 12:41:44,1,158.0,1,161.77,185.84
1360,0.0,0.0,143.0,1,171.0,0,133.0,2024-04-19 12:41:51,1,157.0,1,150.97,179.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39737,0.0,0.0,28.0,1,117.0,0,167.0,2024-11-14 21:55:34,1,94.0,1,227.34,213.23
39738,0.0,0.0,31.0,1,109.0,0,167.0,2024-11-14 21:55:40,1,95.0,1,307.86,198.29
39739,0.0,0.0,32.0,1,117.0,0,169.0,2024-11-14 21:55:47,1,96.0,1,239.79,203.27
39740,0.0,0.0,35.0,1,147.0,0,182.0,2024-11-14 21:56:08,1,102.0,1,256.39,204.10


In [34]:
# Keep only the rows where temperature, humidity and mq2_analog are ALL exactly 0
temp_zero_rows = df[df[['temperature', 'humidity', 'mq2_analog']].eq(0).all(axis=1)]

# Show the matching rows
temp_zero_rows


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
4644,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:03:22,0,0.0,0,4.05,115.28
4645,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:03:25,0,0.0,0,163.43,126.07
4658,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:04:29,0,0.0,0,-0.1,145.99
4659,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:04:33,0,0.0,0,197.46,145.99
4660,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:43:31,0,0.0,0,59.67,112.79
4661,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:43:34,0,0.0,0,242.28,111.96
4662,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:43:38,0,0.0,0,235.64,102.83


In [38]:
# Keep only the rows where both mq9_analog and mq8_analog read exactly 0.
# (The variable name is reused from the earlier temperature filters; no
# temperature condition is applied here.)
temp_zero_rows = df.loc[df[['mq9_analog', 'mq8_analog']].eq(0).all(axis=1)]

# Show the matching rows
temp_zero_rows


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,datatime,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
2150,38.0,21.0,0.0,0,70.0,1,0.0,2024-04-20 14:35:41,0,0.0,0,256.39,192.48
2151,38.0,21.0,0.0,0,3.0,1,0.0,2024-04-20 14:35:47,0,0.0,0,224.02,193.31
2152,38.0,21.0,0.0,0,1.0,1,0.0,2024-04-20 14:35:54,0,0.0,0,215.72,170.07
2153,38.0,21.0,0.0,0,11.0,1,0.0,2024-04-20 14:36:01,0,0.0,0,204.93,183.35
2154,38.0,21.0,0.0,0,0.0,1,0.0,2024-04-20 14:36:28,0,0.0,0,-0.1,194.14
2155,38.0,21.0,0.0,0,58.0,1,0.0,2024-04-20 14:36:35,0,0.0,0,277.15,194.14
2156,38.0,21.0,0.0,0,0.0,1,0.0,2024-04-20 14:36:41,0,0.0,0,219.87,201.61
4644,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:03:22,0,0.0,0,4.05,115.28
4645,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:03:25,0,0.0,0,163.43,126.07
4658,0.0,0.0,0.0,0,0.0,0,0.0,2024-08-26 22:04:29,0,0.0,0,-0.1,145.99
