In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw sensor readings from disk.
data = pd.read_csv('2_sensor_data.csv')

# 1. Per-sensor-type summary: number of non-null readings and their mean value.
sensor_stats = (
    data
    .groupby('SensorType', observed=False)['Value']
    .agg(['count', 'mean'])
)
print("Sensor Data: Count and Mean")
print(sensor_stats)

# 2. Mean temperature and humidity per location.
#    Keep only the two sensor types of interest, average per (Location, SensorType),
#    then pivot SensorType into columns so each location is a single row.
location_stats = (
    data
    .loc[lambda frame: frame['SensorType'].isin(['Temperature', 'Humidity'])]
    .groupby(['Location', 'SensorType'])['Value']
    .mean()
    .unstack()
)
print("Calculate temperature and humidity for every location:")
print(location_stats)

Sensor Data: Count and Mean
              count       mean
SensorType                    
Humidity       2065  48.978302
Light          1950  49.439011
SoilMoisture   1951  49.325119
SoilPH         2029  49.865460
Temperature    2005  49.847153
Calculate temperature and humidity for every location:
SensorType   Humidity  Temperature
Location                          
Field1      49.036008    50.482617
Field2      47.379411    49.874282
Field3      49.583266    48.761804
Field4      49.955498    50.241608


In [3]:
# 3. Data cleaning and abnormal value handling
#
# Steps: impute missing readings, flag out-of-range readings, drop the flagged
# rows, write the cleaned table to CSV, and print a before/after row summary.

# Accepted value range (inclusive bounds) per sensor type. A reading strictly
# below the lower bound or strictly above the upper bound is treated as abnormal.
# Sensor types not listed here are never range-checked.
VALID_RANGES = {
    'Temperature': (-10, 50),
    'Humidity': (0, 100),
}

# Impute BEFORE flagging: comparisons against NaN evaluate to False, so a
# missing reading would otherwise never be flagged and its imputed value would
# slip into the cleaned output without being range-checked.
# Forward fill first, then backward fill to cover any gap at the very start.
# NOTE(review): the fill follows row order and ignores SensorType/Location, so a
# gap can be filled with a reading from a different sensor — confirm intended.
data['Value'] = data['Value'].ffill().bfill()

# Build the abnormal mask one sensor type at a time from the table above.
is_abnormal = np.zeros(len(data), dtype=bool)
for sensor_type, (lower, upper) in VALID_RANGES.items():
    matches_type = data['SensorType'] == sensor_type
    out_of_range = (data['Value'] < lower) | (data['Value'] > upper)
    is_abnormal |= (matches_type & out_of_range).to_numpy()
data['is_abnormal'] = is_abnormal

abnormal_count = data['is_abnormal'].sum()
print("Abnormal value count:", abnormal_count)

# Keep only the in-range rows; the helper flag column is not part of the export.
cleaned_data = data[~data['is_abnormal']].drop(columns=['is_abnormal'])
cleaned_data.to_csv('2_sensor_data_cleaned.csv', index=False)
print("Data cleaning complete. Saved as '2_sensor_data_cleaned.csv'")

original_count = len(data)
cleaned_count = len(cleaned_data)
# Every row is either kept or dropped as abnormal; anything else is a logic error.
assert cleaned_count + abnormal_count == original_count

stats_df = pd.DataFrame({
    'Stage': ['Original Data', 'Abnormal Data', 'After Data Cleaning'],
    'Row': [original_count, abnormal_count, cleaned_count],
    'Ratio': ['100%', f'{abnormal_count/original_count:.2%}', f'{cleaned_count/original_count:.2%}']
})
print("Data cleaning status:")
print(stats_df)

Abnormal value count: 1011
Data cleaning complete. Saved as '2_sensor_data_cleaned.csv'
Data cleaning status:
                 Stage    Row   Ratio
0        Original Data  10000    100%
1        Abnormal Data   1011  10.11%
2  After Data Cleaning   8989  89.89%
