In [3]:
import pandas as pd

df = pd.read_csv("raw_iot_data.csv")
print(df.head())
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())



                    timestamp device_id    data_type data_value
0  2025-03-04 20:41:46.466097  Device_4       Energy     22.5°C
1  2025-03-04 21:41:46.466095  Device_5     Humidity        45%
2  2025-03-04 22:41:46.466093  Device_3  Temperature   18.3 kWh
3  2025-03-04 23:41:46.466090  Device_5     Humidity        NaN
4  2025-03-05 00:41:46.466089  Device_5     Humidity     23.1°C

Columns: ['timestamp', 'device_id', 'data_type', 'data_value']

Missing values:
 timestamp     0
device_id     0
data_type     0
data_value    2
dtype: int64


In [None]:
import pandas as pd  
df = pd.read_csv("raw_iot_data.csv")

print("📋 IoT Device Data:\n")
print("timestamp | device_id | data_type | data_value")
print("-" * 50)

for index, row in df.iterrows():
    timestamp = row['timestamp']
    device_id = row['device_id']
    data_type = row['data_type']
    data_value = row['data_value']
    
    print(f"{timestamp} | {device_id} | {data_type} | {data_value}")

📋 IoT Device Data:

timestamp | device_id | data_type | data_value
--------------------------------------------------
2025-03-04 20:41:46.466097 | Device_4 | Energy | 22.5°C
2025-03-04 21:41:46.466095 | Device_5 | Humidity | 45%
2025-03-04 22:41:46.466093 | Device_3 | Temperature | 18.3 kWh
2025-03-04 23:41:46.466090 | Device_5 | Humidity | nan
2025-03-05 00:41:46.466089 | Device_5 | Humidity | 23.1°C
2025-03-05 01:41:46.466087 | Device_2 | Humidity | 50%
2025-03-05 02:41:46.466086 | Device_3 | Humidity | 19.0 kWh
2025-03-05 03:41:46.466084 | Device_3 | Temperature | 24.0°C
2025-03-05 04:41:46.466069 | Device_3 | Temperature | nan
2025-03-05 05:41:46.466061 | Device_5 | Humidity | 47%


In [12]:
import pandas as pd
import re

df = pd.read_csv("raw_iot_data.csv")

print("\n🔍 Missing values per column:")
print(df.isnull().sum())

df['data_value'].replace('', pd.NA, inplace=True)

df = df.dropna(subset=['data_value'])

def extract_number(value):
    match = re.search(r"[-+]?\d*\.\d+|\d+", str(value))  # match float or int
    return float(match.group()) if match else None

df['numeric_value'] = df['data_value'].apply(extract_number)
df['timestamp'] = pd.to_datetime(df['timestamp'])

df['hour'] = df['timestamp'].dt.floor('H')
grouped = df.groupby(['hour', 'data_type'])['numeric_value'].mean().reset_index()

print("\n📊 Hourly Average Readings by Data Type:")
print(grouped)

print("\n🧯 Duplicate rows found:", df.duplicated().sum())

df = df.drop_duplicates()

def detect_outliers(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return series[(series < lower) | (series > upper)]

print("\n Potential Outliers:")
for dtype in df['data_type'].unique():
    outliers = detect_outliers(df[df['data_type'] == dtype]['numeric_value'])
    if not outliers.empty:
        print(f"\nOutliers in '{dtype}':")
        print(outliers)




🔍 Missing values per column:
timestamp     0
device_id     0
data_type     0
data_value    2
dtype: int64

📊 Hourly Average Readings by Data Type:
                 hour    data_type  numeric_value
0 2025-03-04 20:00:00       Energy           22.5
1 2025-03-04 21:00:00     Humidity           45.0
2 2025-03-04 22:00:00  Temperature           18.3
3 2025-03-05 00:00:00     Humidity           23.1
4 2025-03-05 01:00:00     Humidity           50.0
5 2025-03-05 02:00:00     Humidity           19.0
6 2025-03-05 03:00:00  Temperature           24.0
7 2025-03-05 05:00:00     Humidity           47.0

🧯 Duplicate rows found: 0

 Potential Outliers:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['data_value'].replace('', pd.NA, inplace=True)
  df['hour'] = df['timestamp'].dt.floor('H')
