In [1]:
import pandas as pd

# Load the CSV

df = pd.read_csv('../data/iot_telemetry_data.csv')

# Show basic info
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)
df.head()


Shape: (405184, 9)

Columns:
 Index(['ts', 'device', 'co', 'humidity', 'light', 'lpg', 'motion', 'smoke',
       'temp'],
      dtype='object')


Unnamed: 0,ts,device,co,humidity,light,lpg,motion,smoke,temp
0,1594512000.0,b8:27:eb:bf:9d:51,0.004956,51.0,False,0.007651,False,0.020411,22.7
1,1594512000.0,00:0f:00:70:91:0a,0.00284,76.0,False,0.005114,False,0.013275,19.700001
2,1594512000.0,b8:27:eb:bf:9d:51,0.004976,50.9,False,0.007673,False,0.020475,22.6
3,1594512000.0,1c:bf:ce:15:ec:4d,0.004403,76.800003,True,0.007023,False,0.018628,27.0
4,1594512000.0,b8:27:eb:bf:9d:51,0.004967,50.9,False,0.007664,False,0.020448,22.6


In [2]:
df.isnull().sum()

ts          0
device      0
co          0
humidity    0
light       0
lpg         0
motion      0
smoke       0
temp        0
dtype: int64

In [3]:
df['ts'].head()
df['ts'] = pd.to_datetime(df['ts'])  # Convert to datetime if not already
df.dtypes


ts          datetime64[ns]
device              object
co                 float64
humidity           float64
light                 bool
lpg                float64
motion                bool
smoke              float64
temp               float64
dtype: object

In [4]:
df.describe()

Unnamed: 0,ts,co,humidity,lpg,smoke,temp
count,405184,405184.0,405184.0,405184.0,405184.0,405184.0
mean,1970-01-01 00:00:01.594858016,0.004639,60.511694,0.007237,0.019264,22.453987
min,1970-01-01 00:00:01.594512094,0.001171,1.1,0.002693,0.006692,0.0
25%,1970-01-01 00:00:01.594685999,0.003919,51.0,0.006456,0.017024,19.9
50%,1970-01-01 00:00:01.594857988,0.004812,54.9,0.007489,0.01995,22.2
75%,1970-01-01 00:00:01.595030576,0.005409,74.300003,0.00815,0.021838,23.6
max,1970-01-01 00:00:01.595203417,0.01442,99.900002,0.016567,0.04659,30.6
std,,0.00125,11.366489,0.001444,0.004086,2.698347


In [5]:
print(df['ts'].head())
print(df['ts'].dtype)


0   1970-01-01 00:00:01.594512094
1   1970-01-01 00:00:01.594512094
2   1970-01-01 00:00:01.594512098
3   1970-01-01 00:00:01.594512099
4   1970-01-01 00:00:01.594512101
Name: ts, dtype: datetime64[ns]
datetime64[ns]


In [7]:
df_raw = pd.read_csv('../data/iot_telemetry_data.csv', dtype={'ts': str})
print(df_raw['ts'].head())

0    1.5945120943859746E9
1    1.5945120947355676E9
2    1.5945120980735729E9
3     1.594512099589146E9
4     1.594512101761235E9
Name: ts, dtype: object


In [8]:
# First load the CSV normally
df = pd.read_csv('../data/iot_telemetry_data.csv')

# Convert 'ts' column from float timestamps (seconds since epoch) to datetime
df['ts'] = pd.to_datetime(df['ts'], unit='s')

# Check the output
print(df['ts'].head())

0   2020-07-12 00:01:34.385974646
1   2020-07-12 00:01:34.735567570
2   2020-07-12 00:01:38.073572874
3   2020-07-12 00:01:39.589145899
4   2020-07-12 00:01:41.761234999
Name: ts, dtype: datetime64[ns]


In [9]:
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 13


In [10]:
df = df.drop_duplicates()
print(f"New shape after dropping duplicates: {df.shape}")

New shape after dropping duplicates: (405171, 9)


In [11]:
import pandas as pd

# Load your CSV into df (make sure path is correct)
df = pd.read_csv('../data/iot_telemetry_data.csv')

# If 'ts' column is timestamp, parse it
df['ts'] = pd.to_datetime(df['ts'])

In [12]:
import pandas as pd

# Assuming df is your DataFrame already loaded

def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    print(f"Outliers in {column}: {len(outliers)}")
    return outliers

numeric_cols = ['co', 'humidity', 'lpg', 'smoke', 'temp']

for col in numeric_cols:
    outliers = detect_outliers_iqr(df, col)
    # Optional: display or save these outliers
    # print(outliers.head())

Outliers in co: 10480
Outliers in humidity: 44
Outliers in lpg: 12624
Outliers in smoke: 12271
Outliers in temp: 8616


In [13]:
print(df['device'].nunique())
df['device'].value_counts()

3


device
b8:27:eb:bf:9d:51    187451
00:0f:00:70:91:0a    111815
1c:bf:ce:15:ec:4d    105918
Name: count, dtype: int64

In [14]:
print(df.isnull().sum())

ts          0
device      0
co          0
humidity    0
light       0
lpg         0
motion      0
smoke       0
temp        0
dtype: int64
