In [2]:
import timeit
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Load the dataset
# The CSV path is relative to the notebook's working directory.
dataset = pd.read_csv('./data/Processed_Combined_IoT_dataset.csv')

# Exploratory Data Analysis
print("Dataset shape:", dataset.shape)
print("Columns:", list(dataset.columns))

# Positional view of the schema: the last column vs. everything before it.
# These lists are informational only; the split below selects 'label' by name.
target_cols = dataset.columns[-1:].tolist()
feature_cols = dataset.columns[:-1].tolist()

# Split Dataset into Features and Target
X = dataset.drop('label', axis=1)
y = dataset['label']

print("X head:\n", X.head())
print("y head:\n", y.head())

# Splitting Data into Training and Test Sets (70% train, 30% test)
# stratify=y keeps the class proportions of `label` identical in both
# partitions, so the precision/recall/F1 figures computed on the hold-out set
# are not skewed by a split that happens to over- or under-sample one class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Feature Scaling
# The scaler learns mean/variance from the training partition only and the same
# statistics are then applied to the test partition (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Tuning Logistic Regression with Grid Search
# 4 values of C x 2 penalties x 2 class-weight settings = 16 candidates, each
# scored by 5-fold cross-validation on the training partition using F1.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'class_weight': [None, 'balanced']
}
# The 'saga' solver handles both the 'l1' and 'l2' penalties in the grid.
clf = LogisticRegression(solver='saga', max_iter=1000, random_state=1)

# NOTE: X_train_scaled was standardized with statistics from the whole training
# partition before the search, so every CV validation fold contributed to the
# scaling; the CV scores used for model selection may be slightly optimistic.
# NOTE: train_time measures the entire grid search (all candidates, all folds,
# plus the final refit of the best candidate), not a single model fit.
start = timeit.default_timer()
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
train_time = timeit.default_timer() - start

# Best model from grid search
best_clf = grid_search.best_estimator_
print("\nBest Parameters from Grid Search:", grid_search.best_params_)

# Evaluate on Test Set
# Only the hard-label prediction call is timed.
start = timeit.default_timer()
y_pred = best_clf.predict(X_test_scaled)
test_time = timeit.default_timer() - start

# ROC AUC is a ranking metric and needs continuous scores. Feeding it the hard
# 0/1 predictions collapses the curve to a single operating point and
# under-reports the model's discrimination. Column 1 of predict_proba is the
# probability of the second entry of best_clf.classes_ (the positive class for
# a binary target).
y_score = best_clf.predict_proba(X_test_scaled)[:, 1]

print("\nSingle Split Evaluation (Tuned Logistic Regression):")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Cohen's Kappa:", metrics.cohen_kappa_score(y_test, y_pred))
print("ROC AUC:", metrics.roc_auc_score(y_test, y_score))
print("Train Time (s):", train_time)
print("Test Time (s):", test_time)

# K-Fold Cross-Validation with Tuned Model
# Re-evaluates the hyper-parameters chosen by the grid search on k shuffled
# folds. The folds are drawn from the full dataset (X, y), so they also contain
# the rows that formed the earlier hold-out test partition.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=1)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
train_times = []
test_times = []

print(f"\nStarting {k}-Fold Cross-Validation with Tuned Logistic Regression...")
for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[val_index]
    y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[val_index]

    # Scale features within each fold
    # A fresh scaler is created per fold. Re-fitting the shared `scaler` here
    # would overwrite the statistics learned from X_train, leaving it
    # inconsistent with `best_clf` / `X_test_scaled` once this cell has run.
    fold_scaler = StandardScaler()
    X_train_kf_scaled = fold_scaler.fit_transform(X_train_kf)
    X_val_kf_scaled = fold_scaler.transform(X_val_kf)

    # Train the tuned model
    # The estimator is built before the timer starts so that only fit() is timed.
    clf_kf = LogisticRegression(**grid_search.best_params_, solver='saga', max_iter=1000, random_state=1)
    start = timeit.default_timer()
    clf_kf.fit(X_train_kf_scaled, y_train_kf)
    train_times.append(timeit.default_timer() - start)

    # Predict and evaluate
    start = timeit.default_timer()
    y_pred_kf = clf_kf.predict(X_val_kf_scaled)
    test_times.append(timeit.default_timer() - start)

    # Store metrics
    accuracy_scores.append(metrics.accuracy_score(y_val_kf, y_pred_kf))
    precision_scores.append(metrics.precision_score(y_val_kf, y_pred_kf))
    recall_scores.append(metrics.recall_score(y_val_kf, y_pred_kf))
    f1_scores.append(metrics.f1_score(y_val_kf, y_pred_kf))

    print(f"Fold {fold} - Accuracy: {accuracy_scores[-1]:.4f}, "
          f"Precision: {precision_scores[-1]:.4f}, "
          f"Recall: {recall_scores[-1]:.4f}, "
          f"F1 Score: {f1_scores[-1]:.4f}")

# Summary of K-Fold Results
# Mean and (population, ddof=0) standard deviation across the k folds.
print(f"\n{k}-Fold Cross-Validation Summary (Tuned Logistic Regression):")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f} (±{np.std(accuracy_scores):.4f})")
print(f"Average Precision: {np.mean(precision_scores):.4f} (±{np.std(precision_scores):.4f})")
print(f"Average Recall: {np.mean(recall_scores):.4f} (±{np.std(recall_scores):.4f})")
print(f"Average F1 Score: {np.mean(f1_scores):.4f} (±{np.std(f1_scores):.4f})")
print(f"Average Train Time (s): {np.mean(train_times):.4f}")
print(f"Average Test Time (s): {np.mean(test_times):.4f}")

Dataset shape: (401119, 18)
Columns: ['FC1_Read_Input_Register', 'FC2_Read_Discrete_Value', 'FC3_Read_Holding_Register', 'FC4_Read_Coil', 'current_temperature', 'door_state', 'fridge_temperature', 'humidity', 'latitude', 'light_status', 'longitude', 'motion_status', 'pressure', 'sphone_signal', 'temp_condition', 'temperature', 'thermostat_status', 'label']
X head:
    FC1_Read_Input_Register  FC2_Read_Discrete_Value  \
0                 0.495216                 0.499092   
1                 0.495216                 0.499092   
2                 0.495216                 0.499092   
3                 0.495216                 0.499092   
4                 0.495216                 0.499092   

   FC3_Read_Holding_Register  FC4_Read_Coil  current_temperature  door_state  \
0                   0.488897       0.499405             0.344399           0   
1                   0.488897       0.499405             0.344399           0   
2                   0.488897       0.499405             0.344