In [16]:
# Install kagglehub if not already installed
!pip install kagglehub -q

import kagglehub
import os
import pandas as pd

# Download the dataset
try:
    path = kagglehub.dataset_download("rabieelkharoua/predict-smart-home-device-efficiency-dataset")
    print(f"Dataset successfully downloaded to: {path}")
    
    # List the files in the downloaded directory
    files = os.listdir(path)
    print(f"Files in the dataset: {files}")
    
    # Read the CSV file(s)
    # Assuming there's a CSV file in the dataset - adjust the filename based on what's in 'files'
    if files:  # Check if there are any files
        # Look for CSV files
        csv_files = [f for f in files if f.endswith('.csv')]
        
        if csv_files:
            # Read the first CSV file found
            file_path = os.path.join(path, csv_files[0])
            data = pd.read_csv(file_path)
            print(f"Successfully read {csv_files[0]}")
            print(data.head())  # Display the first few rows
        else:
            print("No CSV files found in the dataset.")
    else:
        print("No files found in the downloaded dataset.")
        
except Exception as e:
    print(f"Error: {e}")

Dataset successfully downloaded to: C:\Users\Acer\.cache\kagglehub\datasets\rabieelkharoua\predict-smart-home-device-efficiency-dataset\versions\1
Files in the dataset: ['smart_home_device_usage_data.csv']
Successfully read smart_home_device_usage_data.csv
   UserID       DeviceType  UsageHoursPerDay  EnergyConsumption  \
0       1    Smart Speaker         15.307188           1.961607   
1       2           Camera         19.973343           8.610689   
2       3  Security System         18.911535           2.651777   
3       4           Camera          7.011127           2.341653   
4       5           Camera         22.610684           4.859069   

   UserPreferences  MalfunctionIncidents  DeviceAgeMonths  SmartHomeEfficiency  
0                1                     4               36                    1  
1                1                     0               29                    1  
2                1                     0               20                    1  
3               

In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [18]:
# 1. Problem Definition
"""
Problem: Predict smart home device efficiency and optimize energy consumption
Objectives:
- Analyze patterns in smart home device usage
- Identify factors affecting energy consumption
- Build a model to predict device efficiency
- Provide recommendations for energy optimization
"""
print("1. Problem Definition: Predict smart home device efficiency and optimize energy consumption")

# 2. Data Collection and Understanding
print("\n2. Data Collection and Understanding")

1. Problem Definition: Predict smart home device efficiency and optimize energy consumption

2. Data Collection and Understanding


In [21]:
# Fill numerical columns with mean
# Need to define numerical columns first
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    if data[col].isnull().sum() > 0:
        data[col].fillna(data[col].mean(), inplace=True)
            
# Fill categorical columns with mode
cat_cols = data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if data[col].isnull().sum() > 0:
        data[col].fillna(data[col].mode()[0], inplace=True)
            
print("Missing values handled.")
        
# 2. Identify features and target
# Assuming the target is 'efficiency' or similar - adjust as needed based on actual column names
possible_targets = ['efficiency', 'device_efficiency', 'energy_efficiency']
target_col = None
        
for col in possible_targets:
    if col in data.columns:
        target_col = col
        break
        
if target_col is None:
    print("Target column not found. Please specify the target column.")
    # Display columns to help identify the target
    print("Available columns:", data.columns.tolist())
else:
    X = data.drop(target_col, axis=1)
    y = data[target_col]
            
    # 3. Identify categorical and numerical features
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
            
    print(f"Target column: {target_col}")
    print(f"Categorical features: {categorical_cols}")
    print(f"Numerical features: {numerical_cols}")
            
    # 4. Create preprocessing pipeline
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
            
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

Missing values handled.
Target column not found. Please specify the target column.
Available columns: ['UserID', 'DeviceType', 'UsageHoursPerDay', 'EnergyConsumption', 'UserPreferences', 'MalfunctionIncidents', 'DeviceAgeMonths', 'SmartHomeEfficiency']


In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump

np.random.seed(42)
X = pd.DataFrame({
    'numeric_feature1': np.random.normal(0, 1, 100),
    'numeric_feature2': np.random.normal(0, 1, 100),
    'categorical_feature': np.random.choice(['A', 'B', 'C'], 100)
})
y = np.random.randint(0, 2, 100)  # Binary target for example

# Define preprocessor
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 5. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nData split into training set ({X_train.shape[0]} samples) and test set ({X_test.shape[0]} samples)")

# 6. Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

# 7. Save preprocessor for later use
dump(preprocessor, 'preprocessor.joblib')
print("Preprocessor saved as 'preprocessor.joblib'")


Data split into training set (80 samples) and test set (20 samples)
Processed training data shape: (80, 5)
Processed test data shape: (20, 5)
Preprocessor saved as 'preprocessor.joblib'
