In [None]:
import pandas as pd
# Location of the CSV on data.world; the file is fetched over the network
# every time this cell runs.
DATA_URL = 'https://query.data.world/s/tmlfix7gctaoqc2jfwxh4a3zdjj7zf?dws=00000'

# Read the remote CSV into the DataFrame used by the rest of the notebook.
data = pd.read_csv(DATA_URL)

In [None]:
# Show the column labels of the loaded frame so the names referenced in the
# following cells can be checked against what the CSV actually contains.
print(data.columns)


Index(['timestamp', 'Main I', 'AC', 'Living Rm', 'Lights living / bath / baby',
       'Bath plgs', 'Holiday', '(unnamed II - A)', 'Fridge',
       'Kitchn / dining  Lts', 'Furnace', 'Microwave', 'Guest', 'test',
       'Guest bath', 'Mini Fridge/Dshwshr', 'Other', 'use', 'mains', 'solar'],
      dtype='object')


In [None]:
# Columns that are added together to form the per-row total. Compared with the
# full column list of the CSV, 'timestamp', 'Holiday' and '(unnamed II - A)'
# are left out.
# NOTE(review): the list mixes what look like aggregate readings ('use',
# 'mains', 'solar', 'Main I') with individual circuits, so the total may count
# the same energy more than once -- confirm against the dataset documentation.
required_columns = [
    'Main I', 'AC', 'Living Rm', 'Lights living / bath / baby', 'Bath plgs',
    'Fridge', 'Kitchn / dining  Lts', 'Furnace', 'Microwave', 'Guest', 'test',
    'Guest bath', 'Mini Fridge/Dshwshr', 'Other', 'use', 'mains', 'solar',
]

# Row-wise total across the selected columns. The vectorized DataFrame.sum
# replaces apply(sum, axis=1), which ran Python's builtin sum once per row.
# skipna=False keeps the previous result for incomplete rows: a row with any
# missing reading gets NaN (the modelling cell drops rows whose total is NaN).
data['total_consumption'] = data[required_columns].sum(axis=1, skipna=False)

import pandas as pd
import plotly.express as px

# Line chart of the per-row total against the 'timestamp' column. The plotly
# update_* methods return the figure, so the axis titles are set in one chain.
fig = (
    px.line(
        data,
        x='timestamp',
        y='total_consumption',
        title='Total Consumption over Time',
    )
    .update_xaxes(title='Timestamp')
    .update_yaxes(title='Total Consumption')
)

# Render the figure in the notebook.
fig.show()



In [None]:
import plotly.express as px
import pandas as pd

# Parse the 'timestamp' column into datetimes so the .dt accessor can be used,
# and store the parsed values back on the frame.
parsed_timestamps = pd.to_datetime(data['timestamp'])
data['timestamp'] = parsed_timestamps

# Hour-of-day component of every timestamp, kept as its own column.
data['timestamp_hour'] = parsed_timestamps.dt.hour

# Sum of total consumption over all rows that share the same hour of day.
hourly_consumption = data['total_consumption'].groupby(data['timestamp_hour']).sum()

# Line chart of the hourly sums; reset_index turns the hour index into a
# regular column that plotly can address by name.
hourly_frame = hourly_consumption.reset_index()
fig = (
    px.line(hourly_frame, x='timestamp_hour', y='total_consumption')
    .update_xaxes(title='Hour')
    .update_yaxes(title='Total Consumption')
)
fig.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Drop rows whose total consumption is missing before modelling.
data.dropna(subset=['total_consumption'], inplace=True)

# Continuous regression target: a copy of the 'mains' column.
data['mains_continuous'] = data['mains']

# Single feature (the unscaled total) and the target.
# NOTE(review): 'mains' is one of the columns summed into 'total_consumption',
# so the feature already contains the target -- confirm this is intended.
X = data['total_consumption']
y = data['mains_continuous']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature. The scaler is fit on the training rows only and then
# applied to the test rows, so no statistics of the held-out data leak into
# preprocessing. reshape(-1, 1) gives the (n_samples, 1) layout the estimators expect.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.values.reshape(-1, 1))
X_test = scaler.transform(X_test.values.reshape(-1, 1))

# Keep the scaled feature available as a column on the frame, using the
# train-fitted scaler for every row.
data['total_consumption_scaled'] = scaler.transform(
    data['total_consumption'].values.reshape(-1, 1)
).ravel()

# Hyper-parameter grid explored by the search.
param_grid = {
    'C': [10, 20, 50, 100, 200],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
    'kernel': ['linear', 'rbf']
}

# Base SVR estimator; the grid search works on clones of it.
svr = SVR()

# Cross-validated grid search scored by negative mean squared error.
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, scoring='neg_mean_squared_error')

# Fit the grid search to the training data.
grid_search.fit(X_train, y_train)

# Print the best parameters found by the search.
print("Best parameters:", grid_search.best_params_)

# Final model: the estimator GridSearchCV already refit on the full training
# set with the best parameters (refit=True is the default). This honours the
# tuned kernel as well as C and epsilon -- previously the kernel was hard-coded
# to 'linear', which would silently discard an 'rbf' winner -- and avoids
# training the same model a second time.
svr = grid_search.best_estimator_

# Make predictions on the test set.
y_pred = svr.predict(X_test)



Best parameters: {'C': 50, 'epsilon': 0.1, 'kernel': 'linear'}


In [None]:
from sklearn.metrics import accuracy_score

# Cut-off applied to the mains values to derive a binary high/low label.
# NOTE(review): 2000 is hard-coded; its unit and rationale are not visible
# here -- confirm against the dataset.
threshold = 2000

# Binary labels (1 = above the threshold) for predictions and ground truth.
# These are reused by the classification-metric cells that follow.
y_pred_binary = (y_pred > threshold).astype(int)
y_test_binary = (y_test > threshold).astype(int)

# Regression quality is measured on the continuous predictions. Computing MSE
# and R-squared on the thresholded 0/1 labels (as was done before) only
# restates the misclassification rate and says nothing about how close the
# predicted consumption is to the actual consumption.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Accuracy is a classification metric, so it is computed on the binary labels.
accuracy = accuracy_score(y_test_binary, y_pred_binary)

print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)
print("Accuracy:", accuracy)

Mean Squared Error (MSE): 0.038461538461538464
R-squared: 0.8461538461538461
Accuracy: 0.9615384615384616


In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Precision, recall and F1 for the positive class of the thresholded labels.
# The function returns a 4-tuple; the trailing support entry is not needed.
prf_scores = precision_recall_fscore_support(y_test_binary, y_pred_binary, average='binary')
precision, recall, f1 = prf_scores[:3]

# Report each metric on its own line.
for label, value in (("Precision:", precision), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, value)


Precision: 0.9285714285714286
Recall: 1.0
F1 Score: 0.962962962962963


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the thresholded ground truth against the thresholded predictions.
cm = confusion_matrix(y_test_binary, y_pred_binary)

# Heading and matrix on separate lines, emitted with a single print call.
print("Confusion Matrix:", cm, sep="\n")


In [None]:

# Text report of the per-class metrics for the thresholded labels.
classification_rep = classification_report(y_test_binary, y_pred_binary)

# Blank line, heading, then the report body -- emitted with a single print call.
print("\nClassification Report:", classification_rep, sep="\n")

In [None]:
import plotly.express as px
import numpy as np

# Prediction error for every test row: actual minus predicted.
residuals = y_test - y_pred

# Collect actual values, predictions and residuals side by side.
df_heatmap = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Residuals': residuals})

# Pairwise correlations between the three columns, plus a plain array view
# of the same numbers for plotting and annotation.
correlation_matrix = df_heatmap.corr()
corr_values = correlation_matrix.to_numpy()

# Heatmap of the correlation matrix, with the column/row labels on the axes.
fig = px.imshow(
    corr_values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    color_continuous_scale='YlOrRd',
    labels=dict(color='Correlation'),
    title='Correlation Heatmap: Actual vs. Predicted Mains Consumption',
)

# Write each coefficient into its cell (row-major order). The text is white
# when the coefficient's magnitude exceeds 0.5 and black otherwise.
for (row_pos, col_pos), coeff in np.ndenumerate(corr_values):
    fig.add_annotation(
        x=correlation_matrix.columns[col_pos],
        y=correlation_matrix.index[row_pos],
        text=f"{coeff:.2f}",
        showarrow=False,
        font=dict(color='white' if abs(coeff) > 0.5 else 'black'),
    )

# Show the plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Actual and predicted values side by side; the frame takes its index from y_test.
df_visualization = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Bar chart comparing actual and predicted values per test row.
# barmode='group' places the two bars next to each other. The Plotly Express
# default ('relative') stacks them, which shows their sum rather than letting
# the reader compare one against the other.
fig = px.bar(
    df_visualization,
    x=df_visualization.index,
    y=['Actual', 'Predicted'],
    barmode='group',
    labels={'value': 'Mains Consumption'},
    title='Actual vs Predicted Mains Consumption',
)

# Show the plot
fig.show()
