### Energy Data EDA

In [None]:
# Import Libraries

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

The data set is from https://www.kaggle.com/datasets/sohommajumder21/appliances-energy-prediction-data-set and was originally published at https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction

In [None]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv('energydata.csv')

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Data type pandas inferred for each column when reading the CSV.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Re-read the CSV so this cell is idempotent: it always starts from the raw
# 'date' strings, no matter how often or in which order cells were run.
df = pd.read_csv('energydata.csv')

# Parse the timestamps. errors='coerce' turns values that do not match the
# format into NaT instead of raising, so count them explicitly below.
raw_dates = df['date']
df['date'] = pd.to_datetime(raw_dates, format="%d-%m-%Y %H:%M", errors='coerce')

# Values that were present in the file but could not be parsed. Without this
# check they would silently become NaT and be skipped by later date statistics.
n_unparsed = int(df['date'].isna().sum() - raw_dates.isna().sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} 'date' value(s) did not match '%d-%m-%Y %H:%M' and were set to NaT")

In [None]:
# First five rows again, now with 'date' converted to datetime.
df.head(n=5)

In [None]:
# Last five rows of the frame.
df.tail(n=5)

In [None]:
# Column names, non-null counts, dtypes and memory usage in one summary.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

In [None]:
# Time span covered by the data: latest timestamp minus earliest timestamp.
timestamps = df['date']
timestamps.max() - timestamps.min()

In [None]:
# Pairwise relationship grid over the frame's columns.
# diag_kind=None skips the univariate (histogram / KDE) plots on the diagonal.
# NOTE(review): the grid has one panel per pair of plotted columns, so this cell
# can be slow on wide frames - consider plotting a row sample if it becomes a bottleneck.
pair_grid = sns.pairplot(data=df, diag_kind=None)
plt.show()

In [None]:
# Check tendencies and distributions with one density plot per numeric feature.

# A density estimate is only meaningful for numeric measurements, so leave out
# non-numeric columns such as the datetime 'date' column.
density_columns = df.select_dtypes(include="number").columns

# Grid layout: num_cols plots per row, as many rows as needed.
num_features = len(density_columns)
num_cols = 4
num_rows = -(-num_features // num_cols)  # ceiling division

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 3))

# Flatten the 2-D axes grid so it can be walked in step with the column list.
axes = axes.flatten()

for ax, column in zip(axes, density_columns):
    sns.kdeplot(df[column], ax=ax, fill=True)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)

# Remove the unused axes at the end of the grid
# (present when the feature count is not a multiple of num_cols).
for ax in axes[num_features:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Visualize the spread and outliers of every feature with one boxplot per column.

# The timestamp is not a measurement, so exclude it.
# df_clean is reused by later cells.
df_clean = df.drop(columns=['date'])

# Grid layout sized for the columns that are actually plotted (df_clean, not df).
cols = 4
num_features = len(df_clean.columns)
rows = math.ceil(num_features / cols)

plt.figure(figsize=(15, 5 * rows))  # height scales with the number of rows

for i, column in enumerate(df_clean.columns):
    plt.subplot(rows, cols, i + 1)  # subplot positions are 1-based
    sns.boxplot(y=df_clean[column])
    plt.title(column)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

In [None]:
# Look for correlations among the features (date column already removed in df_clean).

# Pairwise correlation of every column with every other column.
corr_matrix = df_clean.corr()

# The matrix is symmetric, so hide the upper triangle (including the diagonal)
# to show every pair only once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title('Correlation Heatmap with Mask')

plt.show()


In [None]:
# Check separability: project the standardized predictors onto the first two
# principal components and colour every sample by the target value.

# Predictors only: 'date' is a timestamp and 'Appliances' is the target.
df_numeric = df.drop(columns=["date", "Appliances"])

# PCA is driven by variance, so put every feature on a common scale first;
# otherwise columns with large raw ranges dominate the components.
scaled_features = StandardScaler().fit_transform(df_numeric)

# random_state keeps the projection reproducible if a randomized solver is used.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_features)

# Keep df's index so the target is attached to the matching rows.
pca_df = pd.DataFrame(pca_result, columns=["PCA1", "PCA2"], index=df.index)
pca_df["Target"] = df["Appliances"]

# Plot PCA scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x="PCA1", y="PCA2", hue="Target", data=pca_df, alpha=0.5, palette="viridis")
plt.title("PCA Projection (2D)")
plt.show()

In [None]:
# Non-linear 2-D embedding with t-SNE, computed on the 2-D PCA output (pca_result)
# from the PCA cell. random_state fixes the embedding across runs.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(pca_result)

# Keep df's index so the target is attached to the matching rows.
tsne_df = pd.DataFrame(tsne_result, columns=["TSNE1", "TSNE2"], index=df.index)
tsne_df["Target"] = df["Appliances"]

# Plot t-SNE scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x="TSNE1", y="TSNE2", hue="Target", data=tsne_df, alpha=0.5, palette="viridis")
plt.title("t-SNE Projection (2D)")
plt.show()

In [None]:
# Plot every feature against the row index to eyeball trends and outliers over the record order.

# Remove 'date' column (non-numeric) from the dataset
numeric_df = df.drop(columns=['date'])

# Size the grid from the columns plotted here instead of relying on layout
# variables left behind by an earlier cell.
n_cols = 4
n_rows = math.ceil(len(numeric_df.columns) / n_cols)

plt.figure(figsize=(15, 5 * n_rows))

for position, column in enumerate(numeric_df.columns, start=1):
    plt.subplot(n_rows, n_cols, position)  # subplot positions are 1-based
    sns.scatterplot(x=numeric_df.index, y=numeric_df[column])
    plt.title(column)

# Adjust layout for better visibility
plt.tight_layout()
plt.show()

In [None]:
# Count how often each distinct value of the target column occurs.
target_distribution = df_clean.loc[:, "Appliances"].value_counts()
print(target_distribution)

In [None]:
# Histogram (50 bins) of the target with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["Appliances"], bins=50, kde=True, color="royalblue", ax=ax)
ax.set_title("Distribution of Appliances Energy Consumption")
ax.set_xlabel("Energy Consumption (Wh)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Identify highly correlated features (candidates for removal).
# NOTE: this cell only visualizes the correlations; no column is dropped here.

# Correlation is only defined for numeric data, so restrict the frame to numeric
# columns first. df still holds the datetime 'date' column, which cannot be
# converted to float by corr().
corr_matrix = df.select_dtypes(include="number").corr()

# Set the figure size for better readability
plt.figure(figsize=(12, 8))

# The matrix is symmetric: hide the upper triangle (including the diagonal)
# so each pair appears only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# vmin/vmax pin the colour scale to the full correlation range [-1, 1].
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', fmt='.2f',
            linewidths=0.5, mask=mask, cbar_kws={'shrink': 0.8},
            annot_kws={'size': 10, 'weight': 'bold'}, vmin=-1, vmax=1)

# Add a title for the heatmap
plt.title('Correlation Heatmap of All Features', fontsize=16, weight='bold')

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Print one "<column>: <dtype>" line per column.
for name, dtype in df.dtypes.items():
    print(f"{name}: {dtype}")