## Carbon Footprint Assessment:
Develop a metric for evaluating building carbon footprints, accounting for energy and water consumption, size, and other factors.

In [173]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import json

In [174]:
# Read the column-name -> dtype mapping that accompanies the cleaned dataset.
with open('./data/schema_cleaned2.json') as schema_file:
    schema = json.load(schema_file)

# The dtype strings from the JSON file are handed to pandas unchanged, so they
# are assumed to already be valid pandas dtypes. If any are not, adjust the
# affected entries of pandas_schema by hand.
pandas_schema = dict(schema)
# Every column named in the schema, in schema order.
feature_list = list(schema)

## Data Preparation
Not all of our data is numerical, so the categorical columns will need one-hot encoding, which we will do with the pandas library.
The cell below loads the cleaned data and separates the categorical columns from the numerical ones.

In [175]:
# Load the cleaned dataset, forcing each column to the dtype recorded in the schema.
df_cleaned = pd.read_csv('./data/data_cleaned2.csv', dtype=pandas_schema)

# Partition the schema's columns by declared dtype:
#   'object'  -> categorical columns (the ones that need one-hot encoding)
#   'float64' -> numerical columns
# Columns declared with any other dtype end up in neither list.
categorical_columns = [name for name, dtype in schema.items() if dtype == 'object']
numerical_columns = [name for name, dtype in schema.items() if dtype == 'float64']


## Feature Selection & Scaling
Choose the relevant features for clustering the different types of buildings, then standardize them so that no single feature dominates.

In [176]:
# Use every numerical (float64) column listed in the schema as a feature.
selected_features = df_cleaned.loc[:, numerical_columns]

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler().fit(selected_features)
scaled_data = scaler.transform(selected_features)

In [177]:
# Perform dimensionality reduction using PCA.
# PCA is driven by variance, so it is fitted on the standardized features
# (scaled_data) rather than on the raw columns; otherwise the columns with the
# largest numeric ranges would dominate the principal components.
num_components = 5  # Adjust the number of components as needed
pca = PCA(n_components=num_components)
reduced_features = pca.fit_transform(scaled_data)

In [None]:
# Fit the PCA model on the standardized features and project the data onto the
# principal components. The raw features are not used here because PCA is
# variance-driven and unscaled columns would dominate the result.
components = pca.fit_transform(scaled_data)

# Fraction of the total variance explained by each retained component
explained_variance = pca.explained_variance_ratio_

# Plot the explained variance to decide on the number of components to keep
fig, ax = plt.subplots()
ax.bar(range(1, len(explained_variance) + 1), explained_variance)
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
# Examine which original features contribute most to one principal component.
# component_index is 0-based. It currently points at the LAST retained
# component; set it to 0 to inspect the first (highest-variance) component.
component_index = num_components - 1

# Absolute loadings: the sign only gives direction, the magnitude gives the
# strength of each feature's contribution to the component.
feature_contributions = abs(pca.components_[component_index])
sorted_features = sorted(
    zip(selected_features.columns, feature_contributions),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the features for the chosen component, strongest contribution first
print(f"Most important features for Principal Component {component_index + 1}:")
for feature, contribution in sorted_features:
    print(f"{feature}: {contribution:.2f}")

## K-Means Clustering
After dimensionality reduction, we proceed with K-Means clustering on the reduced feature set. The goal is to cluster data points in this lower-dimensional space into K clusters.

In [None]:
# Choose the number of clusters (K)
K = 5  # Replace with the desired number of clusters

# Create a K-Means model; random_state is fixed so reruns give the same clusters
kmeans_model = KMeans(n_clusters=K, random_state=42)

# Cluster in the PCA-reduced space rather than on the raw feature columns.
# K-Means is distance-based, so raw columns with large numeric ranges would
# otherwise dominate the cluster assignments.
# reduced_features keeps the row order of selected_features, so the labels
# below line up with the original rows.
cluster_assignments = kmeans_model.fit_predict(reduced_features)

# Visualize the clusters on two of the original features
fig, ax = plt.subplots()
ax.scatter(
    selected_features['year_built'],
    selected_features['net_emissions_metric_tons'],
    c=cluster_assignments,
    cmap='rainbow',
)
ax.set_xlabel('Year Built')
ax.set_ylabel('Net Emissions (metric tons)')
ax.set_title('K-Means Clustering')
plt.show()
