In [7]:
from datetime import datetime, timedelta
import geopandas as gpd
import json
import pandas as pd
import mapclassify
import matplotlib.pyplot as plt
import numpy as np
import os
import requests
from sklearn.decomposition import PCA
from io import StringIO
import warnings

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", None)
np.set_printoptions(threshold=np.inf)

## Load Data

In [8]:
# Parameters
parent_dir = os.path.abspath('..')  # get the absolute path of the parent directory

In [9]:
file_path = os.path.join(parent_dir, 'Data', 'rodents_per_year_merged.csv')  # construct the file path
roadents_df = pd.read_csv(file_path)
print(len(roadents_df))
roadents_df.head()

38958


Unnamed: 0,spatial_id,year,l_Commercial_sum,l_Other_sum,l_Outdoor_sum,l_Residential_sum,l_Residential-Mixed_sum,l_Vacant_Space_sum,d_Friday_sum,d_Monday_sum,d_Saturday_sum,d_Sunday_sum,d_Thursday_sum,d_Tuesday_sum,d_Wednesday_sum,t_Evening_sum,t_Midday_sum,t_Morning_sum,num_sightings,s_Dead_Animal:Residential_sum,s_Dead_Animal:Street_sum,s_Dog_waste:Street_sum,s_Illegal_Dumping:Street_sum,s_Trash:Residential_sum,s_Trash:Street_sum,s_Trash_MissedService:Street_sum,s_Trash_Overflowing:Street_sum,s_Trash_Time:Street_sum,s_Trash_Unsecure:Residential_sum,s_Trash_Unsecure:Street_sum,num_dsny_complaints
0,360050001000,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,360050001000,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,360050001000,2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,360050001000,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,360050001000,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Run PCA

In [10]:
# Load your high-dimensional data into a pandas DataFrame
df = roadents_df

# Center and scale the data
X = df.values
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Create a PCA object and fit it to the data
pca = PCA()
pca.fit(X)

# Access the principal components and their explained variance
pcs = pca.components_
variance = pca.explained_variance_ratio_

# Print the explained variance for each principal component
for i, var in enumerate(variance):
    print("PC", i+1, "explains", round(var*100, 2), "% of the variance")

# Determine the most important columns
importance_scores = np.abs(pcs)
column_importance = pd.DataFrame(importance_scores.T, index=df.columns)
column_importance['Importance'] = column_importance.sum(axis=1)
column_importance = column_importance.sort_values(by='Importance', ascending=False)
print(column_importance)

PC 1 explains 30.38 % of the variance
PC 2 explains 9.96 % of the variance
PC 3 explains 4.61 % of the variance
PC 4 explains 3.66 % of the variance
PC 5 explains 3.4 % of the variance
PC 6 explains 3.2 % of the variance
PC 7 explains 3.11 % of the variance
PC 8 explains 3.06 % of the variance
PC 9 explains 2.96 % of the variance
PC 10 explains 2.85 % of the variance
PC 11 explains 2.78 % of the variance
PC 12 explains 2.67 % of the variance
PC 13 explains 2.63 % of the variance
PC 14 explains 2.55 % of the variance
PC 15 explains 2.3 % of the variance
PC 16 explains 2.18 % of the variance
PC 17 explains 2.0 % of the variance
PC 18 explains 1.94 % of the variance
PC 19 explains 1.81 % of the variance
PC 20 explains 1.75 % of the variance
PC 21 explains 1.67 % of the variance
PC 22 explains 1.64 % of the variance
PC 23 explains 1.5 % of the variance
PC 24 explains 1.47 % of the variance
PC 25 explains 1.4 % of the variance
PC 26 explains 1.37 % of the variance
PC 27 explains 1.15 % of t

In [19]:
# Load your high-dimensional data into a pandas DataFrame
df = roadents_df.filter(regex='^(?!t_|d_).*')
df = df.drop(['year','num_sightings','num_dsny_complaints'], axis=1)
print(df.dtypes)

# Center and scale the data
X = df.values
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Create a PCA object and fit it to the data
pca = PCA()
pca.fit(X)

# Access the principal components and their explained variance
pcs = pca.components_
variance = pca.explained_variance_ratio_

# Print the explained variance for each principal component
for i, var in enumerate(variance):
    print("PC", i+1, "explains", round(var*100, 2), "% of the variance")

# Determine the most important columns
importance_scores = np.abs(pcs)
column_importance = pd.DataFrame(importance_scores.T, index=df.columns)

# Print the top 10 columns and their importance for the first 5 components
for i in range(5):
    print("Top 10 columns for PC", i+1)
    importance = column_importance[i].nlargest(10)
    print(importance)
    print()


spatial_id                            int64
l_Commercial_sum                    float64
l_Other_sum                         float64
l_Outdoor_sum                       float64
l_Residential_sum                   float64
l_Residential-Mixed_sum             float64
l_Vacant_Space_sum                  float64
s_Dead_Animal:Residential_sum       float64
s_Dead_Animal:Street_sum            float64
s_Dog_waste:Street_sum              float64
s_Illegal_Dumping:Street_sum        float64
s_Trash:Residential_sum             float64
s_Trash:Street_sum                  float64
s_Trash_MissedService:Street_sum    float64
s_Trash_Overflowing:Street_sum      float64
s_Trash_Time:Street_sum             float64
s_Trash_Unsecure:Residential_sum    float64
s_Trash_Unsecure:Street_sum         float64
dtype: object
PC 1 explains 15.83 % of the variance
PC 2 explains 10.53 % of the variance
PC 3 explains 6.61 % of the variance
PC 4 explains 6.06 % of the variance
PC 5 explains 5.71 % of the variance
PC 6 ex