In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print them one per line in traversal order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Read the input CSV into a DataFrame
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/preprocessed-dallas-public-safety/filtered_data_3.csv'
)

In [None]:
# Discard every row that has a missing value in any column
data = data.dropna(axis=0, how='any')


In [None]:
# Preview the first rows; head must be *called*, otherwise the cell only
# shows the bound-method repr instead of the data.
data.head()

In [None]:
# Columns fed to the clustering model: categorical columns first,
# followed by the numeric ones.
features_for_clustering = [
    'Zip Code',
    'Time Bin',
    'Day1 of the Week',
    'Zipcode-Percentage',
    'Zipcode-Day-Percentage',
    'Zip-Time-Percentage',
    'Incident_Score',
]


In [None]:
# Numeric columns (handled separately from the categorical ones)
numeric_features = [
    'Zipcode-Percentage',
    'Zipcode-Day-Percentage',
    'Zip-Time-Percentage',
    'Incident_Score',
]

In [None]:
# Categorical columns (handled separately from the numeric ones)
categorical_features = [
    'Zip Code',
    'Time Bin',
    'Day1 of the Week',
]


In [None]:
# One transformer per column kind: one-hot encoding for categorical
# columns, standard scaling for numeric columns.
categorical_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

In [None]:
# Pair each transformer with the column group it should be applied to,
# then wrap the pairs in a single ColumnTransformer.
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Select the clustering columns, then fit the preprocessor on them and
# transform them in a single step.
clustering_frame = data[features_for_clustering]
data_encoded_normalized = preprocessor.fit_transform(clustering_frame)


In [None]:

# Gaussian Mixture Model with a fixed seed; each mixture component acts as one cluster
n_components = 5  # number of components (clusters); adjust as needed
gmm = GaussianMixture(random_state=42, n_components=n_components)

In [None]:
# Fit the GMM model and get cluster assignments
# ColumnTransformer only returns a sparse matrix when the stacked output's
# density is below its sparse_threshold; otherwise the result is already a
# dense ndarray, which has no .toarray(). Densify only when needed.
gmm_input = data_encoded_normalized
if hasattr(gmm_input, 'toarray'):
    gmm_input = gmm_input.toarray()
data['Cluster'] = gmm.fit_predict(gmm_input)

In [None]:
# Create a function to sample Safety Scores from the cluster's distribution
def sample_safety_score(cluster_mean, cluster_cov, rng=None):
    """Draw a Safety Score from a normal distribution describing a cluster.

    Parameters
    ----------
    cluster_mean : float
        Centre of the normal distribution.
    cluster_cov : float
        Variance of the distribution; its square root is used as the
        standard deviation. A NaN variance (e.g. the pandas sample variance
        of a single-row cluster) is treated as zero spread, so the mean is
        returned instead of NaN.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source exposing ``.normal``. When omitted, NumPy's global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    float
        One sample from N(cluster_mean, cluster_cov).
    """
    # Replace an undefined variance with 0 so the draw collapses to the mean.
    variance = np.where(np.isnan(cluster_cov), 0.0, cluster_cov)
    sampler = np.random if rng is None else rng
    return sampler.normal(cluster_mean, np.sqrt(variance))

In [None]:
# Assign each row a Safety Score sampled from a normal distribution fitted to
# the Incident_Score values of the row's cluster.
# A seeded generator makes the sampled scores identical on every run.
rng = np.random.default_rng(42)
# Per-cluster mean and sample standard deviation of Incident_Score.
cluster_stats = data.groupby('Cluster')['Incident_Score'].agg(['mean', 'std'])
row_means = data['Cluster'].map(cluster_stats['mean'])
# A single-row cluster has an undefined (NaN) sample std; use zero spread so
# that row receives its cluster mean instead of NaN.
row_stds = data['Cluster'].map(cluster_stats['std']).fillna(0.0)
# One vectorized draw covering all rows replaces a Python call per row.
data['Predicted Safety Score'] = rng.normal(row_means.to_numpy(), row_stds.to_numpy())


In [None]:
# Write the DataFrame to the working directory as CSV, leaving out the
# index column.
output_path = '/kaggle/working/dataset_with_safety_scores.csv'
data.to_csv(output_path, index=False)