In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-dallas-public-safety/filtered_data_3.csv


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder




In [3]:
data = pd.read_csv('/kaggle/input/preprocessed-dallas-public-safety/filtered_data_3.csv')

In [4]:
data = data.dropna()

In [5]:
unique_zip_codes = data['Zip Code'].unique()

In [6]:
# Define a function to round the time
def round_time(time_str):
    hours, minutes = map(int, time_str.split(':'))
    if minutes > 30:
        if hours == 23:
            return 0
        else:
            return hours + 1
    else:
        return hours

# Apply the round_time function to create the "Time Integer" column
data['Time Integer'] = data['Time1 of Occurrence'].apply(round_time)

In [7]:
# Perform label encoding for categorical columns (Day of the Week and Time Bin)
label_encoder = LabelEncoder()
data['Day1 of the Week'] = label_encoder.fit_transform(data['Day1 of the Week'])
data['Time Integer'] = label_encoder.fit_transform(data['Time Integer'])# Perform one-hot encoding for the Zip Code column
data = pd.get_dummies(data, columns=['Zip Code'], prefix='Zip')

In [8]:
data['Time Integer']

0          20
1          12
2          12
3           2
4          17
           ..
1048570    19
1048571    18
1048572     7
1048573    10
1048574     6
Name: Time Integer, Length: 1044139, dtype: int64

In [9]:
# Define the features and target variable
features = ['Day1 of the Week', 'Time Integer'] + [col for col in data.columns if col.startswith('Zip_')]
target = 'Safety Score'

# Define the ranges and corresponding labels for 'Incident_Score'
score_ranges = [(0, 10), (11, 20), (21, 30), (31, 40), (41, 50), (51, 60), (61, 70), (71, 80), (81, 90), (91, 100)]
score_labels = ['Very Safe', 'Safe', 'Fairly Safe', 'Somewhat Safe', 'Neutral', 'Somewhat Unsafe', 'Fairly Unsafe', 'Unsafe', 'Very Unsafe', 'Extremely Unsafe']


In [10]:
# Create a function to assign the label based on the score
def assign_safety_score(score):
    for i, (start, end) in enumerate(score_ranges):
        if start <= score < end:
            return score_labels[i]
    return None  # Handle values outside the defined ranges


In [11]:
# Apply the function to create the 'Safety Score' column
data['Safety Score'] = data['Incident_Score'].apply(assign_safety_score)

In [12]:
# Drop rows where 'Safety Score' is None (outside defined ranges)
data.dropna(subset=['Safety Score'], inplace=True)

In [13]:
# Split the dataset into training and testing sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
X_train

Unnamed: 0,Day1 of the Week,Time Integer,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
823362,6,13,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
278354,5,8,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
448970,3,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
252167,5,18,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
141042,3,20,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682809,5,12,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1031324,3,15,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
323019,0,12,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
355304,0,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Create and train a Random Forest Classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

import pickle
filename = 'rfc_model.pkl'
pickle.dump(model, open(filename, 'wb'))

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.33287970249002913


In [16]:
# from sklearn.ensemble import GradientBoostingClassifier
# # Initialize the Gradient Boosting Classifier
# gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)  # You can adjust the hyperparameters as needed
# gb_classifier.fit(X_train, y_train)

# import pickle
# filename = 'rfc_model.pkl'
# pickle.dump(gb_classifier, open(filename, 'wb'))

# # Make predictions on the test set
# y_pred = gb_classifier.predict(X_test)

# # Evaluate the model using accuracy score
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")

In [17]:
# Assuming you have a new data point in the form of a dictionary
new_data = {
    'Day1 of the Week': 'Tue',   # Example value
    'Time Integer': 'Afternoon',        # Example value
}


# Convert the new data into a DataFrame with the same structure as your training data
new_data_df = pd.DataFrame([new_data])

# Perform label encoding for categorical columns (Day of the Week and Time Bin)
new_data_df['Day1 of the Week'] = 1
new_data_df['Time Integer'] = 20

# Perform one-hot encoding for the Zip Code column
# new_data_df = pd.get_dummies(new_data_df, columns=['Zip Code'], prefix='Zip')



In [18]:
new_data_df

Unnamed: 0,Day1 of the Week,Time Integer
0,1,20


In [19]:
# Initialize a dictionary to hold the one-hot encoded values
encoded_values = {}
unique_zip_codes.sort()
# Create one-hot encoded columns for all unique Zip Codes seen in training data
for col in unique_zip_codes:
    encoded_values['Zip_' + str(col)] = [True if col == '75231.0' else False]


In [20]:
print(encoded_values)

{'Zip_0.0': [False], 'Zip_11576.0': [False], 'Zip_12400.0': [False], 'Zip_16066.0': [False], 'Zip_30305.0': [False], 'Zip_33455.0': [False], 'Zip_33896.0': [False], 'Zip_40517.0': [False], 'Zip_48232.0': [False], 'Zip_60612.0': [False], 'Zip_63145.0': [False], 'Zip_66210.0': [False], 'Zip_72220.0': [False], 'Zip_73237.0': [False], 'Zip_74204.0': [False], 'Zip_74243.0': [False], 'Zip_74701.0': [False], 'Zip_75001.0': [False], 'Zip_75006.0': [False], 'Zip_75007.0': [False], 'Zip_75014.0': [False], 'Zip_75016.0': [False], 'Zip_75019.0': [False], 'Zip_75021.0': [False], 'Zip_75023.0': [False], 'Zip_75024.0': [False], 'Zip_75025.0': [False], 'Zip_75026.0': [False], 'Zip_75032.0': [False], 'Zip_75033.0': [False], 'Zip_75034.0': [False], 'Zip_75035.0': [False], 'Zip_75039.0': [False], 'Zip_75040.0': [False], 'Zip_75041.0': [False], 'Zip_75042.0': [False], 'Zip_75043.0': [False], 'Zip_75044.0': [False], 'Zip_75048.0': [False], 'Zip_75050.0': [False], 'Zip_75051.0': [False], 'Zip_75052.0': [Fal

In [21]:
encoded_df = pd.DataFrame(encoded_values)
new_data_df = pd.concat([new_data_df, encoded_df], axis=1)

In [22]:
encoded_df

Unnamed: 0,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,Zip_48232.0,Zip_60612.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
new_data_df

Unnamed: 0,Day1 of the Week,Time Integer,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
0,1,20,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [24]:
# Use your trained model to predict the 'Safety Score' for the new data
predicted_safety_score = model.predict(new_data_df)

# Print the predicted 'Safety Score'
print(f"Predicted Safety Score: {predicted_safety_score[0]}")

Predicted Safety Score: Fairly Unsafe
