In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-dallas-public-safety/filtered_data_3.csv


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the preprocessed CSV from the Kaggle input dataset into a DataFrame.
DATA_PATH = '/kaggle/input/preprocessed-dallas-public-safety/filtered_data_3.csv'
data = pd.read_csv(DATA_PATH)

In [4]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [5]:
# Distinct values of the 'Zip Code' column, captured before that column is one-hot encoded.
zip_code_column = data['Zip Code']
unique_zip_codes = zip_code_column.unique()

In [6]:
# Label-encode the categorical columns (day of the week and time bin).
# Each column gets its own encoder so that both fitted mappings remain available
# for encoding new data at prediction time; refitting a single shared encoder
# would discard the day-of-week mapping.
day_encoder = LabelEncoder()
time_bin_encoder = LabelEncoder()
data['Day1 of the Week'] = day_encoder.fit_transform(data['Day1 of the Week'])
data['Time Bin'] = time_bin_encoder.fit_transform(data['Time Bin'])

# Backward compatibility: this name previously ended up holding the encoder
# that was last fitted on 'Time Bin'.
label_encoder = time_bin_encoder

# One-hot encode the zip code column into indicator columns named 'Zip_<code>'.
data = pd.get_dummies(data, columns=['Zip Code'], prefix='Zip')

In [7]:
# Display the frame after label encoding and one-hot encoding of the zip code column.
data

Unnamed: 0,Day1 of the Week,Time1 of Occurrence,Time Bin,Division,Sector,Zip-Time-Percentage,Zipcode-Day-Percentage,Zipcode-Percentage,Incident_Score,Type of Incident,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
0,5,20:00,3,NORTHEAST,210.0,33.66,14.13,3.29,35.0,THEFT OF PROP (AUTO ACC) <$100 - (NOT EMP),...,False,False,False,False,False,False,False,False,False,False
1,0,12:00,0,NORTH CENTRAL,640.0,30.19,15.33,2.20,25.0,FALSE STATEMENT FOR PROPERTY/CREDIT $2500 < $30K,...,False,False,False,False,False,False,False,False,False,False
2,1,12:00,0,CENTRAL,150.0,29.00,14.17,3.37,65.0,CREDIT CARD OR DEBIT CARD ABUSE,...,False,False,False,False,False,False,False,False,False,False
3,4,1:38,1,NORTHWEST,520.0,17.14,14.23,0.88,10.0,ABANDONED PROPERTY (NO OFFENSE),...,False,False,False,False,False,False,False,False,False,False
4,1,17:30,0,SOUTHWEST,410.0,27.10,14.38,2.28,65.0,UNAUTHORIZED USE OF MOTOR VEH - AUTOMOBILE,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,3,19:20,3,SOUTHWEST,410.0,32.95,14.70,4.16,65.0,"THEFT OF PROP > OR EQUAL $750 BUT <$2,500 -SHO...",...,False,False,False,False,False,False,False,False,False,False
1048571,1,18:00,3,SOUTH CENTRAL,740.0,31.51,14.73,2.53,30.0,THEFT OF PROP <$100 - OTHER THAN SHOPLIFT,...,False,False,False,False,False,False,False,False,False,False
1048572,5,7:00,2,NORTH CENTRAL,610.0,19.22,13.90,2.34,40.0,RECKLESS DAMAGE,...,False,False,False,False,False,False,False,False,False,False
1048573,3,10:00,2,NORTHWEST,540.0,22.66,13.64,2.01,20.0,CRIMINAL TRESPASS WARNING,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Model inputs: the two label-encoded columns followed by every one-hot zip column.
zip_feature_columns = [name for name in data.columns if name.startswith('Zip_')]
features = ['Day1 of the Week', 'Time Bin', *zip_feature_columns]

# Column the classifier is trained to predict.
target = 'Safety Score'

# 'Incident_Score' bins, built from consecutive edges as (lower, upper) pairs,
# and the label for each bin in matching order.
score_edges = [0, 10, 30, 60, 80, 100]  # Wider score ranges
score_ranges = list(zip(score_edges[:-1], score_edges[1:]))
score_labels = ['Very Safe', 'Safe', 'Moderately Safe', 'Moderately Unsafe', 'Very Unsafe']


In [9]:
# Map a numeric incident score to its safety label.
def assign_safety_score(score, ranges=None, labels=None):
    """Return the safety label whose range contains ``score``.

    Each range is half-open, ``start <= score < end``, except the final one,
    which also includes its upper bound so that the maximum score (e.g. 100)
    is labelled instead of being treated as out of range.

    Parameters
    ----------
    score : number
        The incident score to classify.
    ranges : sequence of (start, end) pairs, optional
        Bins to test, in order. Defaults to the module-level ``score_ranges``.
    labels : sequence of str, optional
        Label for each bin, same order as ``ranges``. Defaults to the
        module-level ``score_labels``.

    Returns
    -------
    str or None
        The matching label, or ``None`` when the score lies outside every
        range (NaN compares false against every bound, so it also yields None).
    """
    if ranges is None:
        ranges = score_ranges
    if labels is None:
        labels = score_labels

    last_index = len(ranges) - 1
    for i, (start, end) in enumerate(ranges):
        # Only the last bin is closed on the right; interior edges belong to the next bin.
        if start <= score < end or (i == last_index and score == end):
            return labels[i]
    return None  # Handle values outside the defined ranges


In [10]:
# Label every row by passing its 'Incident_Score' through assign_safety_score.
incident_scores = data['Incident_Score']
data['Safety Score'] = incident_scores.apply(assign_safety_score)

In [11]:
# Drop rows where 'Safety Score' is None (score outside the defined ranges).
# Rebinding the result instead of using inplace=True keeps the step explicit
# and avoids mutating a frame that earlier cells have already displayed.
data = data.dropna(subset=['Safety Score'])

In [12]:
# Separate inputs from the label, then hold out 20% of the rows for testing.
# The fixed random_state makes the split reproducible.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Display the training feature matrix produced by the split.
X_train

Unnamed: 0,Day1 of the Week,Time Bin,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
464142,3,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
976742,1,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
165314,6,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
940902,2,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
823347,5,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259976,0,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
366993,3,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
132283,6,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
673835,5,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Build a random forest of 100 trees with a fixed seed, then fit it on the training split.
rf_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [15]:
import pickle

# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if serialization raises.
filename = 'rfc_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
# Predict a 'Safety Score' label for every row of the held-out test features.
y_pred = model.predict(X_test)

In [17]:
# Compare the predicted labels with the true test labels and report the accuracy score.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.43483632463079663


In [18]:
# Example input for a single prediction, given as a dictionary.
new_data = {'Day1 of the Week': 'Tue', 'Time Bin': 'Afternoon'}

# Build a one-row frame, then overwrite both categorical values with integer codes.
# NOTE(review): both codes are hard-coded to 1 rather than derived from the
# 'Tue' / 'Afternoon' values above — confirm they match the training encoding.
new_data_df = pd.DataFrame([new_data]).assign(
    **{'Day1 of the Week': 1, 'Time Bin': 1}
)

# Perform one-hot encoding for the Zip Code column
# new_data_df = pd.get_dummies(new_data_df, columns=['Zip Code'], prefix='Zip')



In [19]:
# Display the single-row frame holding the encoded day and time-bin values.
new_data_df

Unnamed: 0,Day1 of the Week,Time Bin
0,1,1


In [20]:
# Zip code of the location being scored, written the way it appears in the
# 'Zip_<code>' column names.
selected_zip = '75231.0'

# Sort in place so the generated columns follow ascending zip-code order.
unique_zip_codes.sort()

# One single-element column per zip code seen in the training data, True only
# for the selected zip. The comparison is made on the string form because the
# zip codes are numeric; comparing a number with the string '75231.0' is always
# False, which would leave every column False.
encoded_values = {
    'Zip_' + str(zip_code): [str(zip_code) == selected_zip]
    for zip_code in unique_zip_codes
}


In [21]:
# Print the mapping of 'Zip_<code>' column names to their single-element value lists.
print(encoded_values)

{'Zip_0.0': [False], 'Zip_11576.0': [False], 'Zip_12400.0': [False], 'Zip_16066.0': [False], 'Zip_30305.0': [False], 'Zip_33455.0': [False], 'Zip_33896.0': [False], 'Zip_40517.0': [False], 'Zip_48232.0': [False], 'Zip_60612.0': [False], 'Zip_63145.0': [False], 'Zip_66210.0': [False], 'Zip_72220.0': [False], 'Zip_73237.0': [False], 'Zip_74204.0': [False], 'Zip_74243.0': [False], 'Zip_74701.0': [False], 'Zip_75001.0': [False], 'Zip_75006.0': [False], 'Zip_75007.0': [False], 'Zip_75014.0': [False], 'Zip_75016.0': [False], 'Zip_75019.0': [False], 'Zip_75021.0': [False], 'Zip_75023.0': [False], 'Zip_75024.0': [False], 'Zip_75025.0': [False], 'Zip_75026.0': [False], 'Zip_75032.0': [False], 'Zip_75033.0': [False], 'Zip_75034.0': [False], 'Zip_75035.0': [False], 'Zip_75039.0': [False], 'Zip_75040.0': [False], 'Zip_75041.0': [False], 'Zip_75042.0': [False], 'Zip_75043.0': [False], 'Zip_75044.0': [False], 'Zip_75048.0': [False], 'Zip_75050.0': [False], 'Zip_75051.0': [False], 'Zip_75052.0': [Fal

In [22]:
# Turn the one-hot dictionary into a single-row frame and append it to the
# encoded day / time-bin columns.
encoded_df = pd.DataFrame(encoded_values)

# Select only the two base columns before concatenating so the cell is safe to
# re-run; otherwise each run would append a duplicate set of 'Zip_' columns.
base_columns = ['Day1 of the Week', 'Time Bin']
new_data_df = pd.concat([new_data_df[base_columns], encoded_df], axis=1)

In [23]:
# Display the one-hot zip columns built for the new data point.
encoded_df

Unnamed: 0,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,Zip_48232.0,Zip_60612.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [24]:
# Display the new data point after the one-hot zip columns were appended.
new_data_df

Unnamed: 0,Day1 of the Week,Time Bin,Zip_0.0,Zip_11576.0,Zip_12400.0,Zip_16066.0,Zip_30305.0,Zip_33455.0,Zip_33896.0,Zip_40517.0,...,Zip_78681.0,Zip_79745.0,Zip_80237.0,Zip_89148.0,Zip_90033.0,Zip_91601.0,Zip_91803.0,Zip_95207.0,Zip_97224.0,Zip_98004.0
0,1,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
# Run the trained model on the new data frame to predict its 'Safety Score'.
predicted_safety_score = model.predict(new_data_df)

# Print the first (index 0) predicted label.
print(f"Predicted Safety Score: {predicted_safety_score[0]}")

Predicted Safety Score: Moderately Unsafe
