In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
data = pd.read_csv('crime__5.csv')

# Display the top 5 rows of the dataset
print(data.head())

# Remove null values
data = data.dropna()

# Preprocess date and location inputs
data['date_time'] = pd.to_datetime(data['date_time'])
data['date_time_year'] = pd.to_datetime(data['date_time']).dt.year
data['date_time_month'] = pd.to_datetime(data['date_time']).dt.month
data['date_time_day'] = pd.to_datetime(data['date_time']).dt.day
data['date_time_hour'] = pd.to_datetime(data['date_time']).dt.hour
data['date_time_minute'] = pd.to_datetime(data['date_time']).dt.minute
data['date_time_second'] = pd.to_datetime(data['date_time']).dt.second

# Select relevant columns for modeling
features = ['date_time_year', 'date_time_month', 'date_time_day', 'date_time_hour', 'date_time_minute',
            'date_time_second', 'latitude', 'longitude', 'city', 'state']
target = ['crime_name1', 'crime_name2', 'crime_name3']
data = data[features + target]

# Perform one-hot encoding for categorical variables
cat_cols = ['city', 'state']
enc = OneHotEncoder(drop='first', sparse=False)
encoded_cols = pd.DataFrame(enc.fit_transform(data[cat_cols]))
encoded_cols.columns = enc.get_feature_names(cat_cols)
data = pd.concat([data.drop(cat_cols, axis=1), encoded_cols], axis=1)

# Display the updated dataset
print(data.head())

# Split the data into training and testing sets
X = data.drop(target, axis=1)
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Apply preprocessing to the numeric features
numeric_features = ['date_time_year', 'date_time_month', 'date_time_day', 'date_time_hour', 'date_time_minute',
                    'date_time_second', 'latitude', 'longitude']
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features)])

# Append classifier to preprocessing pipeline
knn_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', KNeighborsClassifier())])

# Fit the KNN model
knn_model.fit(X_train, y_train)

# Evaluate the KNN model on the test set
knn_pred = knn_model.predict(X_test)
from sklearn.metrics import precision_recall_fscore_support

# Evaluate the KNN model on the test set for each target variable
for i, target_var in enumerate(target):
    print(f'Classification Report for {target_var}:')
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test[target_var], knn_pred[:, i], average='weighted')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {fscore:.4f}')
    print()

   Unnamed: 0  incident_id            date_time nibrs_code  victims  \
0           0    201219928  2018-12-21 21:13:13        120        1   
1           1    201301798  2020-09-13 00:26:58        90Z        1   
2           4    201294935  2020-07-16 17:29:50        23C        1   
3           5    201302822  1900-01-01 00:00:00        11A        1   
4           6    201230089  2019-03-06 16:23:00        40A        1   

              crime_name1         crime_name2  \
0  Crime Against Property             Robbery   
1   Crime Against Society  All Other Offenses   
2  Crime Against Property         Shoplifting   
3    Crime Against Person       Forcible Rape   
4   Crime Against Society        Prostitution   

                                  crime_name3 police_district_name  \
0                        ROBBERY - STREET-GUN              WHEATON   
1  OBSTRUCT GOVT - VIOLATION OF A COURT ORDER           GERMANTOWN   
2                       LARCENY - SHOPLIFTING           GERMANTOWN  



   date_time_year  date_time_month  date_time_day  date_time_hour  \
0            2018               12             21              21   
1            2020                9             13               0   
2            2020                7             16              17   
3            1900                1              1               0   
4            2019                3              6              16   

   date_time_minute  date_time_second   latitude  longitude  \
0                13                13  39.036270 -77.049900   
1                26                58  39.277840 -77.211500   
2                29                50  39.198295 -77.244900   
3                 0                 0  39.168194 -77.175049   
4                23                 0  39.103443 -77.155941   

              crime_name1         crime_name2  ... city_WASHINGTON  \
0  Crime Against Property             Robbery  ...             0.0   
1   Crime Against Society  All Other Offenses  ...             0.0

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Classification Report for crime_name1:
Precision: 0.4071
Recall: 0.4571
F1-Score: 0.4173

Classification Report for crime_name2:


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.1350
Recall: 0.1875
F1-Score: 0.1397

Classification Report for crime_name3:
Precision: 0.0729
Recall: 0.0644
F1-Score: 0.0605



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
data = pd.read_csv('crime__5.csv', usecols=['date_time', 'latitude', 'longitude', 'city', 'state', 'crime_name2'])

data.info()

# Load only 100,000 rows
data = data.sample(n=300000, random_state=42)

# Preprocess date inputs
data['date_time'] = pd.to_datetime(data['date_time'])
data['year'] = data['date_time'].dt.year
data['month'] = data['date_time'].dt.month
data['day'] = data['date_time'].dt.day

# Perform one-hot encoding for categorical variables
cat_cols = ['city', 'state']
enc = OneHotEncoder(drop='first', sparse=False)
encoded_cols = pd.DataFrame(enc.fit_transform(data[cat_cols]))
encoded_cols.columns = enc.get_feature_names(cat_cols)
data.drop(cat_cols, axis=1, inplace=True)
data = pd.concat([data, encoded_cols], axis=1)

# Remove rows with NaN values
data.dropna(subset=['crime_name2'], inplace=True)

# Split the data into training and testing sets
X = data[['year', 'month', 'day','latitude','longitude'] + list(encoded_cols.columns)]
y = data['crime_name2']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Apply preprocessing to the numeric features
numeric_features = ['year', 'month', 'day']
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features)])

# Append classifier to preprocessing pipeline for KNN
knn_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', KNeighborsClassifier())])

# Fit the KNN model
knn_model.fit(X_train, y_train)

# Evaluate the KNN model on the test set
knn_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print('KNN Classification Report:')
print(classification_report(y_test, knn_pred))
print('KNN Accuracy:', knn_accuracy)

# Export the KNN model
pickle.dump(knn_model, open('knn_model_crime_name2.pkl', 'wb'))

# Append classifier to preprocessing pipeline for Random Forest
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

# Fit the Random Forest model
rf_model.fit(X_train, y_train)

# Evaluate the Random Forest model on the test set
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print('Random Forest Classification Report:')
print(classification_report(y_test, rf_pred))
print('Random Forest Accuracy:', rf_accuracy)

# Export the Random Forest model
pickle.dump(rf_model, open('random_forest_model_crime_name2.pkl', 'wb'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303743 entries, 0 to 303742
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date_time    303743 non-null  object 
 1   crime_name2  303743 non-null  object 
 2   city         303743 non-null  object 
 3   state        303743 non-null  object 
 4   latitude     303743 non-null  float64
 5   longitude    303743 non-null  float64
dtypes: float64(2), object(4)
memory usage: 13.9+ MB


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                             precision    recall  f1-score   support

                         Aggravated Assault       0.02      0.04      0.02       893
                         All Other Offenses       0.25      0.67      0.36     14886
                          All other Larceny       0.05      0.04      0.05      2958
                                      Arson       0.00      0.00      0.00        76
        Assisting or Promoting Prostitution       0.00      0.00      0.00         3
                                 Bad Checks       0.00      0.00      0.00        72
             Burglary/Breaking and Entering       0.03      0.02      0.02      1825
                     Counterfeiting/Forgery       0.02      0.01      0.01       633
 Credit Card/Automatic Teller Machine Fraud       0.02      0.01      0.01      1063
       Curfew/Loitering/Vagrancy Violations       0.00      0.00      0.00         6
   Destruction/Damage/Vandalism of Property       0.08      0.05

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                             precision    recall  f1-score   support

                         Aggravated Assault       0.00      0.00      0.00       893
                         All Other Offenses       0.25      0.98      0.40     14886
                          All other Larceny       0.00      0.00      0.00      2958
                                      Arson       0.00      0.00      0.00        76
        Assisting or Promoting Prostitution       0.00      0.00      0.00         3
                                 Bad Checks       0.00      0.00      0.00        72
             Burglary/Breaking and Entering       0.13      0.00      0.01      1825
                     Counterfeiting/Forgery       0.00      0.00      0.00       633
 Credit Card/Automatic Teller Machine Fraud       0.00      0.00      0.00      1063
       Curfew/Loitering/Vagrancy Violations       0.00      0.00      0.00         6
   Destruction/Damage/Vandalism of Property       0.14      0.01

In [15]:
with open('one_hot_encoder.pkl', 'wb') as file:
    pickle.dump(enc, file)

In [6]:
import pandas as pd
import pickle

# Load the saved KNN model
rf_model = pickle.load(open('knn_model.pkl', 'rb'))

# Load the saved OneHotEncoder
enc = pickle.load(open('one_hot_encoder.pkl', 'rb'))

# Create a sample input from the user
sample_input = pd.DataFrame({
    'date_time': ['2022-05-14 12:30:00'],
    'city': ['ROCKVILLE'],
    'state': ['MD']
})

# Preprocess the sample input
sample_input['date_time'] = pd.to_datetime(sample_input['date_time'])
sample_input['year'] = sample_input['date_time'].dt.year
sample_input['month'] = sample_input['date_time'].dt.month
sample_input['day'] = sample_input['date_time'].dt.day

# Perform one-hot encoding for categorical variables
cat_cols = ['city', 'state']
encoded_cols = pd.DataFrame(enc.transform(sample_input[cat_cols]))
encoded_cols.columns = enc.get_feature_names(cat_cols)

# Handle unknown categories dynamically
missing_categories = set(sample_input[cat_cols].values.flatten()) - set(enc.categories_[0])
if missing_categories:
    for category in missing_categories:
        encoded_cols[category] = 0

sample_input.drop(cat_cols, axis=1, inplace=True)
sample_input = pd.concat([sample_input, encoded_cols], axis=1)

# Make predictions using the KNN model
rf_prediction = rf_model.predict(sample_input)
print('Random Forest Predictions:', rf_prediction)

Random Forest Predictions: ['All Other Offenses']


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
