In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
data = pd.read_csv('crime__5.csv', usecols=['date_time', 'latitude', 'longitude', 'city', 'state', 'crime_name2'])

data = data.sample(n=10000, random_state=42)

# Preprocess date and location inputs
data['date_time'] = pd.to_datetime(data['date_time'])
data['year'] = data['date_time'].dt.year
data['month'] = data['date_time'].dt.month
data['day'] = data['date_time'].dt.day
data['hour'] = data['date_time'].dt.hour
data['minute'] = data['date_time'].dt.minute
data['second'] = data['date_time'].dt.second

# Perform one-hot encoding for categorical variables
cat_cols = ['city', 'state']
enc = OneHotEncoder(drop='first', sparse=False)
encoded_cols = pd.DataFrame(enc.fit_transform(data[cat_cols]))
encoded_cols.columns = enc.get_feature_names(cat_cols)
data.drop(cat_cols, axis=1, inplace=True)
data = pd.concat([data, encoded_cols], axis=1)

# Remove rows with NaN values
data.dropna(subset=['crime_name2'], inplace=True)

# Split the data into training and testing sets
X = data.drop('crime_name2', axis=1)
y = data['crime_name2']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Apply preprocessing to the numeric features
numeric_features = ['year', 'month', 'day', 'hour', 'minute', 'second', 'latitude', 'longitude']
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features)])

# Append classifier to preprocessing pipeline for KNN
knn_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', KNeighborsClassifier())])

# Fit the KNN model
knn_model.fit(X_train, y_train)

# Evaluate the KNN model on the test set
knn_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print('KNN Classification Report:')
print(classification_report(y_test, knn_pred))
print('KNN Accuracy:', knn_accuracy)

# Export the KNN model
pickle.dump(knn_model, open('knn_model_crime_name2.pkl', 'wb'))

# Append classifier to preprocessing pipeline for Random Forest
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier())])

# Fit the Random Forest model
rf_model.fit(X_train, y_train)

# Evaluate the Random Forest model on the test set
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print('Random Forest Classification Report:')
print(classification_report(y_test, rf_pred))
print('Random Forest Accuracy:', rf_accuracy)

# Export the Random Forest model
pickle.dump(rf_model, open('random_forest_model_crime_name2.pkl','wb'))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))


KNN Classification Report:
                                             precision    recall  f1-score   support

                         Aggravated Assault       0.05      0.12      0.07        25
                         All Other Offenses       0.25      0.59      0.35       509
                          All other Larceny       0.06      0.05      0.05        98
                                      Arson       0.00      0.00      0.00         3
             Burglary/Breaking and Entering       0.02      0.01      0.02        67
                     Counterfeiting/Forgery       0.00      0.00      0.00        21
 Credit Card/Automatic Teller Machine Fraud       0.00      0.00      0.00        37
   Destruction/Damage/Vandalism of Property       0.06      0.05      0.06       121
                         Disorderly Conduct       0.05      0.04      0.04        28
                Driving Under the Influence       0.14      0.07      0.09        46
                  Drug Equipment Viol

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Classification Report:
                                             precision    recall  f1-score   support

                         Aggravated Assault       0.11      0.04      0.06        25
                         All Other Offenses       0.29      0.80      0.42       509
                          All other Larceny       0.07      0.02      0.03        98
                                      Arson       0.00      0.00      0.00         3
                                 Bad Checks       0.00      0.00      0.00         0
             Burglary/Breaking and Entering       0.12      0.03      0.05        67
                     Counterfeiting/Forgery       0.00      0.00      0.00        21
 Credit Card/Automatic Teller Machine Fraud       0.00      0.00      0.00        37
   Destruction/Damage/Vandalism of Property       0.03      0.02      0.02       121
                         Disorderly Conduct       0.12      0.04      0.06        28
                Driving Und

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
