In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import sqlite3

# Connect to the database
conn = sqlite3.connect('chicago-crime-property.db')

# Read data from the database into a DataFrame
query = 'SELECT * FROM property_with_crime'
property_with_crime_df = pd.read_sql(query, conn)

# Close the database connection
conn.close()

# Assuming X and y are your feature matrix and target variable
X = property_with_crime_df[['beds', 'sqft', 'year_built', 'days_on_mls', 'latitude', 'longitude', 'sold_price', 'community', 'neighborhood']]
y = property_with_crime_df['crime_count']

# Separate numerical and categorical features
numerical_features = ['beds', 'sqft', 'year_built', 'days_on_mls', 'latitude', 'longitude', 'sold_price']
categorical_features = ['community', 'neighborhood']

# Create transformers for numerical and categorical features
numerical_transformer = 'passthrough'
categorical_transformer = OneHotEncoder(drop='first')

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing to the features
X_transformed = preprocessor.fit_transform(X)

# Calculate mutual information between features and target for a classifier
mutual_info = mutual_info_classif(X_transformed, y)

# Create a DataFrame to store feature names and their mutual information scores
feature_info_df = pd.DataFrame({'Feature': preprocessor.get_feature_names_out(), 'Mutual_Information': mutual_info})

# Sort features by mutual information score
feature_info_df = feature_info_df.sort_values(by='Mutual_Information', ascending=False)

# Print the ranked features
print(feature_info_df)


In [2]:
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import numpy as np

# Connect to the database
conn = sqlite3.connect('chicago-crime-property.db')

# Read data from the database into a DataFrame
query = 'SELECT * FROM property_with_crime'
property_with_crime_df = pd.read_sql(query, conn)

# Close the database connection
conn.close()

# Replace -1 with NaN in the 'year_built' column
property_with_crime_df['year_built'].replace(-1, pd.NA, inplace=True)

# Convert the entire DataFrame to numeric
property_with_crime_df = property_with_crime_df.apply(pd.to_numeric, errors='ignore')

# Assuming X and y are your feature matrix and target variable
X = property_with_crime_df[['longitude', 'latitude', 'sold_price', 'year_built']]
y = property_with_crime_df['crime_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Impute missing values in the 'year_built' column
imputer = SimpleImputer(strategy='mean')
X_train_scaled[:, 3:4] = imputer.fit_transform(X_train_scaled[:, 3:4])
X_test_scaled[:, 3:4] = imputer.transform(X_test_scaled[:, 3:4])

# Initialize Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the model
nb_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_nb = nb_classifier.predict(X_test_scaled)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred_nb))

# Evaluate the performance on the training set
y_pred_train = nb_classifier.predict(X_train_scaled)
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {accuracy_train}')

# Print classification report for training set
print('Training Classification Report:')
print(classification_report(y_train, y_pred_train))

# Evaluate the performance on the test set
accuracy_test = accuracy_score(y_test, y_pred_nb)
print(f'Test Accuracy: {accuracy_test}')

# Print classification report for test set
print('Test Classification Report:')
print(classification_report(y_test, y_pred_nb))

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Assuming y_test and y_pred_final are defined
mape = mean_absolute_percentage_error(y_test, y_pred_nb)
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')


import joblib

# Save the Gaussian Naive Bayes model
joblib.dump(nb_classifier, 'nb_model.sav')

print("Model has been saved as 'nb_model.sav'")


Accuracy: 0.8915831791206688
Classification Report:
              precision    recall  f1-score   support

         262       0.98      0.94      0.96       334
         357       0.98      1.00      0.99        49
         494       0.99      0.99      0.99       479
         519       0.94      0.83      0.88       466
         678       0.86      0.93      0.89        58
         707       0.73      0.55      0.63       242
         764       0.95      0.91      0.93       208
         921       0.87      0.89      0.88       300
         966       0.81      0.98      0.89        88
         989       0.94      0.97      0.96       230
         995       0.98      0.88      0.93       500
        1015       0.90      0.97      0.94        98
        1022       0.96      0.85      0.90       170
        1031       0.72      0.98      0.83       273
        1109       0.74      0.96      0.84       222
        1133       0.73      0.90      0.80       576
        1151       0.77      

In [1]:
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
# Connect to the database
conn = sqlite3.connect('chicago-crime-property.db')

# Read data from the database into a DataFrame
query = 'SELECT * FROM property_with_crime'
property_with_crime_df = pd.read_sql(query, conn)

# Close the database connection
conn.close()

# Replace -1 with NaN in the 'year_built' column
property_with_crime_df['year_built'].replace(-1, pd.NA, inplace=True)

# One-hot encode the 'neighborhood' column
X = pd.get_dummies(property_with_crime_df[['longitude', 'latitude', 'sold_price', 'year_built', 'neighborhood']])
y = property_with_crime_df['crime_count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Impute missing values in the 'year_built' column
imputer = SimpleImputer(strategy='mean')
X_train_scaled[:, 3:4] = imputer.fit_transform(X_train_scaled[:, 3:4])
X_test_scaled[:, 3:4] = imputer.transform(X_test_scaled[:, 3:4])

# Initialize Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the model
nb_classifier.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_nb = nb_classifier.predict(X_test_scaled)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy}')

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred_nb))

# Evaluate the performance on the training set
y_pred_train = nb_classifier.predict(X_train_scaled)
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {accuracy_train}')

# Print classification report for training set
print('Training Classification Report:')
print(classification_report(y_train, y_pred_train))

# Evaluate the performance on the test set
accuracy_test = accuracy_score(y_test, y_pred_nb)
print(f'Test Accuracy: {accuracy_test}')

# Print classification report for test set
print('Test Classification Report:')
print(classification_report(y_test, y_pred_nb))

final_mse = mean_squared_error(y_test, y_pred_nb)
print(f'Final Mean Squared Error: {final_mse}')

r2 = r2_score(y_test, y_pred_nb)
print(f'R-squared Score: {r2}')

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_nb))
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')


def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Assuming y_test and y_pred_final are defined
mape = mean_absolute_percentage_error(y_test, y_pred_nb)
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')

import joblib

# Save the Gaussian Naive Bayes model
joblib.dump(nb_classifier, 'nb_modelneb.sav')

print("Model has been saved as 'nb_modelneb.sav'")


Accuracy: 0.9953736200625359
Classification Report:
              precision    recall  f1-score   support

         262       1.00      0.99      1.00       334
         357       1.00      1.00      1.00        49
         494       1.00      1.00      1.00       479
         519       1.00      0.99      1.00       466
         678       1.00      0.98      0.99        58
         707       1.00      0.99      0.99       242
         764       1.00      1.00      1.00       208
         921       1.00      0.99      0.99       300
         966       1.00      0.99      0.99        88
         989       1.00      0.99      0.99       230
         995       1.00      0.99      1.00       500
        1015       1.00      0.99      0.99        98
        1022       1.00      0.99      0.99       170
        1031       1.00      0.99      0.99       273
        1109       1.00      0.98      0.99       222
        1133       1.00      0.99      1.00       576
        1151       1.00      

In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate the average value of y_train
y_train_avg = np.mean(y_train)

# Create a prediction set with the average values
y_pred_baseline = np.full_like(y_train, y_train_avg)

# Calculate metrics for the baseline model
mse_baseline = mean_squared_error(y_train, y_pred_baseline)
rmse_baseline = np.sqrt(mse_baseline)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Baseline Model Metrics:")
print(f"Mean Squared Error (MSE): {mse_baseline}")
print(f"Root Mean Squared Error (RMSE): {rmse_baseline}")
print(f"Mean Absolute Error (MAE): {mae_baseline}")


Baseline Model Metrics:
Mean Squared Error (MSE): 10914079.244880052
Root Mean Squared Error (RMSE): 3303.646355904344
Mean Absolute Error (MAE): 2779.0160408399306
