Q: Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis



df = pd.read_csv("Boston.csv")

df["crim"] = pd.factorize(df["crim"] > df["crim"].median())[0]

df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [32]:
#Logistical Regresion

cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)

X = df.drop(columns=['crim'])
y = df['crim']

logreg_model = LogisticRegression(fit_intercept=True, max_iter=4000)

scores = []
for train_idx, test_idx in cv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    logreg_model.fit(X_train, y_train)
    score = logreg_model.score(X_test, y_test)
    scores.append(score)

mean_score = np.mean(scores)

print(mean_score)


0.8414666666666666


In [31]:
#LDA

cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=2)

lda_model = LinearDiscriminantAnalysis()

scores = []
for train_idx, test_idx in cv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    lda_model.fit(X_train, y_train)
    score = lda_model.score(X_test, y_test)
    scores.append(score)

mean_score = np.mean(scores)

print(mean_score)


0.8477960784313727


In [44]:
#KNN

cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=4)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

param_grid = {"n_neighbors": np.arange(1, 21, 2)}

knn_model = KNeighborsRegressor()

grid_search = GridSearchCV(knn_model, param_grid, cv=cv)
grid_search.fit(X_scaled, y)

best_score = grid_search.best_score_

print(best_score)

0.7337144053676491


#A: The best model is the LDA, although with a very tight margin with the logistic regression, in turn the worst is the KNN, very probably due to its simplicity, this could possibly improve if we filtered from the best predictor to the worst