In [1]:
"""
  Sofía Almeida Bruno
  http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime "
"""

import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
import time
#plt.style.use('seaborn')

#-------------------------------------------------------------------------------

# Ajuste de parámetros
def adjust_params(X, y, model, params):
    print("------ Grid Search...")
    grid = GridSearchCV(model, params, cv=5, n_jobs=2, verbose=1, scoring='neg_mean_squared_error')
    grid.fit(X, y)
    print("Mejores parámetros:")
    print(grid.best_params_)
    print("Error CV")
    print(-grid.best_score_)
    return grid.best_estimator_
#------------------------------------------------------------------

#-------------------------------------------------------------------------------

# Leemos los nombres de los atributos
attributes = np.genfromtxt('./datos/communities.names', dtype = "|U50", skip_header = 75, max_rows = 128, delimiter = " ")[:,1]

# Leemos el conjunto de datos
df = pd.read_csv('./datos/communities.data', sep=",", na_values ='?', names = attributes)
df.name = 'Communities and Crimes'

print("Tamaño: ", df.shape)

# Dividimos en training y test
X =  df.drop(labels = ['ViolentCrimesPerPop'], axis = 1)
y = df['ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123456)

print("Tamaño training: ", X_train.shape, y_train.shape)
print("Tamaño test: ", X_test.shape, y_test.shape)

Tamaño:  (1994, 128)
Tamaño training:  (1495, 127) (1495,)
Tamaño test:  (499, 127) (499,)


In [2]:
# Preprocesado----------------------------------------------------------

# Eliminamos las variables no predictivas
to_drop = np.array(['state', 'county', 'community', 'communityname', 'fold'])
X_train = X_train.drop(labels = to_drop, axis=1)
print("Tamaño train tras drop (5): ", X_train.shape)

mv_sum = X_train.isnull().sum()
mv = mv_sum * 100 / len(X_train)

# Eliminamos variables con más de un 30% de valores perdidos
for column in X_train:
    if mv[column] > 30.0:
        X_train.drop(labels=[column], axis=1, inplace = True)
        to_drop = np.append(to_drop, column)
print("Tamaño train tras drop (variables con mv)", X_train.shape)

# Imputamos los valores perdidos de aquellas variables que tengan menos de un 30% de valores perdidos
imputer = KNNImputer()

X = imputer.fit_transform(X_train)
# Eliminamos variables con varianza muy baja
var = VarianceThreshold(threshold=(.95 * (1 - .95)))


# Añadimos complejidad al modelo
poly = PolynomialFeatures(2)
X = poly.fit_transform(X)
print("Tras poli", X.shape)
X = var.fit_transform(X)
print("Tras var", X.shape)
# Reducimos mediante regularización lasso
#lasso = Lasso(max_iter = 100000, alpha = 0.01)
lasso = LassoCV(n_jobs = -1, max_iter = 50000, verbose = True, cv = 4)

preprocessing = Pipeline(steps=[
    ('imputer', imputer),
#    ('scale',StandardScaler()),
    ('poly', poly),
 #   ('Variance', var),
    ('lasso', SelectFromModel(lasso))])

X_train = preprocessing.fit_transform(X_train, y_train)
print(X_train.shape)

Tamaño train tras drop (5):  (1495, 122)
Tamaño train tras drop (variables con mv) (1495, 100)
Tras poli (1495, 5151)
Tras var (1495, 150)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 10.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 10.7min finished


(1495, 88)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [3]:
lreg =  SGDRegressor(tol=1e-4)
lreg.fit(X_train, y_train)
params_lreg =  {'alpha':[1/(10.0**i) for i in range(1,5)], 'learning_rate':['constant', 'optimal','invscaling', 'adaptive'], 'max_iter':[5000,10000,15000]}


best_lreg = adjust_params(X_train, y_train, lreg, params_lreg)

------ Grid Search...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.4min


Mejores parámetros:
{'alpha': 0.0001, 'learning_rate': 'adaptive', 'max_iter': 5000}
0.014417125201410069


[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:  4.5min finished


In [4]:
# Preprocesamos el conjunto de test 
X_test = X_test.drop(labels = to_drop, axis=1)
print(X_test.shape)
X_test = preprocessing.transform(X_test)

(499, 100)


In [5]:
X_test.shape

(499, 88)

In [6]:
# Utilizando el mejor modelo predecimos el valor de X_test
y_pred = best_lreg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Error cuadrático medio en test: {:.4f}".format(mse))

Error cuadrático medio en test: 0.0234
