# Experimento

In [19]:
#importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier #kNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
# from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [10]:
def _read_cleaned(csv_path):
    """Load one cleaned CSV, using its first column as the DataFrame index."""
    return pd.read_csv(csv_path, index_col=0)


# One DataFrame per cleaned source table.
casearrest = _read_cleaned('../data/cleaned/casearrest_cl.csv')
jailhistory = _read_cleaned('../data/cleaned/jailhistory_cl.csv')
people = _read_cleaned('../data/cleaned/people_cl.csv')
prisonhistory = _read_cleaned('../data/cleaned/prisonhistory_cl.csv')
charge = _read_cleaned('../data/cleaned/charge_cl.csv')

In [11]:

def get_best_features(df, target):
    """Find how many RFE-selected features give the best held-out score.

    For every feature count from 1 to ``df.shape[1]``, recursive feature
    elimination (RFE) wrapped around a ``LinearRegression`` keeps that many
    columns; the linear model is then fitted on the reduced training data and
    scored on the reduced test data.

    Parameters
    ----------
    df : table of candidate features (one column per feature).
    target : values to predict, aligned row-by-row with ``df``.

    Returns
    -------
    nof : feature count with the highest test score (0 if ``df`` has no columns).
    high_score : that highest score (0 if ``df`` has no columns).
    score_list : test score for each feature count, in increasing count order.
    """
    n_columns = df.shape[1]
    if n_columns == 0:
        # Nothing to rank; keep the historical "no result" return value.
        return 0, 0, []

    # The split is seeded, so it is identical for every feature count:
    # compute it once instead of once per iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.3, random_state=0)

    nof = 0
    # Start below any attainable score. Scores can be negative, and starting
    # at 0 would report "0 features" whenever every candidate scored below 0.
    high_score = float("-inf")
    score_list = []
    for n in range(1, n_columns + 1):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if score > high_score:  # strict comparison: ties keep the smaller count
            high_score = score
            nof = n
    return nof, high_score, score_list

# Caso `casearrest`

In [12]:
import seaborn as sns



# Target is the decile score; every remaining column is a candidate predictor.
casearrest_y = casearrest['decile_score']
casearrest_X = casearrest.drop(columns=['decile_score'])

nof, high_score, score_list = get_best_features(casearrest_X, casearrest_y)

# Report the winning feature count, its score and the score per feature count.
print('Casearrest:')
for label, value in (('N of features:', nof),
                     ('High score:', high_score),
                     ('Score list:', score_list)):
    print(label, value)


# plt.figure(figsize=(12,10))
# cor = casearrest_X.corr()
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()
# the best score uses all the features, so feature selection makes no difference here

Casearrest:
N of features: 3
High score: 0.012440693217863563
Score list: [0.008050461031538392, 0.012089443432227287, 0.012440693217863563]


In [13]:
# Target is the decile score; every remaining column is a candidate predictor.
jailhistory_y = jailhistory['decile_score']
jailhistory_X = jailhistory.drop(columns=['decile_score'])

nof, high_score, score_list = get_best_features(jailhistory_X, jailhistory_y)

# Report the winning feature count, its score and the score per feature count.
print('jailhistory:')
for label, value in (('N of features:', nof),
                     ('High score:', high_score),
                     ('Score list:', score_list)):
    print(label, value)

# the best score uses all the features, so feature selection makes no difference here


jailhistory:
N of features: 3
High score: 0.16073495854007736
Score list: [0.01205922038852969, 0.037284843185304783, 0.16073495854007736]


In [14]:
# Keep every column except the last nine, and drop `age` as well.
# NOTE(review): the "-9" cut-off is positional; confirm it still matches the
# column layout of people_cl.csv if that file changes.
people_filter = people[people.columns[0:-9]].drop(columns=['age'])

# One-hot encode the two categorical columns. `race` keeps one indicator per
# category; `sex` drops its first category (drop_first=True).
df_race = pd.get_dummies(people_filter['race'], prefix='race', prefix_sep='_')
df_sex = pd.get_dummies(people_filter['sex'], prefix='sex',
                        prefix_sep='_', drop_first=True)

# Replace the raw categorical columns with their indicator columns.
people_filter = pd.concat([people_filter, df_race, df_sex], axis=1).drop(columns=['sex', 'race'])

# Target is the decile score; every remaining column is a candidate predictor.
people_X = people_filter.drop(columns=['decile_score'])
people_y = people_filter['decile_score']
nof, high_score, score_list = get_best_features(people_X, people_y)
print('people:')
print('N of features:', nof)
print('High score:', high_score)
print('Score list:', score_list)



people:
N of features: 20
High score: 0.4482377264724221
Score list: [-0.002203948720679083, 0.05160922465324713, 0.05484143558045085, 0.05655485210824196, 0.19831525083241386, 0.21585304056185683, 0.25488964016135207, 0.2955923588089835, 0.29608313107840334, 0.296107486768994, 0.30004315375235524, 0.3151062780841979, 0.315586353006027, 0.3230720009696194, 0.3227224393952839, 0.32242680988628114, 0.44768425608525864, 0.4476842560859339, 0.4480347641424699, 0.4482377264724221]


In [15]:
# Target is the decile score; every remaining column is a candidate predictor.
prisonhistory_y = prisonhistory['decile_score']
prisonhistory_X = prisonhistory.drop(columns=['decile_score'])

nof, high_score, score_list = get_best_features(prisonhistory_X, prisonhistory_y)

# Report the winning feature count, its score and the score per feature count.
print('prisonhistory:')
for label, value in (('N of features:', nof),
                     ('High score:', high_score),
                     ('Score list:', score_list)):
    print(label, value)

prisonhistory:
N of features: 3
High score: 0.0885879841366789
Score list: [-0.006326693289959051, -0.008560941503100494, 0.0885879841366789]


In [16]:
# Target is the decile score; every remaining column is a candidate predictor.
charge_y = charge['decile_score']
charge_X = charge.drop(columns=['decile_score'])

nof, high_score, score_list = get_best_features(charge_X, charge_y)

# Report the winning feature count, its score and the score per feature count.
print('charge:')
for label, value in (('N of features:', nof),
                     ('High score:', high_score),
                     ('Score list:', score_list)):
    print(label, value)

charge:
N of features: 5
High score: 0.05176000243535828
Score list: [0.012744096442611985, 0.012840579721098533, 0.04289828847293886, 0.05066701750574043, 0.05176000243535828]


In [33]:
# Hyper-parameter grid explored by GridSearchCV for k-NN.
tuned_parameters = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                    'weights': ['uniform','distance']}
# Scoring metric handed to GridSearchCV. This used to be 'f1' and was never
# passed on (the search hard-coded "f1_macro"). Plain 'f1' is the
# binary-average scorer, which does not suit this multi-class target, so the
# variable now holds the metric that is actually used.
score = 'f1_macro'

# Stratified 70/30 split. The target column stays inside data_train and
# data_test so that X / y can be re-derived from them.
data_train, data_test, y_train, y_test = train_test_split(people_filter, people_filter['decile_score'], test_size=.30,
                                                    random_state=15, stratify=people_filter['decile_score'])


# Separate predictors from the target for both partitions.
y_test = data_test.decile_score
X_test = data_test.drop(columns=['decile_score'])
y_orig = data_train.decile_score
X_orig = data_train.drop(columns=['decile_score'])


# Model tuned on the original (not resampled) training data.
clf_org = GridSearchCV(KNeighborsClassifier(),
                       param_grid=tuned_parameters,
                       cv=5,
                       scoring=score)
print(" ORIG Mejor combinación de parámetros:")
clf_org.fit(X_orig,y_orig)
print(clf_org.best_params_)

# Per-class precision / recall / F1 on the held-out partition.
y_pred = clf_org.predict(X_test)

print(classification_report(y_test, y_pred))

 ORIG Mejor combinación de parámetros:
{'n_neighbors': 7, 'weights': 'distance'}
              precision    recall  f1-score   support

           1       0.49      0.59      0.54       709
           2       0.24      0.26      0.25       442
           3       0.17      0.16      0.17       348
           4       0.16      0.17      0.17       337
           5       0.13      0.12      0.12       294
           6       0.12      0.10      0.11       282
           7       0.12      0.11      0.11       259
           8       0.10      0.09      0.09       227
           9       0.14      0.12      0.13       228
          10       0.15      0.13      0.14       173

    accuracy                           0.24      3299
   macro avg       0.18      0.18      0.18      3299
weighted avg       0.23      0.24      0.24      3299



In [35]:
# Decision-tree hyper-parameter grid for GridSearchCV.
tuned_parameters = {
    'max_depth': list(range(1, 11)),
    'min_samples_leaf': list(range(1, 11)),
    'min_samples_split': list(range(2, 16)),
    'criterion': ['gini', 'entropy'],
}

In [36]:
# Tune a decision tree on the original (not resampled) training data,
# selecting by macro-averaged F1 over 5 folds.
# random_state pins the tree's internal randomness so that re-running the
# cell gives the same best parameters and report.
clf_org = GridSearchCV(DecisionTreeClassifier(random_state=0),
                       param_grid=tuned_parameters,
                       cv=5,
                       scoring="f1_macro")
print(" ORIG Mejor combinación de parámetros:")
clf_org.fit(X_orig,y_orig)
print(clf_org.best_params_)

# Per-class precision / recall / F1 on the held-out partition.
y_pred = clf_org.predict(X_test)

print(classification_report(y_test, y_pred))

 ORIG Mejor combinación de parámetros:
{'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 8, 'min_samples_split': 12}
              precision    recall  f1-score   support

           1       0.54      0.81      0.65       709
           2       0.25      0.35      0.29       442
           3       0.20      0.12      0.15       348
           4       0.19      0.20      0.19       337
           5       0.15      0.14      0.14       294
           6       0.13      0.09      0.11       282
           7       0.16      0.11      0.13       259
           8       0.18      0.13      0.15       227
           9       0.20      0.12      0.15       228
          10       0.25      0.16      0.20       173

    accuracy                           0.31      3299
   macro avg       0.23      0.22      0.22      3299
weighted avg       0.27      0.31      0.28      3299



In [48]:
# Balance the training set by undersampling deciles 1-9 down to the size of
# decile 10, which is kept in full.
data_train = data_train.reset_index(drop=True)
decile_10_rows = data_train[data_train.decile_score == 10]

# Per-class sample size, derived from the data instead of the previously
# hard-coded 404.
# NOTE(review): 404 looks like the decile-10 row count of this split — confirm.
n_per_class = len(decile_10_rows)

# Seeded generator so the subsample (and everything fitted on it) is
# reproducible across kernel restarts.
rng = np.random.RandomState(15)

sampled_parts = [decile_10_rows]
for decile in range(1, 10):
    decile_index = data_train.index[data_train['decile_score'] == decile]
    # replace=False raises ValueError if a decile has fewer than n_per_class rows.
    chosen = rng.choice(decile_index, size=n_per_class, replace=False)
    # `chosen` holds index labels, so select with .loc rather than .iloc.
    sampled_parts.append(data_train.loc[chosen])

# Concatenate once instead of growing the frame inside the loop.
data_subsampled = pd.concat(sampled_parts)

In [49]:
# k-NN hyper-parameter grid for GridSearchCV.
tuned_parameters = {
    'n_neighbors': list(range(2, 11)),
    'weights': ['uniform', 'distance'],
}

# Held-out evaluation data and the subsampled training data, each split into
# predictors and target.
X_test = data_test.drop(columns=['decile_score'])
y_test = data_test['decile_score']
X_sub = data_subsampled.drop(columns=['decile_score'])
y_sub = data_subsampled['decile_score']

# Model tuned on the subsampled training set, selecting by macro-averaged F1.
clf_sub = GridSearchCV(estimator=KNeighborsClassifier(),
                       param_grid=tuned_parameters,
                       scoring="f1_macro",
                       cv=5)
print(" SUB Mejor combinación de parámetros:")
clf_sub.fit(X_sub, y_sub)
print(clf_sub.best_params_)

# Per-class precision / recall / F1 on the held-out partition.
y_pred = clf_sub.predict(X_test)
print(classification_report(y_test, y_pred))

 SUB Mejor combinación de parámetros:
{'n_neighbors': 9, 'weights': 'distance'}
              precision    recall  f1-score   support

           1       0.57      0.41      0.48       709
           2       0.20      0.20      0.20       442
           3       0.14      0.15      0.15       348
           4       0.14      0.15      0.15       337
           5       0.12      0.12      0.12       294
           6       0.12      0.13      0.13       282
           7       0.11      0.11      0.11       259
           8       0.09      0.11      0.10       227
           9       0.14      0.14      0.14       228
          10       0.15      0.21      0.18       173

    accuracy                           0.21      3299
   macro avg       0.18      0.17      0.17      3299
weighted avg       0.23      0.21      0.22      3299



In [50]:
# Decision-tree hyper-parameter grid for GridSearchCV.
tuned_parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                    'criterion': ['gini','entropy']}

# Held-out evaluation data and the subsampled training data, each split into
# predictors and target.
y_test = data_test.decile_score
X_test = data_test.drop(columns=['decile_score'])
y_sub = data_subsampled.decile_score
X_sub = data_subsampled.drop(columns=['decile_score'])


# Model tuned on the subsampled training set, selecting by macro-averaged F1.
# random_state pins the tree's internal randomness so that re-running the
# cell gives the same best parameters and report.
clf_sub = GridSearchCV(DecisionTreeClassifier(random_state=0),
                       param_grid=tuned_parameters,
                       cv=5,
                       scoring="f1_macro")
print(" SUB Mejor combinación de parámetros:")
clf_sub.fit(X_sub,y_sub)
print(clf_sub.best_params_)

# Per-class precision / recall / F1 on the held-out partition.
y_pred = clf_sub.predict(X_test)

print(classification_report(y_test, y_pred))

 SUB Mejor combinación de parámetros:
{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 8, 'min_samples_split': 7}
              precision    recall  f1-score   support

           1       0.61      0.60      0.61       709
           2       0.24      0.32      0.28       442
           3       0.16      0.16      0.16       348
           4       0.20      0.20      0.20       337
           5       0.12      0.07      0.09       294
           6       0.13      0.11      0.12       282
           7       0.10      0.12      0.11       259
           8       0.11      0.12      0.12       227
           9       0.17      0.13      0.15       228
          10       0.18      0.28      0.22       173

    accuracy                           0.26      3299
   macro avg       0.21      0.21      0.20      3299
weighted avg       0.26      0.26      0.26      3299

