In [8]:
import metnum
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from time import time
from pprint import pprint
from Model import Model
from Segment import Segment
from NlpModel import NlpModel

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error as RMSE, mean_squared_log_error as RMSLE, balanced_accuracy_score as BAS, make_scorer
from sklearn.preprocessing import scale, normalize
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRegressor

In [2]:
df_original = pd.read_csv('../data/train.csv')
df = df_original.copy()

df['urbana'] = (df['escuelascercanas'] > 0) & (df['centroscomercialescercanos'] > 0 )

train_df, test_df = train_test_split(df, test_size=0.2, random_state=1)

In [3]:
# Dropeo las huertas porque solo hay una
train_df = train_df.drop(train_df[train_df['tipodepropiedad'] == 'Huerta'].index)
# Dropeo las quintas vacacionales porque solo los precios son cualquiera
train_df = train_df.drop(train_df[train_df['tipodepropiedad'] == 'Quinta Vacacional'].index)
# Dropeo los ranchos porque solo los precios son cualquiera
train_df = train_df.drop(train_df[train_df['tipodepropiedad'] == 'Rancho'].index)

In [4]:
predict_column = 'precio'
carititud_column = "carititud"
segments = ['urbana', 'provincia']
text_features = ['titulo', 'descripcion']
features = ['metrostotales', 'metroscubiertos', 'garages']

## Calculo de Hiper parametros
---

In [9]:
def sonCaras(precios):
    mean = precios.mean()
    min_val = precios.min()
    max_val = precios.max()
    return np.array([esCara(p, mean, max_val, min_val) for p in precios])
    
def esCara(precio, mean, max_val, min_val):
    if precio > mean:
        if precio > mean + (max_val - mean) / 2:
            return 3
        else:
            return 2
    else:
        if precio > min_val + (mean - min_val) / 2:
            return 1
        else:
            return 0
        
train_df[carititud_column] = sonCaras(train_df[predict_column].values)


## Buscando predecir el precio directamente

In [12]:
# A parameter grid for the pipeline
params = {
    # Bag of Words
    'count__min_df': [2, 5, 10],
    'count__max_features': [200, 500, 1000, 5000],

    # Descompositer
    'desc__n_components': [100, 500, 1000],

    # Classificator
    'reg__solver': ['sgd', 'adam'],
    'reg__max_iter': [50, 100],
    'reg__hidden_layer_sizes': [(100,), (50,20), (50)]  
}

folds = 3
param_comb = 1

# Uso KFold porque son valores continuos
kf = KFold(n_splits=folds, shuffle=True, random_state = 1001)

pipeline_reg = Pipeline([
    ('count', CountVectorizer()),
    ('desc', TruncatedSVD()),
    ('reg', MLPRegressor())
])

x_train = train_df[text_features + segments + features]
y_train = train_df[predict_column]

random_search_reg = RandomizedSearchCV(
    pipeline_reg, param_distributions=params, n_iter=param_comb, scoring='neg_mean_squared_error', 
    n_jobs=-1, cv=kf.split(x_train, y_train), random_state=1001, verbose=3
)

In [None]:
random_search_reg.fit(x_train[:100], y_train[:100])

In [None]:
estimator_reg = random_search_reg.best_estimator_

In [None]:
estimator_reg.score(x_test, y_test)

In [65]:
real = test_df[predict_column]
predicted = estimator_reg.predict(test_df[text_features + segments + features])
to_show = pd.DataFrame()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
to_show["real"] = real
to_show["pred"] = predicted
to_show

1.01E+00


Unnamed: 0,real,pred
0,0.539,1.072
1,0.214,0.951
2,0.330,0.940
3,0.097,0.945
4,0.309,1.242
...,...,...
1995,0.214,0.892
1996,0.283,1.070
1997,0.701,1.013
1998,0.840,1.083


## Probando con Clasificadores

In [28]:

# A parameter grid for the pipeline
params = {
    # Bag of Words
    'count__min_df': [2, 5, 10],
    'count__max_features': [200, 500, 1000, 5000],

    # Descompositer
    'desc__n_components': [100, 500, 1000],

    # Classificator
    'clf__min_child_weight': [1, 5, 10],
    'clf__gamma': [0.5, 1, 1.5, 2, 5],
    'clf__subsample': [0.6, 0.8, 1.0],
    'clf__colsample_bytree': [0.6, 0.8, 1.0],
    'clf__max_depth': [3, 5, 7, 10],
    'clf__learning_rate': [0.01, 0.02, 0.05]    
}

folds = 3
param_comb = 1

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)


xgb = XGBClassifier(learning_rate=0.02, n_estimators=1000, 
                    silent=True, nthread=6, tree_method='gpu_hist')

pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('desc', TruncatedSVD()),
    ('clf', xgb)
])

random_search = RandomizedSearchCV(
    pipeline, param_distributions=params, n_iter=param_comb, scoring=make_scorer(BAS), 
    n_jobs=-1, cv=skf.split(x_train,y_train), random_state=1001
)


## Resultado 
best_params = {
    'clf__colsample_bytree': 0.6,
    'clf__gamma': 2,
    'clf__learning_rate': 0.05,
    'clf__max_depth': 7,
    'clf__min_child_weight': 10,
    'clf__subsample': 1.0,
    'count__max_features': 1000,
    'count__min_df': 5,
    'desc__n_components': 100
}

In [29]:
random_search.fit(x_train[:100], y_train[:100])

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7ff824009c50>,
                   estimator=Pipeline(steps=[('count', CountVectorizer()),
                                             ('tfid', TfidfTransformer()),
                                             ('clf',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type='gain',
                                                            interaction_constraints=None,


In [188]:
estimator = random_search.best_estimator_

In [190]:
estimator.score(x_test, y_test)

0.37900874635568516

In [114]:
real = y_test
predicted = estimator.predict(x_test)
to_show = pd.DataFrame()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
to_show["real"] = real
to_show["pred"] = predicted
to_show[to_show["pred"]==3]

1.92E+00


Unnamed: 0,real,pred
30578,1,3
