In [1]:
import metnum
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from time import time
from pprint import pprint
from Model import Model
from Segment import Segment
# from NlpModel import NlpModel

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error as RMSE, mean_squared_log_error as RMSLE, balanced_accuracy_score as BAS, make_scorer
from sklearn.preprocessing import scale, normalize
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRegressor

In [2]:
train_df_original = pd.read_csv('../data/train.csv')
train_df = train_df_original.copy()
# train_df.info()
test_df_original = pd.read_csv('../data/test.csv')
test_df = test_df_original.copy()
# test_df.info()

In [3]:
import pandas as pd
import numpy as np

from Model import Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class NlpModel(Model):
    
    _text_feat_column = 'text_feat_column'
    _nlp_feat_column = 'nlp_feat_column'
    
    def __init__(self, df, text_features, features, segment_columns, kfold=5, predict_column='precio', drop_na=True):
        super().__init__(df, features, segment_columns, kfold, predict_column, False, drop_na=False)
        self.text_features = text_features
        self.df[self._text_feat_column] = df[text_features].astype(str).agg(' '.join, axis=1)
        if drop_na:
            self.df = self.df.dropna()
        
    def regresionar(self):
        # Creo la columna con los datos del estimador con NLP
        self._create_nlp_column()
        # Agrego esa columna como una feature mas para usar en la regresion lineal
        self.features.append(self._nlp_feat_column)
        super().regresionar()
        
    def _create_nlp_column(self):
        self.estimator = self._get_estimator()
        X, y = self._get_data_to_fit()
        self.estimator.fit(X, y)
        self.df[self._nlp_feat_column] = self.estimator.predict(X)
        
        
    def _get_estimator(self):
        params = {
            'count__max_features': 5000,
            'count__min_df': 5,
            'desc__n_components': 100,
            'reg__hidden_layer_sizes': (50, 20),
            'reg__max_iter': 50,
            'reg__solver': 'adam'
        }
        
        pipeline = Pipeline([
            ('count', CountVectorizer()),
            ('desc', TruncatedSVD()),
            ('reg', MLPRegressor())
        ], verbose=True)
        
        pipeline.set_params(**params)
        
        return pipeline
    
    def _get_data_to_fit(self):
        X = self.df[self._text_feat_column].values

        # Escalado de datos a predecir
        scaler = StandardScaler(with_mean=False)
        to_predict = self.df[self.predict_column].values.reshape(1,-1)
        y = scaler.fit_transform(to_predict).reshape((-1,))
        
        return (X, y)

In [4]:
predict_column = 'precio'
carititud_column = "carititud"
segments = ['ciudad']
text_features = ['titulo','descripcion']
features = ['metrostotales']
model1 = NlpModel(train_df, text_features=text_features, features=features, segment_columns=segments)

In [5]:
model1.df

Unnamed: 0,metrostotales,banos,precio,text_feat_column
0,80.0,2.0,2273000.0,"depto. tipo a-402 depto. interior de 80.15m2, ..."
1,180.0,2.0,3600000.0,condominio horizontal en venta <p>entre sonora...
2,166.0,2.0,1200000.0,casa en venta urbi 3 recamaras tonala descripc...
3,67.0,1.0,650000.0,casa sola en toluca zinacantepec con credito i...
4,95.0,1.0,1150000.0,paseos del sol bonito departamento en excelent...
...,...,...,...,...
239993,150.0,3.0,1650000.0,bugambilias (ciudad) coto privado de tan solo ...
239994,90.0,2.0,1350000.0,hermosa casa en villa de los belenes <p>modern...
239996,250.0,3.0,1940000.0,casa en condominio a 10 min. del centro de tol...
239997,138.0,2.0,3400000.0,nicolas san juan departamento con excelente ub...


In [6]:
model1.regresionar()

[Pipeline] ............. (step 1 of 3) Processing count, total=  11.1s
[Pipeline] .............. (step 2 of 3) Processing desc, total=  23.1s




[Pipeline] ............... (step 3 of 3) Processing reg, total= 2.8min


In [7]:
model1.error_gral()

array([2344029240329.119, 0.2888972868153765, 0.2859870570697326,
       9977161.140802711], dtype=object)

In [14]:
def sonCaras(precios):
    mean = precios.mean()
    min_val = precios.min()
    max_val = precios.max()
    return np.array([esCara(p, mean, max_val, min_val) for p in precios])
    
def esCara(precio, mean, max_val, min_val):
    if precio > mean:
        if precio > mean + (max_val - mean) / 2:
            return 3
        else:
            return 2
    else:
        if precio > min_val + (mean - min_val) / 2:
            return 1
        else:
            return 0
        
train_df[carititud_column] = sonCaras(df_train[predict_column].values)


In [15]:
# df_train = df_train[["descripcion", predict_column, carititud_column]].dropna()
# df_test = df_test[["descripcion", predict_column, carititud_column]].dropna()

# x_train = df_train["descripcion"].values
# x_test = df_test["descripcion"].values
# y_train = df_train[carititud_column].values
# y_test = df_test[carititud_column].values

# y_train_scaled = scale(y_train) + 1 # Scale va de -1 a 1, con +1 va de 0 a 2, evito valores neg 
# y_test_scaled = scale(y_test) + 1 # Scale va de -1 a 1, con +1 va de 0 a 2, evito valores neg 

In [16]:
df = train_df[["descripcion", predict_column, carititud_column]].dropna()

X = df["descripcion"].values
# y = scale(df[carititud_column].values) + 1 # Scale va de -1 a 1, con +1 va de 0 a 2, evito valores neg 
y = df[carititud_column].values

x_train, x_test = train_test_split(X, test_size=.2)
y_train, y_test = train_test_split(y, test_size=.2)


## Buscando predecir el precio directamente

In [157]:
vectorizer = CountVectorizer(min_df=3, max_features=5000)
vectorizer.fit(x_train) # Notar que no le muestro el test 

x_train_transf = scale(vectorizer.transform(x_train), with_mean=False)
x_test_transf = scale(vectorizer.transform(x_test), with_mean=False)

In [158]:
# Saco componentes principales
tsvd = TruncatedSVD(100)
tsvd.fit(x_train_transf)

TruncatedSVD(n_components=100)

In [159]:
x_train_transf = tsvd.transform(x_train_transf)
x_test_transf = tsvd.transform(x_test_transf)

In [160]:
tsvd.explained_variance_.sum()

741.9866979147752

In [135]:
regr = MLPRegressor(
    solver='sgd', learning_rate='adaptive', early_stopping=True,
    hidden_layer_sizes=(100,200,50), max_iter=200, random_state=1, verbose=True)
regr.fit(x_train_transf, y_train)

Iteration 1, loss = 0.51546139
Validation score: -0.039834
Iteration 2, loss = 0.44509959
Validation score: -0.008313
Iteration 3, loss = 0.43801872
Validation score: -0.005355
Iteration 4, loss = 0.43694414
Validation score: -0.003124
Iteration 5, loss = 0.43620054
Validation score: -0.002753
Iteration 6, loss = 0.43584307
Validation score: -0.002084
Iteration 7, loss = 0.43568954
Validation score: -0.002472
Iteration 8, loss = 0.43561472
Validation score: -0.002264




MLPRegressor(early_stopping=True, hidden_layer_sizes=(100, 200, 50),
             learning_rate='adaptive', random_state=1, solver='sgd',
             verbose=True)

In [136]:
regr.loss_

0.435614723929101

In [64]:
print(regr.score(x_train_transf, y_train))
print(regr.score(x_test_transf, y_test))

0.07693442191101652
-0.022941062631672837


In [65]:
real = y_test
predicted = regr.predict(x_test_transf)
err = RMSE(predicted, real)
print("{:.2E}".format(err))
to_show = pd.DataFrame()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
to_show["real"] = real
to_show["pred"] = predicted
to_show

1.01E+00


Unnamed: 0,real,pred
0,0.539,1.072
1,0.214,0.951
2,0.330,0.940
3,0.097,0.945
4,0.309,1.242
...,...,...
1995,0.214,0.892
1996,0.283,1.070
1997,0.701,1.013
1998,0.840,1.083


In [137]:
sgd = SGDClassifier()
sgd.fit(x_train_transf, y_train)

SGDClassifier()

In [138]:
sgd.score(x_train_transf, y_train)

0.30765479486534103

In [139]:
sgd.score(x_test_transf, y_test)

0.30691947899406424

## Probando con Clasificadores

In [28]:

# A parameter grid for the pipeline
params = {
    # Bag of Words
    'count__min_df': [2, 5, 10],
    'count__max_features': [200, 500, 1000, 5000],

    # Descompositer
    'desc__n_components': [100, 500, 1000],

    # Classificator
    'clf__min_child_weight': [1, 5, 10],
    'clf__gamma': [0.5, 1, 1.5, 2, 5],
    'clf__subsample': [0.6, 0.8, 1.0],
    'clf__colsample_bytree': [0.6, 0.8, 1.0],
    'clf__max_depth': [3, 5, 7, 10],
    'clf__learning_rate': [0.01, 0.02, 0.05]    
}

folds = 3
param_comb = 1

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)


xgb = XGBClassifier(learning_rate=0.02, n_estimators=1000, 
                    silent=True, nthread=6, tree_method='gpu_hist')

pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('desc', TruncatedSVD()),
    ('clf', xgb)
])

random_search = RandomizedSearchCV(
    pipeline, param_distributions=params, n_iter=param_comb, scoring=make_scorer(BAS), 
    n_jobs=-1, cv=skf.split(x_train,y_train), random_state=1001
)


## Resultado 
best_params = {
    'clf__colsample_bytree': 0.6,
    'clf__gamma': 2,
    'clf__learning_rate': 0.05,
    'clf__max_depth': 7,
    'clf__min_child_weight': 10,
    'clf__subsample': 1.0,
    'count__max_features': 1000,
    'count__min_df': 5,
    'desc__n_components': 100
}

In [29]:
random_search.fit(x_train[:100], y_train[:100])

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7ff824009c50>,
                   estimator=Pipeline(steps=[('count', CountVectorizer()),
                                             ('tfid', TfidfTransformer()),
                                             ('clf',
                                              XGBClassifier(base_score=None,
                                                            booster=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            gamma=None,
                                                            gpu_id=None,
                                                            importance_type='gain',
                                                            interaction_constraints=None,


In [188]:
estimator = random_search.best_estimator_

In [None]:
estimator.fit(x_train, y_train)

In [190]:
estimator.score(x_test, y_test)

0.37900874635568516

In [119]:
random_search.best_estimator_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [114]:
real = y_test
predicted = estimator.predict(x_test)
err = RMSE(predicted, real)
print("{:.2E}".format(err))
to_show = pd.DataFrame()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
to_show["real"] = real
to_show["pred"] = predicted
to_show[to_show["pred"]==3]

1.92E+00


Unnamed: 0,real,pred
30578,1,3


In [None]:
folds = 3
param_comb = 1

params = {
    
}

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)


xgb = XGBClassifier(learning_rate=0.02, n_estimators=1000, objective='binary:logistic',
                    silent=True, nthread=6, tree_method='gpu_hist', eval_metric='auc')

pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('tfid', TruncatedSVD()),
    ('clf', XGBClassifier())
])

random_search = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=param_comb, cv=skf.split(x_train,y_train), n_jobs=4, verbose=3, random_state=1001 )

best_params_reg = {
    'count__max_features': 5000,
    'count__min_df': 5,
    'desc__n_components': 100,
    'reg__hidden_layer_sizes': (50, 20),
    'reg__max_iter': 50,
    'reg__solver': 'adam'
}