In [105]:
# Importacion de paquetes necesarios
import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path
import datetime
from datetime import date
from imblearn.over_sampling import SMOTE
import random
from nltk.corpus import stopwords
import pickle
from sklearn.metrics import f1_score
import requests, json
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


#### Exportamos el dataset limpio

In [85]:
df = pd.read_csv("data_predictive/clean_sold.csv", index_col=0)


#### Seteamos la Seed

In [86]:
#### Apparently you may use different seed values at each stage
seed_value= 0
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value

#### En base a los experimentos realizados en la carpeta de Validaciones, vamos a generar el modelo con OverSampling con SMOTE y RAndom forest con los mejores hiperamentros encontramos durante la hiperoptimización.

In [92]:
y = df["sold_quantity"]
X = df.drop(["sold_quantity"],axis=1)

smote = SMOTE()
X_sm, y_sm = smote.fit_sample(X, y)

X_sm_sample = X_sm.sample(frac=0.3)
y_sm_sample = y_sm[X_sm_sample.index]

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sm_sample, y_sm_sample, train_size=0.7)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

#### Elijo los mejores hiperparametros encontrados en la validacion del modelo Random Forest con Oversampling

In [93]:
param={'bootstrap': False, 'criterion': 'entropy', 'max_depth': 200, 'max_features': 0.5, 'n_estimators': 650}

In [94]:
rf = RandomForestClassifier(**param)
rf.fit(X_train_s,y_train_s)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=200,
                       max_features=0.5, n_estimators=650)

#### Entrenando el modelo con los datos oversampleados y evaluamos en las clases totales

In [108]:
filename = 'data_predictive/Model_RFover'
outfile = open(filename,'wb')
pickle.dump(rf,outfile)
outfile.close()

In [96]:
predictions = rf.predict(X_test_s)
f1_score(y_test_s,predictions,average="weighted")

0.8702750253859802

#### Para verificar que no estamos prediciendo el mismo algoritmo que usamos para oversamplear calculamos el Score F1 para predecir la muestra de Testeo con todas muestras verdaderas. Este es el Score más representativo de nuestro modelo y supera ampliamente el valor summy de 50% y el modelo sin optimizar ni oversamplear de 72%

In [97]:
predictions = rf.predict(X_test)

In [98]:
f1_score(y_test,predictions,average="weighted")

0.8337900130191396

### Pipeline para nuevas publicaciones:

#### Creo las funciones auxiliares para hacer las transforamciones necesarias:

In [151]:
def mowing_json(dict_input): # Descartamos ramas del json que no son de interes en el analisis.
    dict_mowing  = dict_input
    dict_mowing.pop("attributes", None)
    dict_mowing.pop("pictures", None)
    dict_mowing.pop("permalink", None)
    dict_mowing.pop("variations", None)
    dict_mowing.pop("thumbnail", None)
    dict_mowing.pop("sale_terms", None)
    dict_mowing.pop("secure_thumbnail", None)
    dict_mowing.pop("video_id", None)
    dict_mowing.pop("subtitle", None)
    dict_mowing.pop("site_id", None)
    dict_mowing.pop("seller_address", None)
    dict_mowing.pop("official_store_id", None)
    dict_mowing.pop("warnings", None)
    dict_mowing.pop("seller_contact", None)
    dict_mowing.pop("international_delivery_mode", None)
    dict_mowing.pop("sub_status", None)
    dict_mowing.pop("catalog_product_id", None)
    dict_mowing.pop("parent_item_id", None)
    dict_mowing.pop("differential_pricing", None)
    dict_mowing.pop("listing_source", None)
    dict_mowing.pop("location", None)
    dict_mowing.pop("non_mercado_pago_payment_methods", None)
    dict_mowing.pop("coverage_areas", None)
    dict_mowing.pop("deal_ids", None)
    dict_mowing.pop("seller_id", None)
    dict_mowing.pop("descriptions", None)
    return dict_mowing

def categ_sold(x):
    if 5<= x<=50:
        x=5
    elif 51<= x<=100:
        x=50
    elif 101<= x<=150:
        x=100
    elif 151<= x<=200:
        x=150
    elif 201<= x<=250:
        x=200
    elif 251<= x<=500:
        x=250    
    elif 501<= x<=5000:
        x=500    
    elif 5001<= x<=50000:
        x=5000      
    return x


def create_dummis(df, values,prefix=""):
    column = df["buying_mode"][0]
    for x in values:
        if x == column:
            df[f"{prefix}{x}"] = 1
        else:
            df[f"{prefix}{x}"] = 0


#### Funcion que transforma el Json de una publicacion ID y aplica el modelo y devuelve la predcción.

In [162]:
def predict_sold(sample_id,model):
    #Conversion Json to DataFrame
    query = requests.get(f"https://api.mercadolibre.com/items/{sample_id}")
    item = json.loads(query.content)
    df_json = [mowing_json(x) for x in [item]] 
    df = pd.json_normalize(df_json,sep="_") # Transformo Json a DataFrame
    df = df.infer_objects()  # Inferimos el mejor tipo de dato para la columna
    df = df.replace('', np.nan) # Reemplazo espacios vacios por NaN
    
    # Conversion Variables Categoricas
    df["listing_type_id"] =df["listing_type_id"].map({'gold_special':6, 'gold_premium': 5, "gold_pro": 4,'gold':3, 'silver': 2, "bronze": 1,"free": 1})
    buying_mode = ["buy_it_now","classified"]
    currency = ["ARS","USD"]
    log= ["cross_docking","custom","default","drop_off","fulfillment","not_specified","xd_drop_off"]
    shippinmode = ["custom","me1","me2","not_specified"]
    status = ["active","closed","inactive","paused","under_review"]
    condition = ["new","not_specified","used"]
    create_dummis(df, buying_mode,prefix="bmode__")
    create_dummis(df, currency,prefix ="currency__")
    create_dummis(df, log ,prefix="slog__")
    create_dummis(df, shippinmode,prefix="smd__")
    create_dummis(df, status)
    create_dummis(df, condition)
    df = df.drop(['buying_mode',"currency_id","shipping_logistic_type","shipping_mode","status","condition"],axis=1,errors="ignore")
    
    # Imputacion de Variables
    df["health"].fillna(0.8,inplace=True)
    df["original_price"].fillna(0,inplace=True)
    df["geolocation_latitude"].fillna(0,inplace=True)
    df["geolocation_longitude"].fillna(0,inplace=True)
    df = df.drop(df[df["available_quantity"].isnull()].index)
    
    #Transformacion Features temporales
    datetime_features = ["start_time","stop_time","last_updated","date_created"]
    call_date = datetime.datetime(2020, 8,1)
    for feat in datetime_features:
        df[feat] = pd.to_datetime(df[feat], utc=True).dt.tz_localize(None)
        df[f"{feat}_days"] = (df[feat]- call_date).apply(lambda x : x.days)
    df.drop(datetime_features, axis= 1 ,inplace =True, errors='ignore')
    stopwords_es =  stopwords.words('spanish')
    
    #Transformacion title
    def title2word( title):
        return len(([x for x in title.strip().lower().split(" ") if x not in stopwords_es]))
    df["title_words"] = df["title"].apply(title2word)
    
    #Transformacion Tags
    df = df.drop(df[df.duplicated(subset=['id'],keep="first")].index)
    df["tags"] = df["tags"].apply(len)
    df["shipping_tags"] = df["shipping_tags"].apply(len)
    
    # Limpieza de columnas
    df=df.drop(["tags","domain_id","shipping_tags","id","title","warranty","category_id","shipping_dimensions","shipping_methods","shipping_free_methods"],axis=1,errors="ignore")
    df = df.drop(df[df["price"].isnull()].index)
    df["sold_quantity"]=df["sold_quantity"].apply(lambda x: categ_sold(x)).astype("int")
    df.reset_index(inplace=True,drop=True)
    predictions = model.predict(a.drop("sold_quantity",axis=1))

    return predictions

In [165]:
predict_sold("MLA842101865",rf)

array([250], dtype=int64)