# Importo Librerias

In [2]:
import numpy as np
from catboost import CatBoostRegressor, Pool
import datetime
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import optuna
import pandas as pd
from optuna.samplers import TPESampler
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder, StandardScaler

# Functions

In [3]:
# Functions

class Data_Operations():
    def __init__(self) -> None:
        pass
    
    def load_data(dataset):
        '''
        Esta función carga el dataset original para iniciar el preprocesado

        Input: Ruta de acceso (paths[0]) + archivo csv (data_files[0])
        Output: DataFrame Pandas (df)
        '''
        df = pd.read_csv(dataset)
        return df

    def clean_nans(df):
        '''
        Esta función elimina todos los NaN, ya que al estar la mayoria en la variable 'Rating' (Nuestro Target para
        la predicción), carece de sentido estimar sus valores durante el preprocesado y contamos con un volumen de 
        datos suficiente.
        
        Input: Dataframe original cargado desde el csv (df).
        Output: Otro Dataframe sin valores NaN.
        '''
        df_No_NaNs = df.copy().dropna(subset=["Rating","Content Rating", "Current Ver", "Android Ver"])
        df_No_NaNs.to_csv(r'..\data\processed_files\df_No_NaNs.csv', index = False)
        return df_No_NaNs
    
    def dataset_preprocessing(df_No_NaNs):
        '''
        Esta función limpia el dataset inicial para poder trabajar con el modelo (CatBoost).
        Transforma y cambia el tipo de:
        Crea una nueva variable a partir de:
        Elimina las variables:

        Input: dataset
        Output: Dataset limpio y listo para trabajar con el modelo (CatBoost)
        '''
        df_No_NaNs["Reviews"] = df_No_NaNs["Reviews"].astype('int64')
        df_No_NaNs["Size"] = df_No_NaNs["Size"].replace(['Varies with device'],['14000']).apply(lambda x: float(x.replace('M','')) *1000 if 'M' in x else (float(x.replace('k','')) /1000 if 'k' in x else x)).astype('float64')
        df_No_NaNs["Installs"] = df_No_NaNs["Installs"].str.split('+',expand=True)[0].apply(lambda x: x.replace(',','')).astype('int64')
        df_No_NaNs["Main_Genre"] = df_No_NaNs["Genres"].str.split(';',expand=True)[0]
        df_No_NaNs["Last Updated"] = df_No_NaNs["Last Updated"].apply(lambda x: x.replace(' ','/').replace(',','').replace('January','1').replace('February','2').replace('March','3').replace('April','4').replace('May','5').replace('June','6').replace('July','7').replace('August','8').replace('September','9').replace('October','10').replace('November','11').replace('December','12')).astype('datetime64')
        timestamp = pd.Timestamp(datetime.datetime(2021, 10, 10))
        df_No_NaNs['Today'] = pd.Timestamp(timestamp.today().strftime('%d-%m-%Y'))
        df_No_NaNs['Days_Since_Last_Update'] = (df_No_NaNs['Today'] - df_No_NaNs["Last Updated"]).dt.days
        df_No_NaNs.drop(['App','Type','Price','Genres','Today','Last Updated'], axis=1, inplace=True)
        df_Preprocessed = df_No_NaNs.copy()
        df_Preprocessed.to_csv(r'..\data\processed_files\df_Preprocessed.csv', index = False)
        return df_Preprocessed

class Model_Operations():
    def __init__(self) -> None:
        pass

    def split_test(df):
        X_train, X_test, y_train, y_test = train_test_split(df.drop('Rating', axis=1),
                                                        df['Rating'],
                                                        test_size=0.3,
                                                        random_state=42)
        return X_train, X_test, y_train, y_test
    
    def pre_fit(X_train, X_test, y_train, y_test):
        pool_train = Pool(X_train, y_train,
                            cat_features=['Category','Content Rating','Current Ver','Android Ver','Main_Genre'])
        pool_test = Pool(X_test, y_test,
                        cat_features=['Category','Content Rating','Current Ver','Android Ver','Main_Genre'])
        return pool_train, pool_test
             
    def train_model(train):
        cb = CatBoostRegressor(n_estimators=1000,
                      loss_function='RMSE',
                      learning_rate=0.1,
                      random_state=1,
                      verbose=False
                      )
        cb.fit(train)
        model_trained = cb
        return model_trained

    def rmse_score(model_trained, X,y):
        predict = model_trained.predict(X)
        rmse = mean_squared_error(y, predict)
        return rmse
    
    def save_best_model(model_trained):
        timestamp = pd.Timestamp(datetime.date(2021, 10, 10))
        fecha_hoy = timestamp.today().strftime(('%d-%m-%Y'))
        joblib.dump(model_trained, r'model\best_app_rating_model-'+fecha_hoy+'.pkl')

    def open_last_saved_model(pickle):
        final_model_reloaded = joblib.load(pickle)
        return final_model_reloaded
    
    def prepare_full_dataset(df_Preprocessed):
        X_full = df_Preprocessed.drop('Rating', axis=1)
        y_full = df_Preprocessed['Rating']
        return X_full, y_full

    def pre_fit_full_dataset(X_full, y_full):
        pool_full_dataset = Pool(X_full, y_full,
                            cat_features=['Category','Content Rating','Current Ver','Android Ver','Main_Genre'])
        return pool_full_dataset

    def save_best_model_full_dataset(model_trained_full_dataset):
        timestamp = pd.Timestamp(datetime.date(2021, 10, 10))
        fecha_hoy = timestamp.today().strftime(('%d-%m-%Y'))
        joblib.dump(model_trained_full_dataset, r'model\best_app_rating_model_full_training-'+fecha_hoy+'.pkl')

# Preprocesado

In [4]:
# Cargo el Dataset Original

df = Data_Operations.load_data(r'..\data\raw_files\googleplaystore.csv')
print('Dataset Cargado')
# Elimino los NaN (La mayoria estan en el target y tenemos datos suficientes).

df_No_NaNs = Data_Operations.clean_nans(df)
print('Dataset sin NaNs')

# Completo el preprocesado de los datos 

df_Preprocessed = Data_Operations.dataset_preprocessing(df_No_NaNs)
print('Preprocesado acabado')

Dataset Cargado
Dataset sin NaNs
Preprocesado acabado


Transformer

In [22]:
oH_cat_pipeline = Pipeline(
    [("OHEncoder", OneHotEncoder())
    ]
)

ord_cat_pipeline = Pipeline(
    [("OrdEncoder", OrdinalEncoder())
    ]
)

num_pipeline = Pipeline(
    [("SScaler", StandardScaler())
    ]
)

In [23]:
preprocessing = ColumnTransformer(
    [("OH_Process_Categorical", oH_cat_pipeline, ["Category","Content Rating","Main_Genre"]),
     ("Ord_Process_Categorical", ord_cat_pipeline, ["Current Ver","Android Ver"]),
     ("Process_Num_Scaler", num_pipeline, ["Reviews","Size","Installs","Days_Since_Last_Update"]),
    ], remainder = "passthrough")

In [27]:
X_train

Unnamed: 0,Category,Reviews,Size,Installs,Content Rating,Current Ver,Android Ver,Main_Genre,Days_Since_Last_Update
6816,FAMILY,7,5600.000,10,Everyone,1.0,4.0.3 and up,Education,2126
1094,FINANCE,11919,23000.000,1000000,Everyone,1.13.2.11,4.4 and up,Finance,1894
4089,FAMILY,148536,59000.000,10000000,Everyone,4.21.1,4.1 and up,Education,1887
4144,SOCIAL,4828372,14000.000,1000000000,Teen,Varies with device,Varies with device,Social,1895
10050,TOOLS,164,0.144,5000,Everyone,1.3,1.6 and up,Tools,3125
...,...,...,...,...,...,...,...,...,...
6144,PRODUCTIVITY,16,2600.000,1000,Everyone,1.0.3,2.3 and up,Productivity,2419
5516,MAPS_AND_NAVIGATION,616742,14000.000,10000000,Everyone,Varies with device,Varies with device,Maps & Navigation,1885
5728,GAME,955614,82000.000,10000000,Teen,5.1.2,3.0 and up,Action,1925
913,ENTERTAINMENT,92058,12000.000,10000000,Teen,4.8.6,4.1 and up,Entertainment,1901


In [31]:
pipe_preprocessed = preprocessing.fit_transform(df_Preprocessed)

In [32]:
df_pipe_train = pd.DataFrame(pipe_preprocessed_train.toarray(), columns= preprocessing.get_feature_names_out())

In [34]:
df_pipe_train.head()


Unnamed: 0,OH_Process_Categorical__Category_ART_AND_DESIGN,OH_Process_Categorical__Category_AUTO_AND_VEHICLES,OH_Process_Categorical__Category_BEAUTY,OH_Process_Categorical__Category_BOOKS_AND_REFERENCE,OH_Process_Categorical__Category_BUSINESS,OH_Process_Categorical__Category_COMICS,OH_Process_Categorical__Category_COMMUNICATION,OH_Process_Categorical__Category_DATING,OH_Process_Categorical__Category_EDUCATION,OH_Process_Categorical__Category_ENTERTAINMENT,...,OH_Process_Categorical__Main_Genre_Video Players & Editors,OH_Process_Categorical__Main_Genre_Weather,OH_Process_Categorical__Main_Genre_Word,Ord_Process_Categorical__Current Ver,Ord_Process_Categorical__Android Ver,Process_Num_Scaler__Reviews,Process_Num_Scaler__Size,Process_Num_Scaler__Installs,Process_Num_Scaler__Days_Since_Last_Update,remainder__Rating
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,107.0,14.0,-0.163511,-0.110731,-0.196126,-0.09708,4.1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,952.0,14.0,-0.163254,-0.34241,-0.190757,-0.117274,3.9
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,424.0,14.0,-0.135735,-0.587989,-0.141448,-0.61709,4.7
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2582.0,17.0,-0.094991,0.167283,0.351641,-0.480776,4.5
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,250.0,19.0,-0.163254,-0.86137,-0.19514,-0.511068,4.3


## Train Test Split

In [36]:
# Divido el dataset preprocesado en train y test
X_train, X_test, y_train, y_test = train_test_split(df_pipe_train.drop('remainder__Rating', axis=1),
                                                    df_pipe_train['remainder__Rating'],
                                                    test_size=0.3,
                                                    random_state=42)

In [37]:
cb_pipe = CatBoostRegressor(n_estimators=1000,
                      loss_function='RMSE',
                      learning_rate=0.1,
                      random_state=1,
                      verbose=False
                      )

pool_train_pipe = Pool(X_train, y_train)
pool_test_pipe = Pool(X_test)

cb_pipe.fit(pool_train_pipe)

y_pred_pipe_train = cb_pipe.predict(pool_train_pipe)
y_pred_pipe_test = cb_pipe.predict(pool_test_pipe)

In [38]:
print(mean_absolute_error(y_train, y_pred_pipe_train))
print(mean_squared_error(y_train, y_pred_pipe_train))
print(mean_absolute_error(y_test, y_pred_pipe_test))
print(mean_squared_error(y_test, y_pred_pipe_test))

0.19263878378477783
0.07146998208544443
0.3071309336830532
0.21519489076639697


Overfitea Demasiado: Descarto el preprocesamiento