In [7]:
import numpy as np
from catboost import CatBoostRegressor, Pool
import datetime
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import optuna
import pandas as pd
from optuna.samplers import TPESampler
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder, StandardScaler

In [12]:
def _size_to_kb(raw_size, unknown_size_kb=14000.0):
    """Convert one raw ``Size`` cell (e.g. ``'19M'``, ``'500k'``) to kilobytes.

    ``'Varies with device'`` carries no number, so it is mapped to
    ``unknown_size_kb``. A bare number is taken to be kilobytes already.
    """
    text = str(raw_size).strip()
    if text == 'Varies with device':
        return float(unknown_size_kb)
    if text.endswith('M'):
        # Megabytes -> kilobytes.
        return float(text[:-1]) * 1000
    if text.endswith('k'):
        # Already kilobytes: strip the suffix only. Dividing by 1000 here
        # would put 'k' rows on a different scale than the 'M' rows.
        return float(text[:-1])
    return float(text)


def dataset_cleaning(dataset, reference_date=None):
    """Load the Google Play Store CSV and return a model-ready DataFrame.

    Steps performed:
      * drop rows with a missing ``Rating``, ``Content Rating``,
        ``Current Ver`` or ``Android Ver``;
      * cast ``Reviews`` to ``int64``;
      * convert ``Size`` to a float number of kilobytes
        (``'Varies with device'`` becomes 14000);
      * strip ``'+'`` and ``','`` from ``Installs`` and cast it to ``int64``;
      * add ``Main_Genre`` (the part of ``Genres`` before the first ``';'``);
      * add ``Days_Since_Last_Update`` (whole days between ``Last Updated``
        and ``reference_date``);
      * drop ``App``, ``Type``, ``Price``, ``Genres`` and ``Last Updated``.

    Parameters
    ----------
    dataset : str, path object or file-like
        Anything ``pandas.read_csv`` accepts.
    reference_date : optional
        Date the update age is measured against; anything ``pandas.Timestamp``
        accepts. Defaults to 2021-10-10 so repeated runs of the notebook
        produce the same feature values.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned columns.
    """
    if reference_date is None:
        reference_date = datetime.datetime(2021, 10, 10)
    # Midnight-normalised so the subtraction yields whole days.
    reference_date = pd.Timestamp(reference_date).normalize()

    raw = pd.read_csv(dataset)
    cleaned = raw.dropna(
        subset=["Rating", "Content Rating", "Current Ver", "Android Ver"]
    ).copy()

    cleaned["Reviews"] = cleaned["Reviews"].astype('int64')
    cleaned["Size"] = cleaned["Size"].map(_size_to_kb).astype('float64')

    # "10,000+" -> 10000
    cleaned["Installs"] = (
        cleaned["Installs"]
        .str.replace(r'[+,]', '', regex=True)
        .astype('int64')
    )

    cleaned["Main_Genre"] = cleaned["Genres"].str.split(';').str[0]

    # Dates look like "January 7, 2018"; an explicit format avoids any
    # day/month guessing and works across pandas versions.
    last_updated = pd.to_datetime(cleaned["Last Updated"], format='%B %d, %Y')
    cleaned["Days_Since_Last_Update"] = (reference_date - last_updated).dt.days

    return cleaned.drop(columns=['App', 'Type', 'Price', 'Genres', 'Last Updated'])

In [14]:
# Run the cleaning pipeline on the raw Play Store export that sits next to the notebook.
raw_csv_path = 'googleplaystore.csv'
df_clean = dataset_cleaning(raw_csv_path)

In [15]:
# Separate the predictors from the target, then hold out 30% of the rows
# for evaluation; the fixed seed keeps the split reproducible.
predictors = df_clean.drop(columns='Rating')
target = df_clean['Rating']

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=42
)

In [16]:
# Columns handed to CatBoost as categorical features.
categorical_columns = ['Category', 'Content Rating', 'Current Ver', 'Android Ver', 'Main_Genre']

# The training pool carries the labels; the test pool is features-only
# because it is used for prediction alone.
pool_train = Pool(X_train, y_train, cat_features=categorical_columns)
pool_test = Pool(X_test, cat_features=categorical_columns)

regressor_params = dict(
    n_estimators=1000,
    loss_function='RMSE',
    learning_rate=0.1,
    random_state=1,
    verbose=False,
)
cb = CatBoostRegressor(**regressor_params)
cb.fit(pool_train)

y_pred_train = cb.predict(pool_train)
y_pred_test = cb.predict(pool_test)

In [17]:
# Print MAE then MSE, first for the training split and then for the test split.
for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(mean_absolute_error(actual, predicted))
    print(mean_squared_error(actual, predicted))

0.22415686670617946
0.11672451608513691
0.30362632714960647
0.20785788182815668


In [19]:
# Score the fitted model on every cleaned row (df_clean is the frame the
# train/test split was drawn from, so this includes the training rows).
full_predictors = df_clean.drop(columns='Rating')
y_full = df_clean['Rating']

pool_full = Pool(
    full_predictors,
    cat_features=['Category', 'Content Rating', 'Current Ver', 'Android Ver', 'Main_Genre'],
)
y_pred_full = cb.predict(pool_full)

In [20]:
# MAE followed by MSE over the whole cleaned dataset.
full_mae = mean_absolute_error(y_full, y_pred_full)
full_mse = mean_squared_error(y_full, y_pred_full)
print(full_mae)
print(full_mse)

0.24799770483920755
0.1440645258080428


In [21]:
import joblib

# Persist the fitted regressor; joblib.dump returns the list of files it
# wrote, which the notebook shows as this cell's output.
model_path = "best_app_rating_model.pkl"
joblib.dump(cb, model_path)

['best_app_rating_model.pkl']