In [18]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import pickle as pkl

import optuna

In [19]:
data = pd.read_parquet('data_preprocessed.parquet')
data = data[data.Impact != 0]
X = data.drop('Impact', axis=1)
y = data['Impact']

In [20]:
data[data.Impact == 0].shape

(0, 73)

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138723 entries, 0 to 138723
Data columns (total 72 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   desc_length          138723 non-null  float64
 1   title_length         138723 non-null  float64
 2   embed_0              138723 non-null  float64
 3   embed_1              138723 non-null  float64
 4   embed_2              138723 non-null  float64
 5   embed_3              138723 non-null  float64
 6   embed_4              138723 non-null  float64
 7   embed_5              138723 non-null  float64
 8   embed_6              138723 non-null  float64
 9   embed_7              138723 non-null  float64
 10  embed_8              138723 non-null  float64
 11  embed_9              138723 non-null  float64
 12  embed_10             138723 non-null  float64
 13  embed_11             138723 non-null  float64
 14  embed_12             138723 non-null  float64
 15  embed_13         

In [22]:
cat_features = ['publisher_encoded','categories_encoded']

#### Hyperparameter Tuning
- Using Optuna along with MAPE as the main objective function
- using KFold cross validation with 3 splits

In [30]:
def objective(trial):
    # Define the hyperparameters space to tune
    param = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.05),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 10),

    }
    
    # Initialize K-Fold cross-validation
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    
    mape_scores = []
    
    # Perform K-Fold cross-validation
    for train_index, valid_index in kf.split(X):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)
        
        model = CatBoostRegressor(**param)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=False)
        
        y_pred = model.predict(X_valid)
        mape = mean_absolute_percentage_error(y_valid, y_pred)
        mape_scores.append(mape)
    
    # Return the mean MAPE score
    return sum(mape_scores) / len(mape_scores)

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
best_params = study.best_params
best_value = study.best_value

In [32]:
study.best_params

{'iterations': 1925,
 'depth': 10,
 'learning_rate': 0.020709915830925664,
 'min_child_samples': 38,
 'subsample': 0.6685043598983783,
 'colsample_bylevel': 0.9431467353861616,
 'l2_leaf_reg': 1.099182957889465}

In [33]:
study.best_value


0.05552452805585498

In [23]:
# using the best parameters found by optuna after rounding them off.
best_params = {
    'iterations': 2000,  
    'depth': 10,        
    'learning_rate': 0.02,  
    'min_child_samples': 38,
    'subsample': 0.7,
    'colsample_bylevel': 0.9,
    'l2_leaf_reg': 1.1,   
    'random_seed': 42
}

In [24]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mape_scores = []  

In [25]:
counter = 0

This section of the code implements a 5-fold cross-validation training process for a CatBoost regression model. ]



1. **Cross-Validation:**
   - A 5-fold cross-validation is performed using KFold from scikit-learn.
   - The data is split into training and validation sets for each fold.

2. **Model Training:**
   - For each fold:
     - CatBoost Pool objects are created for training and validation data.
     - A CatBoostRegressor is initialized with the best parameters found by Optuna.
     - The model is trained using the training pool and evaluated on the validation pool.
     - Early stopping is applied with a patience of 50 rounds.

3. **Performance Evaluation:**
   - Mean Absolute Percentage Error (MAPE) is calculated for each fold.
   - The MAPE scores are printed for each fold and averaged at the end.



4. **Final Results:**
   - The average cross-validation MAPE is printed, giving an estimate of the model's performance.

This process ensures a robust evaluation of the model's performance across different subsets of the data, helping to assess its generalization capability.


In [26]:
X = X.drop(columns=['main_author_encoded'],axis=1) # dropping main_author_encoded as it did not lead to any improvement but increased training time
for train_index, valid_index in kf.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_pool = Pool(X_train, y_train,cat_features=cat_features)
    valid_pool = Pool(X_valid, y_valid,cat_features=cat_features)
    
    model = CatBoostRegressor( **best_params, loss_function='MAPE')
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=False)
    counter += 1
    
    y_pred = model.predict(X_valid)
    mape = mean_absolute_percentage_error(y_valid, y_pred)
    print("cv-mape",mape)
    mape_scores.append(mape)

    #pkl.dump(model,open('model_catboost-{counter}-{mape}.pkl'.format(counter=counter,mape=mape),'wb'))
    
print("avg_cv-mape",sum(mape_scores) / len(mape_scores))

cv-mape 0.058825163165431245
cv-mape 0.05665670960545309
cv-mape 0.057532631424358174
cv-mape 0.057321023624930155
cv-mape 0.06380823981148162
avg_cv-mape 0.05882875352633086


#### Training the final model on the entire dataset using the best parameters found by optuna

In [None]:
# Train final model on all data using best parameters
print("Training final model on all data...")


final_train_pool = Pool(X, y, cat_features=cat_features)
final_model = CatBoostRegressor(**best_params, loss_function='MAPE')
final_model.fit(final_train_pool, verbose=False)

# Save the final model
final_model_filename = 'final_model_catboost.pkl'
pkl.dump(final_model, open(final_model_filename, 'wb'))
print(f"Final model saved as {final_model_filename}")


feature_importances = final_model.get_feature_importance(final_train_pool)
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False)



In [29]:
pd.set_option('display.max_rows', None)
importance_df

Unnamed: 0,feature,importance
66,publisher_encoded,22.075848
67,categories_encoded,17.229149
25,embed_23,10.687276
10,embed_8,7.06351
61,embed_59,6.346241
24,embed_22,3.061903
34,embed_32,1.386646
64,embed_62,1.309167
68,published_year,1.305364
20,embed_18,1.267009
