In [44]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer

from sklearn.metrics import r2_score, root_mean_squared_log_error, root_mean_squared_error, mean_tweedie_deviance

import phik
from phik.report import plot_correlation_matrix

from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

import plotly.express as px
import plotly.io as pio

RSEED = 42

In [None]:
df = pd.read_csv('../data/data_nlp_A.csv', parse_dates=['last_publish_date', 'date_min'])
df.columns

In [None]:
df.page_id = df.page_id.astype('category')

In [None]:
print('Numerical columns')
df.select_dtypes(include=['int', 'float']).columns

In [None]:
print('Categorical columns')
df.select_dtypes(include=['object', 'datetime']).columns

In [None]:
numerical_features = ['no_versions', 'word_count', 'merged_url_len', 
                      'h1_len', 'abstract_len', 'likes_n_days', 'dislikes_n_days', 
                      'video_play', 'mean_version_lifetime',
                      'meta_title_len', 'meta_desc_len', 
                      'confidence_abstract', 'confidence_meta_title']

categorical_features = ['classification_product', 'classification_type', 'sentiment_abstract', 'sentiment_meta_title', 'media_type']

targets = ['external_clicks', 'external_impressions', 'ctr', 'clickouts', 'ext_impr_norm']

# Modelling success for each article

Warning: since only a few articles are really successful (like it is usually with viral things), **an extensive EDA on the outliers and methods of anomaly detection are likely the way to go!

In [None]:
pd.concat((df[categorical_features].describe(include='object').T, # categorical (strings)
           df[numerical_features].describe().T)).style.format(precision=0) # numericals

In [None]:
phik_matrix = df[categorical_features + targets].phik_matrix(interval_cols=list(targets))
plot_correlation_matrix(phik_matrix.values, x_labels=phik_matrix.columns, y_labels=phik_matrix.index, 
                        vmin=0, vmax=1, color_map='BuPu', fontsize_factor=0.75,
                        title=r'$\phi_K$ correlation of the features',
                        figsize=(8,6.5))

In [None]:
pears_matrix = df[numerical_features + targets].corr()
plot_correlation_matrix(pears_matrix.values, x_labels=pears_matrix.index, y_labels=pears_matrix.columns,
                        vmin=0, vmax=1, color_map='BuPu', fontsize_factor=0.75,
                        title=r'Pearson correlation of the features',
                        figsize=(10,8.5))

## Baseline

In [None]:
# features and target variable
num_bl = ['no_versions', 'word_count', 'h1_len', 'abstract_len', 
          #'likes_n_days', 'dislikes_n_days', 'video_play', 
          'mean_version_lifetime']
cat_bl = ['classification_product', 'classification_type', 'media_type', 'sentiment_abstract', 'sentiment_meta_title']
target = targets[0]

X = df[num_bl + cat_bl]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED)

In [None]:
X_train.hist()
plt.tight_layout()

In [None]:
## for numerical features
skewed_feats_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('pow', PowerTransformer())
])
# norm_features = Pipeline([
#     ('scale_num', StandardScaler())
# ])

cat_pipeline = Pipeline([
    ('1hot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    #('scale_num', norm_features, ['no_versions', 'word_count', 'h1_len', 'abstract_len']),
    ('pow', skewed_feats_pipeline, num_bl),#['likes_n_days', 'dislikes_n_days','video_play', 'mean_version_lifetime']),
    ('cat', cat_pipeline, cat_bl)
])
# Initiating Pipelines
## for categorical features
powtr = PowerTransformer()

# Target transform
y_train_t = powtr.fit_transform(y_train.to_frame())
y_test_t = powtr.transform(y_test.to_frame())
# Reverse the transform when deciphering predictions!

pipe_tree = Pipeline([
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeRegressor())
])

In [None]:
X_train_tr = preprocessor.fit_transform(X_train)
X_train_tr_df = pd.DataFrame(data=np.asarray(X_train_tr.todense()), columns=preprocessor.get_feature_names_out(), index=X_train.index)
X_train_tr_df[[col for col in X_train_tr_df.columns if col[:3]=='pow']].hist();
plt.gcf().suptitle('Power-transformed features only')
plt.tight_layout()

In [None]:
X_train.isna().sum()

In [None]:
# cat_pipeline = Pipeline([
#     ('1hot', OneHotEncoder(handle_unknown='ignore')),
# ])

# preprocessor = ColumnTransformer([
# #    ('num', norm_features, ['no_versions', 'word_count', 'likes_n_days', 'merged_url_len',
# #    'mean_version_lifetime']),
#     ('cat', cat_pipeline, cat_bl)
# ])



In [None]:
pipe_tree.fit(X_train, y_train_t)

In [None]:
pred = pipe_tree.predict(X_test)

In [None]:
pred

In [None]:
y_test_t_1d = y_test_t.reshape(y_test_t.shape[0])

In [None]:
pred

## Evaluation

In [None]:
fig_bl_pred = px.scatter(x=y_test_t_1d, y=pred, trendline='ols', 
                         trendline_color_override='darkred')
print('Features used:')
print('Numerical:', num_bl)
print('Categorical', cat_bl)
fig_bl_pred.add_annotation(x=-4, y=4,
            text=f'''Features used:<br>Numerical: {num_bl}<br>Categorical: {cat_bl}''',
            showarrow=False,
            yshift=10);

In [None]:
fig_bl_pred.update_layout(height=800, width=700, title='Baseline predictions', 
                          yaxis_title='y predicted (transformed)', 
                          xaxis_title='y test (transformed)',
                          margin={'t': 130})

fig_bl_pred.update_layout(
    annotations=[
        dict(
            x=-0.07,  # X-coordinate outside the plot area
            y=1,  # Y-coordinate (top of the plot)
            xref="paper",
            yref="paper",
            xanchor="left",
            yanchor="bottom",
            text=f'''Features used:<br>Numerical: {num_bl}<br>Categorical: {cat_bl}''',
            align="left",
            showarrow=False,
            )
    ]
)
fig_bl_pred.show()

In [None]:
import plotly.io as pio

pio.write_image(fig_bl_pred, file='../models/baseline_katja.jpg')

In [None]:
#from sklearn.tree import plot_tree
#plot_tree(pipe_tree.steps[-1][1])

bl_tree = pipe_tree.steps[-1][1]
bl_tree.get_n_leaves(), bl_tree.get_depth()

In [None]:
from pickle import dump

dump(pipe_tree, open('../models/bl_katja.pkl', 'wb'))

In [None]:
feat_imp = bl_tree.feature_importances_
feat_names = pipe_tree[:-1].get_feature_names_out()

#feature_importance = np.array([(name, coeff) for name, coeff in sorted(zip(feat_names, feat_imp), key=lambda x: x[1])])
feature_importance = pd.DataFrame().from_records(data={'Features': [name.split('__')[-1] for name in feat_names], 'Importance': feat_imp})
sorted_importances = feature_importance.sort_values('Importance', ascending=True)
#feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

In [None]:
fig_importance = px.bar(sorted_importances, y='Features', x='Importance',
                        orientation='h', height=800, width=800, 
                        #xaxis_label_text='', yaxis_label_text=''
                        )
fig_importance.show()

In [None]:
pio.write_image(fig_importance, file='../models/baseline_katja_features.jpg')

In [None]:
import pandas as pd
from pickle import load
from sklearn.model_selection import train_test_split

# Run the script 5A_sentiment_merge.py

df = pd.read_csv('../data/data_nlp_A.csv', parse_dates=['last_publish_date', 'date_min'])

# features and target variable
num_bl = ['no_versions', 'word_count', 'h1_len', 'abstract_len', 'mean_version_lifetime']
cat_bl = ['classification_product', 'classification_type', 'media_type', 'sentiment_abstract', 'sentiment_meta_title']
target = 'external_clicks'

X = df[num_bl + cat_bl]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bl_katja = load(open('../models/bl_katja.pkl', 'rb'))

pred = bl_katja.predict(X_test)

In [None]:
y_train_t = powtr.fit_transform(y_train.to_frame())
y_test_t = powtr.transform(y_test.to_frame())

In [None]:
pred_nat = powtr.inverse_transform(pd.DataFrame(data=pred, columns=y_test.to_frame().columns))

In [None]:
pred_nat.shape

In [None]:
fig_bl_pred_real = px.scatter(x=y_test.values, y=pred_nat.reshape(pred_nat.shape[0]), 
                              #trendline='ols', trendline_color_override='darkred',
                              labels={'x': 'y test', 'y': 'predicted target'}
                              )
fig_bl_pred_real.show()

In [None]:
fig_bl_pred_real = px.scatter(x=y_test.values, y=pred_nat.reshape(pred_nat.shape[0])-y_test.values, 
                              #trendline='ols', trendline_color_override='darkred'
                              )
fig_bl_pred_real.show()

## Same model for other targets

In [1]:
import pandas as pd
from pickle import load
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/data_nlp_A.csv', parse_dates=['last_publish_date', 'date_min'])

# features and target variable
num_bl = ['no_versions', 'word_count', 'h1_len', 'abstract_len', 'mean_version_lifetime']
cat_bl = ['classification_product', 'classification_type', 'media_type', 'sentiment_abstract', 'sentiment_meta_title']
target = 'ext_impr_norm'

X = df[num_bl + cat_bl]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bl_katja = load(open('../models/bl_katja.pkl', 'rb'))

powtr = PowerTransformer()
y_train_t = powtr.fit_transform(y_train.to_frame())
y_test_t = powtr.transform(y_test.to_frame())

bl_katja.fit(X_train, y_train_t)

NameError: name 'PowerTransformer' is not defined

In [111]:
pred = bl_katja.predict(X_test)
pred

array([ 1.03136614, -0.52618174, -0.45653074, ..., -1.17664817,
       -0.84996561, -0.28876372])

In [112]:
r2_score(y_true=y_test_t.reshape(pred.shape), y_pred=pred)
#r2_score(y_true=y_test.values, y_pred=pred)

-0.0031060430041365272

In [113]:
px.scatter(x=y_test_t.reshape(pred.shape), y=pred)
#px.scatter(x=y_test.values, y=pred)

In [4]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

def plot_feature_importance(pipeline, save_as=False):
    """Parameters:
    - pipeline: must be an sklearn object (or something that supports sklearn API) that combines a preprocessor and a model
    - save_as: False if you don't wanna save the figure, filename if you do
    Returns:
    - fig
    """
    feat_imp = pipeline.steps[-1][1].feature_importances_
    feat_names = pipeline[:-1].get_feature_names_out()

    feature_importance = pd.DataFrame().from_records(
        data={'Features': [name.split('__')[-1] for name in feat_names], 'Importance': feat_imp})
    feature_importance = feature_importance.sort_values('Importance', ascending=True)
    fig = px.bar(feature_importance, 
                 y='Features', x='Importance',
                 orientation='h', height=800, width=800
                        )
    if save_as:
        pio.write_image(fig, save_as)
    return fig

In [5]:
plot_feature_importance(bl_katja)