In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, classification_report,mean_absolute_error, mean_squared_error

pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv('/Users/clara/Desktop/neuefische/d-drivers/data/data_features.csv')

In [3]:
df.dropna(axis=0, how='any', inplace=True)
df.drop(['page_id','url','video_play', 'page_impressions', 'clickouts','last_author','date_scraped','scraped_word_count','meta_title', 'meta_description',
         'abstract','meta_image_url','page_img_size', 'merged_url','last_publish_date', 'page_name', 'title','h1'], axis=1, inplace= True)
df['publish_date_min'] = pd.to_datetime(df['publish_date_min'])

In [4]:
# Filter columns of a specific data type (e.g., numerical columns)
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Filter columns of a specific data type (e.g., categorical columns)
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Filter columns of a specific data type (e.g., datetime columns)
datetime_columns = df.select_dtypes(include=['datetime']).columns.tolist()

# Combine numerical and categorical features
df_feat = df[numeric_columns + categorical_columns]
df_feat.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)

# One-hot encode all categorical features
df_enc = pd.get_dummies(df_feat, columns=categorical_columns, drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feat.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)


In [5]:
def lin_reg_evaluation(X, y, test_size=0.3, random_state=25):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Fit the linear regression model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Predictions on training and testing sets
    y_pred_train = lin_reg.predict(X_train)
    y_pred_test = lin_reg.predict(X_test)


    # Evaluation metrics
    print("R-squared (Train):", r2_score(y_train, y_pred_train).round(3))
    print("R-squared (Test):", r2_score(y_test, y_pred_test).round(3))
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_test).round(3))
    print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_test).round(3))
    print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred_test)).round(3))


In [6]:
author_col = df_enc.filter(regex='author', axis=1)
media_col = df_enc.filter(regex='media', axis=1)
product_col = df_enc.filter(regex='classification_product', axis=1)
type_col = df_enc.filter(regex='classification_type', axis=1)

In [7]:
target = ['external_impressions', 'external_clicks', 'ctr']

columns = ['no_versions','n_days','classification_product','classification_type','scraped_author',
           'likes_n_days','dislikes_n_days','scraped_word_count','media_type','meta_title_len','meta_desc_len','h1_len','abstract_len']

X = {"Length related features" : "df[['word_count','meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len']]",
    "All features": "df_enc",
    "Performance metrics" : "df_enc[['likes_n_days','dislikes_n_days']]",
    "EDA identified features": "pd.concat([media_col, author_col, product_col, df_enc[['n_days','no_versions']]], axis=1)"
    }

for key, value in X.items():
    print(f"================ {key} ================")
    features = eval(value) 
    for item in target:
        y = df[item]
        print(f"Evaluating for target: {item}")
        lin_reg_evaluation(features, y)
        print()

Evaluating for target: external_impressions
R-squared (Train): 0.02
R-squared (Test): 0.031
Mean Absolute Error (MAE): 175121.075
Mean Squared Error (MSE): 137918101329.924
Root Mean Squared Error (RMSE): 371373.264

Evaluating for target: external_clicks
R-squared (Train): 0.017
R-squared (Test): 0.024
Mean Absolute Error (MAE): 13895.248
Mean Squared Error (MSE): 832356819.72
Root Mean Squared Error (RMSE): 28850.595

Evaluating for target: ctr
R-squared (Train): 0.036
R-squared (Test): 0.033
Mean Absolute Error (MAE): 2.845
Mean Squared Error (MSE): 16.062
Root Mean Squared Error (RMSE): 4.008

Evaluating for target: external_impressions
R-squared (Train): 0.329
R-squared (Test): 0.306
Mean Absolute Error (MAE): 161108.062
Mean Squared Error (MSE): 98786842451.418
Root Mean Squared Error (RMSE): 314303.742

Evaluating for target: external_clicks
R-squared (Train): 0.274
R-squared (Test): 0.274
Mean Absolute Error (MAE): 13002.527
Mean Squared Error (MSE): 618959338.112
Root Mean Squ

In [8]:
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 20)
df_group = df[['no_versions',
 'n_days',
 #'classification_product',
 'classification_type',
 #'scraped_author',
 'likes_n_days',
 'dislikes_n_days',
 #'media_type',
 'meta_title_len',
 'meta_desc_len',
 'h1_len',
 'abstract_len',
 'external_impressions','external_clicks','ctr']].groupby(by="classification_type").median()
df_group

Unnamed: 0_level_0,no_versions,n_days,likes_n_days,dislikes_n_days,meta_title_len,meta_desc_len,h1_len,abstract_len,external_impressions,external_clicks,ctr
classification_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Deal,1.0,15.5,0.0,0.0,71.0,153.0,73.0,272.0,87566.0,5439.5,6.66396
Erfahrungsbericht,0.0,8.0,0.0,0.0,73.0,153.0,73.0,267.0,60014.0,3712.0,6.211161
Kaufberatung,1.0,12.0,0.0,0.0,69.0,152.0,70.0,281.0,44687.0,2738.0,5.628967
News,0.0,8.0,0.0,0.0,72.0,153.0,72.0,254.0,40209.5,2802.5,6.934135
Ratgeber,0.0,9.0,0.0,0.0,68.0,151.0,71.0,272.0,51269.0,3135.0,6.010019
Test,0.5,11.5,0.0,0.0,73.0,154.0,74.0,281.5,41426.5,3837.0,6.993795
Video,0.0,7.0,0.0,0.0,79.0,149.0,79.0,285.0,9741.0,522.0,5.358793


In [9]:
df_news = df.query("classification_type == 'News'")

R-squared (Train): 0.255
R-squared (Test): 0.173

In [10]:
df_news = df.query("classification_type == 'Ratgeber'")

R-squared (Train): 0.505
R-squared (Test): 0.366

In [11]:
df_news = df.query("classification_type == 'Kaufberatung'")

R-squared (Train): 0.832
R-squared (Test): -0.253^

In [12]:
df_news = df.query("classification_type == 'Deal'")

R-squared (Train): 0.687
R-squared (Test): 0.267

In [13]:
df_news = df.query("classification_type == 'Test'")

R-squared (Train): 1.0
R-squared (Test): -5.307

In [14]:
df_news = df.query("classification_type == 'Erfahrungsbericht'")

R-squared (Train): 0.295
R-squared (Test): -1.895

In [19]:
df_news = df.query("classification_type == 'News'")

R-squared (Train): 1.0
R-squared (Test): nan

In [20]:
def encode_features(df):
    # Filter columns of a specific data type (e.g., numerical columns)
    numeric_columns = df_news.select_dtypes(include=['number']).columns.tolist()

    # Filter columns of a specific data type (e.g., categorical columns)
    categorical_columns = df_news.select_dtypes(include=['object', 'category']).columns.tolist()

    # Filter columns of a specific data type (e.g., datetime columns)
    datetime_columns = df_news.select_dtypes(include=['datetime']).columns.tolist()

    # Combine numerical and categorical features
    df_feat = df_news[numeric_columns + categorical_columns]
    df_feat.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)

    # One-hot encode all categorical features
    df_enc = pd.get_dummies(df_feat, columns=categorical_columns, drop_first=True)
    
    return df_enc

In [21]:
df_enc = encode_features(df_news)

author_col = df_enc.filter(regex='author', axis=1)
media_col = df_enc.filter(regex='media', axis=1)
product_col = df_enc.filter(regex='classification_product', axis=1)
type_col = df_enc.filter(regex='classification_type', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feat.drop(['external_clicks','external_impressions','ctr'],axis=1,inplace=True)


In [22]:
target = ['external_impressions', 'external_clicks', 'ctr']

columns = ['no_versions','n_days','classification_product','scraped_author',
           'likes_n_days','dislikes_n_days','scraped_word_count','media_type','meta_title_len','meta_desc_len','h1_len','abstract_len']

X = {#"Length related features" : "df_enc[['word_count','meta_title_len', 'meta_desc_len', 'h1_len', 'abstract_len']]",
    "All features": "df_enc",
    #"Performance metrics" : "df_enc[['likes_n_days','dislikes_n_days']]",
    #"EDA identified features": "pd.concat([media_col, author_col, product_col, df_news['n_days']], axis=1)"
    }

for key, value in X.items():
    print(f"================ {key} ================")
    features = eval(value) 
    for item in target:
        y = df_news[item]
        print(f"Evaluating for target: {item}")
        lin_reg_evaluation(features, y)
        print()

Evaluating for target: external_impressions
R-squared (Train): 0.255
R-squared (Test): 0.173
Mean Absolute Error (MAE): 152468.828
Mean Squared Error (MSE): 105959965005.101
Root Mean Squared Error (RMSE): 325514.923

Evaluating for target: external_clicks
R-squared (Train): 0.227
R-squared (Test): 0.141
Mean Absolute Error (MAE): 12812.222
Mean Squared Error (MSE): 744446817.953
Root Mean Squared Error (RMSE): 27284.553

Evaluating for target: ctr
R-squared (Train): 0.12
R-squared (Test): 0.036
Mean Absolute Error (MAE): 2.854
Mean Squared Error (MSE): 15.791
Root Mean Squared Error (RMSE): 3.974

