In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Preamble: Environment Check

import sys
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns

print("Python version:", sys.version)
print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)
print("Matplotlib version:", matplotlib.__version__)
print("Seaborn version:", sns.__version__)

Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Pandas version: 2.2.2
NumPy version: 2.0.2
Matplotlib version: 3.10.0
Seaborn version: 0.13.2


In [3]:
df1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/OnlineNewsPopularity.csv')
df1

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.100000,0.70,-0.350000,-0.600,-0.200000,0.500000,-0.187500,0.000000,0.187500,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.70,-0.118750,-0.125,-0.100000,0.000000,0.000000,0.500000,0.000000,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.575130,1.0,0.663866,3.0,1.0,1.0,...,0.100000,1.00,-0.466667,-0.800,-0.133333,0.000000,0.000000,0.500000,0.000000,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.80,-0.369697,-0.600,-0.166667,0.000000,0.000000,0.500000,0.000000,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.540890,19.0,19.0,20.0,...,0.033333,1.00,-0.220192,-0.500,-0.050000,0.454545,0.136364,0.045455,0.136364,505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39639,http://mashable.com/2014/12/27/samsung-app-aut...,8.0,11.0,346.0,0.529052,1.0,0.684783,9.0,7.0,1.0,...,0.100000,0.75,-0.260000,-0.500,-0.125000,0.100000,0.000000,0.400000,0.000000,1800
39640,http://mashable.com/2014/12/27/seth-rogen-jame...,8.0,12.0,328.0,0.696296,1.0,0.885057,9.0,7.0,3.0,...,0.136364,0.70,-0.211111,-0.400,-0.100000,0.300000,1.000000,0.200000,1.000000,1900
39641,http://mashable.com/2014/12/27/son-pays-off-mo...,8.0,10.0,442.0,0.516355,1.0,0.644128,24.0,1.0,12.0,...,0.136364,0.50,-0.356439,-0.800,-0.166667,0.454545,0.136364,0.045455,0.136364,1900
39642,http://mashable.com/2014/12/27/ukraine-blasts/,8.0,6.0,682.0,0.539493,1.0,0.692661,10.0,1.0,1.0,...,0.062500,0.50,-0.205246,-0.500,-0.012500,0.000000,0.000000,0.500000,0.000000,1100


## Data preprocessing

### Subtask:
Prepare the data for modeling, which may include handling outliers, feature scaling, and encoding categorical variables.


**Reasoning**:
Apply logarithmic transformation to the 'shares' column, identify numerical and categorical columns, apply standard scaling to numerical features, apply one-hot encoding to categorical features, and concatenate the processed features into a new DataFrame.



In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd

# 1. Handle outliers in the 'shares' column by applying a logarithmic transformation.
df1['shares_log'] = np.log1p(df1['shares'])

# 2. Identify the numerical and categorical columns (excluding 'url' and the original 'shares' column).
# Make a copy to avoid modifying the list during iteration if it were iterated on later
numerical_cols = df1.select_dtypes(include=np.number).columns.tolist()
if 'shares' in numerical_cols:
    numerical_cols.remove('shares')
if 'shares_log' in numerical_cols:
    numerical_cols.remove('shares_log')

categorical_cols = df1.select_dtypes(include='object').columns.tolist()
# Remove 'url' from the list of categorical columns to be one-hot encoded,
# as it's typically an identifier and not a feature for OHE.
# It will be handled by remainder='passthrough' and then explicitly dropped.
if 'url' in categorical_cols:
    categorical_cols.remove('url')


# Manually fit the OneHotEncoder to get feature names (even if categorical_cols is empty, this pattern is fine)
ohe = OneHotEncoder(handle_unknown='ignore')
if categorical_cols: # Only fit if there are actual categorical columns
    ohe.fit(df1[categorical_cols])
    categorical_feature_names = ohe.get_feature_names_out(categorical_cols)
else:
    categorical_feature_names = np.array([])


# Create preprocessor to scale numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough' # Keep other columns (like 'url', original 'shares', 'shares_log')
)

# Apply preprocessing: fit and transform the data
processed_data = preprocessor.fit_transform(df1)

# 6. Construct the list of all feature names after preprocessing.
transformed_feature_names = preprocessor.get_feature_names_out()
all_column_names = list(transformed_feature_names)

# 7. Create a new DataFrame from the transformed data and the combined feature names.
processed_df = pd.DataFrame(processed_data, columns=all_column_names)

# --- FIX FOR DTYPE ISSUE AND UNWANTED PASSTHROUGH COLUMNS ---
# The 'remainder__shares_log' column might have been cast to 'object' dtype due to 'remainder__url' being a string.
# We need to ensure 'remainder__shares_log' is numeric and drop the 'url' and original 'shares' columns.

# Convert 'remainder__shares_log' to numeric type
if 'remainder__shares_log' in processed_df.columns:
    processed_df['remainder__shares_log'] = pd.to_numeric(processed_df['remainder__shares_log'])

# Drop other 'remainder__' columns that are not 'remainder__shares_log' (e.g., 'remainder__url', 'remainder__shares').
passthrough_cols_to_drop_final = [col for col in processed_df.columns if col.startswith('remainder__') and col != 'remainder__shares_log']
processed_df = processed_df.drop(columns=passthrough_cols_to_drop_final, errors='ignore')

# 8. Display the head of the resulting processed DataFrame.
display(processed_df.head())

Unnamed: 0,num__timedelta,num__n_tokens_title,num__n_tokens_content,num__n_unique_tokens,num__n_non_stop_words,num__n_non_stop_unique_tokens,num__num_hrefs,num__num_self_hrefs,num__num_imgs,num__num_videos,...,num__min_positive_polarity,num__max_positive_polarity,num__avg_negative_polarity,num__min_negative_polarity,num__max_negative_polarity,num__title_subjectivity,num__title_sentiment_polarity,num__abs_title_subjectivity,num__abs_title_sentiment_polarity,remainder__shares_log
0,1.75788,0.757447,-0.69521,0.032772,0.000675,0.038658,-0.607463,-0.335566,-0.426526,-0.304268,...,0.063865,-0.228941,-0.708369,-0.268895,-0.969886,0.671245,-0.975432,-1.810719,0.13892,6.386879
1,1.75788,-0.661657,-0.618794,0.016056,0.000675,0.031479,-0.695709,-0.594963,-0.426526,-0.304268,...,-0.870968,-0.228941,1.102174,1.367424,0.078642,-0.870807,-0.269076,0.837749,-0.689658,6.568078
2,1.75788,-0.661657,-0.712192,0.007645,0.000675,-0.007752,-0.695709,-0.594963,-0.426526,-0.304268,...,0.063865,0.981798,-1.621797,-0.957871,-0.270867,-0.870807,-0.269076,0.837749,-0.689658,7.313887
3,1.75788,-0.661657,-0.032933,-0.012619,0.000675,-0.007211,-0.166229,-0.85436,-0.426526,-0.304268,...,0.573773,0.174639,-0.862584,-0.268895,-0.620377,-0.870807,-0.269076,0.837749,-0.689658,7.09091
4,1.75788,1.230482,1.115439,-0.037655,0.000675,-0.04542,0.716237,4.074185,1.860061,-0.304268,...,-0.870968,0.981798,0.307944,0.075594,0.602906,0.531059,0.244637,-1.569949,-0.087056,6.226537


## Feature selection/engineering

### Subtask:
Based on the EDA and domain knowledge, select the most relevant features or create new ones to improve model performance.


**Reasoning**:
Select the features from `processed_df` that are most relevant for predicting 'shares_log' based on the EDA findings and create a new DataFrame with these selected features and the target variable.



In [14]:
# Based on EDA, features with some correlation or noticeable differences in box plots include:
# Numerical: 'num_hrefs', 'num_imgs', 'num_videos', 'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess'
# Categorical: 'data_channel_is_*', 'weekday_is_*', 'is_weekend'

# We need to find the corresponding column names in the processed_df
selected_features = ['remainder__shares_log',
                     'num__num_hrefs', 'num__num_imgs', 'num__num_videos',
                     'num__kw_avg_avg', 'num__self_reference_min_shares',
                     'num__self_reference_max_shares', 'num__self_reference_avg_sharess']

# Add processed categorical features. We can include all of them since there are not too many.
categorical_processed_cols = [col for col in processed_df.columns if 'cat__' in col]
selected_features.extend(categorical_processed_cols)

# Create a new DataFrame with the selected features and the target variable
df_selected = processed_df[selected_features]

# Display the first few rows of the selected DataFrame
display(df_selected.head())

Unnamed: 0,remainder__shares_log,num__num_hrefs,num__num_imgs,num__num_videos,num__kw_avg_avg,num__self_reference_min_shares,num__self_reference_max_shares,num__self_reference_avg_sharess
0,6.386879,-0.607463,-0.426526,-0.304268,-2.379014,-0.177459,-0.239676,-0.243926
1,6.568078,-0.695709,-0.426526,-0.304268,-2.379014,-0.202587,-0.251766,-0.264412
2,7.313887,-0.695709,-0.426526,-0.304268,-2.379014,-0.156079,-0.22939,-0.226496
3,7.09091,-0.166229,-0.426526,-0.304268,-2.379014,-0.202587,-0.251766,-0.264412
4,6.226537,0.716237,1.860061,-0.304268,-2.379014,-0.174976,0.138221,-0.134259
