<a href="https://colab.research.google.com/github/Ron-levi1/Social-Media-Advertisement-Performance/blob/main/part_5_Feature_Engineering_%26_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the CSV below can be read (Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd


# Load the dataset stored as df_after_encoding.csv on the mounted drive.
df = pd.read_csv('/content/drive/MyDrive/df_after_encoding.csv')

# Sanity check: print the dimensions, then show the first rows as the cell output.
print(df.shape)
df.head()

Mounted at /content/drive
(400000, 59)


Unnamed: 0,event_id,ad_id,timestamp,event_type,campaign_id,target_interests,name,start_date,end_date,duration_days,...,target_interests_art,target_interests_health,target_interests_sports,target_interests_gaming,target_interests_lifestyle,target_interests_finance,target_interests_food,target_interests_travel,target_interests_fitness,target_interests_technology
0,65,1,2025-06-28 10:56:52,0,28,"['art', 'technology']",Campaign_28_Winter,2025-05-09,2025-06-30,52,...,1,0,0,0,0,0,0,0,0,1
1,228,1,2025-07-11 22:34:14,0,28,"['art', 'technology']",Campaign_28_Winter,2025-05-09,2025-06-30,52,...,1,0,0,0,0,0,0,0,0,1
2,391,1,2025-05-21 08:41:59,0,28,"['art', 'technology']",Campaign_28_Winter,2025-05-09,2025-06-30,52,...,1,0,0,0,0,0,0,0,0,1
3,491,1,2025-06-26 14:32:05,0,28,"['art', 'technology']",Campaign_28_Winter,2025-05-09,2025-06-30,52,...,1,0,0,0,0,0,0,0,0,1
4,572,1,2025-05-19 05:44:09,0,28,"['art', 'technology']",Campaign_28_Winter,2025-05-09,2025-06-30,52,...,1,0,0,0,0,0,0,0,0,1


###### Creating a new feature daily_budget by dividing the total campaign budget by its duration to capture the average daily investment level.

In [2]:
import numpy as np
# Average spend per campaign day. A division that yields +/-inf or NaN
# (e.g. a duration of 0 days) ends up as 0 instead of a non-finite value.
df['daily_budget'] = (
    df['total_budget']
    .div(df['duration_days'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [None]:
# Print every column with its non-null count and dtype (run after adding daily_budget).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 60 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   event_id                      400000 non-null  int64  
 1   ad_id                         400000 non-null  int64  
 2   timestamp                     400000 non-null  object 
 3   event_type                    400000 non-null  int64  
 4   campaign_id                   400000 non-null  int64  
 5   target_interests              400000 non-null  object 
 6   name                          400000 non-null  object 
 7   start_date                    400000 non-null  object 
 8   end_date                      400000 non-null  object 
 9   duration_days                 400000 non-null  int64  
 10  total_budget                  400000 non-null  float64
 11  user_age                      400000 non-null  int64  
 12  country                       400000 non-nul

###### Creating a choropleth map to visualize the geographical distribution of Event Type = 1 occurrences across countries, where darker shades indicate higher event frequencies.

In [3]:
# List the distinct country names; these are the values mapped to ISO-3 codes for the choropleth.
df['country'].unique()

array(['United States', 'United Kingdom', 'Australia', 'France', 'Canada',
       'Germany', 'Brazil', 'Japan', 'India', 'Mexico'], dtype=object)

In [4]:
!pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [5]:
import plotly.express as px
import pycountry

# Map country names to ISO-3 for consistent geographic identification
def get_country_code(name):
    """Return the ISO 3166-1 alpha-3 code for a country name, or None.

    Parameters
    ----------
    name : str
        Country name (or any identifier pycountry can look up).

    Returns
    -------
    str or None
        The three-letter code, or None when `name` is not a string
        (e.g. a missing value) or pycountry has no matching record.
    """
    # Missing values (NaN/None) can never match a country; skip the lookup.
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # Only "no such country" is treated as unmappable; other failures
        # (including KeyboardInterrupt) are no longer silently swallowed.
        return None

# Number of Event Type = 1 rows for each country
is_event1 = df["event_type"] == 1
country_sizes = df[is_event1].groupby("country", as_index=False).size()
event1_counts = country_sizes.rename(columns={"size": "event_count"})

# Attach the ISO-3 code of every country and discard the ones without a code
event1_counts = (
    event1_counts
    .assign(iso3=event1_counts["country"].apply(get_country_code))
    .dropna(subset=["iso3"])
)

# Figure-level styling: light theme, larger canvas, centred title
layout_options = dict(
    width=1100,
    height=600,
    title_font=dict(size=20),
    title_x=0.5,
    paper_bgcolor="white",
    plot_bgcolor="white",
    margin=dict(l=10, r=10, t=60, b=10),
    coloraxis_colorbar=dict(title="Event Count"),
)

# Map styling: flat world view with coastlines and country borders
geo_options = dict(
    projection_type="equirectangular",  # flat world
    showcountries=True,
    countrycolor="rgba(0,0,0,0.25)",
    showcoastlines=True,
    coastlinecolor="rgba(0,0,0,0.25)",
    showland=True,
    landcolor="rgba(240,240,240,0.5)",
    showocean=True,
    oceancolor="rgba(220,230,250,0.5)",
    showlakes=False,
    showrivers=False,
)

# Build the choropleth, then apply layout, map and hover settings in one chain
fig = (
    px.choropleth(
        event1_counts,
        locations="iso3",
        color="event_count",
        hover_name="country",
        title="Event Type = 1, Count by Country",
        color_continuous_scale="Reds",  # light-friendly palette
    )
    .update_layout(**layout_options)
    .update_geos(**geo_options)
    .update_traces(
        # Clean hover text: country name plus thousands-separated count
        hovertemplate="<b>%{hovertext}</b><br>Events: %{z:,}<extra></extra>"
    )
)

fig.show()

# Also export the interactive map as a standalone HTML file (plotly.js loaded from the CDN)
import plotly.io as pio
pio.write_html(fig, file="event_type1_map.html", include_plotlyjs="cdn", auto_open=False)


#### Feature Selection

###### Performing feature selection using multiple models (Lasso, Ridge, Gradient Boosting, and Random Forest) to identify and compare the most predictive features influencing the target variable.


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# Target variable (already binary: 0 = Impression, 1 = Other Events)
y = df["event_type"]

# Predictors: every column except the target and the columns listed below
X = df.drop(columns=[
    "event_type", "timestamp", "start_date", "end_date",
    "target_interests", "interests", "interests_list", "name", "country"
])

# Fixed seed so the tree ensembles produce the same importances on every run
RANDOM_STATE = 42

# --- STANDARDIZATION ---
# Scaling all features to mean = 0 and standard deviation = 1, while preserving
# the column names and the row index of X
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,  # keep original feature names
    index=X.index,      # keep rows aligned with y
)

# Fitting multiple models to identify which features contribute most to predicting the target.
# A feature counts as "selected" by a model when its coefficient / importance is non-zero.
# NOTE(review): all four estimators are regressors fitted on a binary target - confirm this is intended.

# Lasso penalizes less important features down to exactly zero
# NOTE(review): with alpha=5 the recorded output shows 0 for Lasso on every visible feature;
# a smaller alpha may be needed for Lasso to contribute to the vote.
lasso = Lasso(alpha=5).fit(X_scaled, y)
lasso_selected = (np.abs(lasso.coef_) > 0).astype(int)

# Ridge evaluates overall weight importance
# NOTE(review): Ridge shrinks coefficients but does not normally set them to exactly zero,
# so this check mostly flags only constant columns (e.g. `year` in the recorded output).
ridge = Ridge(alpha=5).fit(X_scaled, y)
ridge_selected = (np.abs(ridge.coef_) > 0).astype(int)

# Gradient Boosting ranks features by importance
gb = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_scaled, y)
gb_selected = (gb.feature_importances_ > 0).astype(int)

# Random Forest measures feature contribution via splits
rf = RandomForestRegressor(random_state=RANDOM_STATE).fit(X_scaled, y)
rf_selected = (rf.feature_importances_ > 0).astype(int)

# Creating summary table showing which features were selected by each model
selection_df = pd.DataFrame({
    'Feature': X_scaled.columns,
    'Lasso': lasso_selected,
    'GradientBoost': gb_selected,
    'RandomForest': rf_selected,
    'Ridge': ridge_selected
})

# Sum the number of selections for each feature (0-4, one vote per model)
selection_df['Sum'] = selection_df[['Lasso', 'GradientBoost', 'RandomForest','Ridge']].sum(axis=1)

# Output the results
print(selection_df)


                         Feature  Lasso  GradientBoost  RandomForest  Ridge  \
0                       event_id      0              1             1      1   
1                          ad_id      0              1             1      1   
2                    campaign_id      0              1             1      1   
3                  duration_days      0              1             1      1   
4                   total_budget      0              1             1      1   
5                       user_age      0              1             1      1   
6                           year      0              0             0      0   
7                          month      0              1             1      1   
8                            day      0              1             1      1   
9                           hour      0              1             1      1   
10                        minute      0              1             1      1   
11                        second      0             

###### Checking how many features received the highest observed score (Sum = 3, i.e. selected by three of the four models) and filtering out features with lower scores to focus on the most consistently important predictors.

In [None]:
# Rows of the summary table whose vote total equals 3
has_sum3 = selection_df["Sum"] == 3

# Names of those features, and how many there are
features_with_sum3 = selection_df.loc[has_sum3, "Feature"].tolist()
count_selected = len(features_with_sum3)

print("Number of features with Sum = 3:", count_selected)
print("\nFeatures with Sum = 3:")
print(features_with_sum3)

# From here on selection_df only holds the features with Sum = 3
selection_df = selection_df[has_sum3]

Number of features with Sum = 3: 41

Features with Sum = 3:
['event_id', 'ad_id', 'campaign_id', 'duration_days', 'total_budget', 'user_age', 'month', 'day', 'hour', 'minute', 'second', 'is_weekend', 'user_id_encoded', 'day_of_week_encoded', 'time_of_day_encoded', 'ad_platform_encoded', 'ad_type_encoded', 'target_gender_encoded', 'location_encoded', 'target_age_group_encoded', 'age_group_encoded', 'user_gender_encoded', 'month_name_encoded', 'user_interest_health', 'user_interest_travel', 'user_interest_art', 'user_interest_lifestyle', 'user_interest_sports', 'user_interest_news', 'user_interest_photography', 'user_interest_gaming', 'user_interest_finance', 'user_interest_food', 'target_interests_fashion', 'target_interests_photography', 'target_interests_health', 'target_interests_sports', 'target_interests_gaming', 'target_interests_travel', 'target_interests_fitness', 'daily_budget']


#### After filtering, 41 features remained with a score of 3 - indicating consistent selection by three of the four models.
#### Since this number is still high, further feature selection will be performed to narrow down the most relevant predictors.

###### Removing non-predictive features such as unique identifiers (IDs) and detailed timestamp components (year, month, day, hour, minute, second).
###### These columns do not contain meaningful information for prediction and may introduce noise or unnecessary complexity to the model.

In [None]:
# Features excluded from the selected list: unique identifiers and raw timestamp parts
cols_to_drop = [
    # identifiers
    "event_id",
    "ad_id",
    "campaign_id",
    "user_id_encoded",
    # timestamp components
    "year",
    "month",
    "day",
    "hour",
    "minute",
    "second",
]

# Keep only the rows whose feature name is not on the exclusion list
is_excluded = selection_df["Feature"].isin(cols_to_drop)
selection_df = selection_df.loc[~is_excluded]
print("Remaining features after removing IDs and time-related attributes:", len(selection_df))

Remaining features after removing IDs and time-related attributes: 32


In [None]:
# Column-wise totals of the user_interest_* columns, largest first
user_interest_cols = df.filter(regex=r'^user_interest_').columns.tolist()

interest_totals = df[user_interest_cols].sum()
interest_counts = interest_totals.sort_values(ascending=False)

print("User interest frequencies:")
print(interest_counts)

User interest frequencies:
user_interest_fitness        62794
user_interest_lifestyle      62635
user_interest_gaming         62089
user_interest_technology     62045
user_interest_travel         61999
user_interest_art            61944
user_interest_fashion        61296
user_interest_sports         61009
user_interest_health         60809
user_interest_food           60291
user_interest_news           60176
user_interest_photography    59843
user_interest_finance        59507
dtype: int64


###### Counting how many records show alignment between user and target interests for each category to identify where overlap is weakest or strongest.

In [None]:
# Collect the user-side and campaign-side interest columns
user_interest_cols = [c for c in df.columns if c.startswith("user_interest_")]
target_interest_cols = [c for c in df.columns if c.startswith("target_interests_")]

# Interest names available on the campaign side (built once, used for membership checks)
target_names = {t.replace("target_interests_", "") for t in target_interest_cols}

# Interests present on both sides, in the order of the user columns
common_interests = []
for col in user_interest_cols:
    interest_name = col.replace("user_interest_", "")
    if interest_name in target_names:
        common_interests.append(interest_name)

# For each shared interest, count the rows where both the user flag and the target flag are 1
for interest in common_interests:
    user_flag = df[f"user_interest_{interest}"] == 1
    target_flag = df[f"target_interests_{interest}"] == 1
    matches = (user_flag & target_flag).sum()
    print(f"{interest.capitalize()}: {matches} matches")


Fitness: 6244 matches
Health: 8647 matches
Fashion: 8108 matches
Travel: 8557 matches
Art: 6547 matches
Technology: 6732 matches
Lifestyle: 6117 matches
Sports: 7256 matches
News: 6940 matches
Photography: 6602 matches
Gaming: 8411 matches
Finance: 8462 matches
Food: 3561 matches


#### The analysis of matches between user and campaign interests reveals varying levels of audience alignment across categories.
###### Domains such as Health, Travel, Finance, and Gaming show the highest overlap, indicating that campaigns in these areas are effectively reaching users whose interests align with the targeted themes.
###### Conversely, categories like Food, Lifestyle, and Fitness display the lowest match counts, suggesting a weaker targeting accuracy or a potential mismatch between campaign focus and user preferences.
###### These insights highlight where audience segmentation and targeting strategies are performing well and where refinement may be needed to improve campaign relevance and engagement.

###### Removing the campaign-side `target_interests_*` columns of the four lowest-overlap interests (Food, Lifestyle, Fitness, and Art), which showed the weakest alignment between user and campaign interests, as they contribute less to predicting user engagement. The corresponding `user_interest_*` columns are kept.

In [None]:
# The four interests with the lowest user-target match counts
low_match_interests = ["food", "lifestyle", "fitness", "art"]

# Only the campaign-side (target_interests_*) columns of these interests are removed;
# the matching user_interest_* columns stay in the selection.
cols_to_drop = ["target_interests_" + interest for interest in low_match_interests]

keep_mask = ~selection_df["Feature"].isin(cols_to_drop)
selection_df = selection_df[keep_mask]

print("Remaining features after removing lowest-overlap interests:", len(selection_df))

Remaining features after removing lowest-overlap interests: 31


In [None]:
# Print the surviving feature names, one per line, under a header with their count
remaining_features = selection_df["Feature"].tolist()
print(f"Remaining features ({len(remaining_features)}):")
for feature_name in remaining_features:
    print(feature_name)


Remaining features (31):
duration_days
total_budget
user_age
is_weekend
day_of_week_encoded
time_of_day_encoded
ad_platform_encoded
ad_type_encoded
target_gender_encoded
location_encoded
target_age_group_encoded
age_group_encoded
user_gender_encoded
month_name_encoded
user_interest_health
user_interest_travel
user_interest_art
user_interest_lifestyle
user_interest_sports
user_interest_news
user_interest_photography
user_interest_gaming
user_interest_finance
user_interest_food
target_interests_fashion
target_interests_photography
target_interests_health
target_interests_sports
target_interests_gaming
target_interests_travel
daily_budget


###### Further refining the feature set by removing redundant or less informative attributes. Campaign-level targeting interests are partially overlapping with user interests, and temporal features like month_name_encoded and is_weekend contribute marginally to prediction.
#### The goal is to retain around 20–25 meaningful, non-redundant features for model training.

In [None]:
# Manual pruning of less informative or redundant features, grouped by reason

# Temporal: is_weekend is already represented by day_of_week_encoded,
# month_name_encoded adds minimal new information
temporal_remove = ["is_weekend", "month_name_encoded"]

# Campaign targeting: gender is redundant with user_gender_encoded,
# age group overlaps with user_age/user_interest
targeting_remove = ["target_gender_encoded", "target_age_group_encoded"]

# Campaign-level targeting interests, already reflected by user interests
# NOTE(review): "target_interests_finance" is not among the features printed by the
# previous cell, so that entry has no effect, while "target_interests_travel" is not
# listed here and therefore stays in the final set - confirm whether that is intended.
target_interest_remove = [
    "target_interests_fashion",
    "target_interests_photography",
    "target_interests_health",
    "target_interests_sports",
    "target_interests_gaming",
    "target_interests_finance",
]

manual_remove = temporal_remove + targeting_remove + target_interest_remove

is_removed = selection_df["Feature"].isin(manual_remove)
selection_df = selection_df.loc[~is_removed]
print("Remaining features after manual refinement:", len(selection_df))
print(selection_df["Feature"].tolist())


Remaining features after manual refinement: 22
['duration_days', 'total_budget', 'user_age', 'day_of_week_encoded', 'time_of_day_encoded', 'ad_platform_encoded', 'ad_type_encoded', 'location_encoded', 'age_group_encoded', 'user_gender_encoded', 'user_interest_health', 'user_interest_travel', 'user_interest_art', 'user_interest_lifestyle', 'user_interest_sports', 'user_interest_news', 'user_interest_photography', 'user_interest_gaming', 'user_interest_finance', 'user_interest_food', 'target_interests_travel', 'daily_budget']


###### Saving the final feature-selected dataset (22 features plus the `event_type` target, 23 columns in total) into a new DataFrame for model training in the next stage (Model Selection and Fine Tuning).

In [None]:
# Final column set: the selected features followed by the target column
final_columns = selection_df["Feature"].tolist()
final_columns.append("event_type")
df_after_lasso = df[final_columns]

# Check the dimensions and preview the first rows
print("Shape of df_after_lasso:", df_after_lasso.shape)
df_after_lasso.head()

Shape of df_after_lasso: (400000, 23)


Unnamed: 0,duration_days,total_budget,user_age,day_of_week_encoded,time_of_day_encoded,ad_platform_encoded,ad_type_encoded,location_encoded,age_group_encoded,user_gender_encoded,...,user_interest_lifestyle,user_interest_sports,user_interest_news,user_interest_photography,user_interest_gaming,user_interest_finance,user_interest_food,target_interests_travel,daily_budget,event_type
0,52,32844.79,37,2,2,0,3,2434,3,1,...,0,0,0,0,0,0,0,0,631.630577,0
1,52,32844.79,28,0,1,0,3,1818,2,1,...,0,0,0,1,0,0,0,0,631.630577,0
2,52,32844.79,55,6,2,0,3,3451,5,0,...,0,0,1,1,0,0,0,0,631.630577,0
3,52,32844.79,40,4,0,0,3,362,3,1,...,1,0,0,0,0,0,0,0,631.630577,0
4,52,32844.79,17,1,3,0,3,4363,0,0,...,0,0,0,0,0,0,0,0,631.630577,0


In [None]:
# Write the feature-selected frame to Google Drive; index=False keeps the row index out of the CSV.
df_after_lasso.to_csv("/content/drive/MyDrive/df_after_lasso.csv", index=False)