In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the merged dataset (presumably written by an earlier processing step — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files this project produced itself.
data = pd.read_pickle('../data/processed/final_merged_data.pkl')

In [None]:
# Summary statistics for estimated_revenue (presumably the regression target; it is
# excluded from the feature matrix further down).
data["estimated_revenue"].describe()

In [None]:
# Select the three columns that are multi-label one-hot encoded later in the notebook.
possible_encoded_columns = data[["Tags", "Categories", "Genres"]]

In [None]:
# Display the selected columns to inspect their raw format before encoding.
possible_encoded_columns

In [None]:


def _split_labels(value):
    """Normalize one raw cell into a list of clean label strings.

    A list-like cell (list, tuple, array, ...) is used element by element,
    a missing value (NaN/None) becomes an empty list, and anything else is
    converted to a string and split on commas. Labels are stripped of
    surrounding whitespace and empty labels are dropped.
    """
    if pd.api.types.is_list_like(value):
        # Already tokenized: calling str() on the whole cell would glue
        # brackets and quotes onto the labels (e.g. "['Action'").
        parts = [part for part in value if part is not None]
    elif pd.isna(value):
        return []
    else:
        parts = str(value).split(",")
    stripped = (str(part).strip() for part in parts)
    return [label for label in stripped if label]


def build_top_features(df, column, top_n=30):
    """One-hot encode the ``top_n`` most frequent labels of a multi-label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of a column whose cells are comma-separated label strings
        (e.g. ``"Action, Indie"``) or list-likes of labels. Missing values
        are treated as "no labels".
    top_n : int, default 30
        How many of the most frequent labels to keep; rarer labels are
        dropped and get no column.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per kept label, named ``"<column>_<label>"``, ordered
        from most to least frequent, with the same index as ``df``.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    # One list of clean labels per row (object dtype so an empty frame works too).
    items = pd.Series(
        [_split_labels(value) for value in df[column]],
        index=df.index,
        dtype=object,
    )

    # explode() creates one row per label (rows without labels become NaN,
    # which value_counts() ignores), so this is the frequency of each label.
    freq = items.explode().value_counts()

    # value_counts() sorts by descending frequency, so head(top_n) is the top N.
    top_items = list(freq.head(top_n).index)

    # MultiLabelBinarizer warns ("unknown class(es) ... will be ignored") about
    # every label that is not in `classes`. Drop the rarer labels ourselves so
    # only known labels reach it and the notebook output stays clean.
    kept = set(top_items)
    items_top = [[label for label in labels if label in kept] for labels in items]

    # Multi-label one-hot encoding: one output column per entry of
    # `classes`, in the given order.
    mlb = MultiLabelBinarizer(classes=top_items)

    # arr is a binary matrix of shape (rows, kept labels), e.g. [[0, 1], [1, 0], ...].
    arr = mlb.fit_transform(items_top)

    # Wrap the matrix in a DataFrame aligned with df, prefixing each column
    # with the source column name (e.g. "Tags_Action").
    encoded_df = pd.DataFrame(
        arr,
        columns=[f"{column}_{label}" for label in mlb.classes_],
        index=df.index,
    )

    return encoded_df


In [None]:
# One-hot encode the most frequent labels of each multi-label column
# (30 tags, 20 genres, 10 categories); the calls run in this order.
tags_encoded, genres_encoded, categories_encoded = (
    build_top_features(data, column, top_n=limit)
    for column, limit in (("Tags", 30), ("Genres", 20), ("Categories", 10))
)

# Append the encoded columns to the original frame, side by side.
data_encoded = pd.concat(
    [data, tags_encoded, genres_encoded, categories_encoded],
    axis="columns",
)


In [None]:
# List all column names to check that the encoded feature columns were appended.
data_encoded.columns

In [None]:
# Columns kept out of the feature matrix: presumably identifiers/metadata,
# the raw multi-label columns (their encoded versions are kept) and the
# revenue columns (likely the regression targets — confirm).
exclude_cols = [
    "app_id",
    "title",
    "release",
    "Categories",
    "Genres",
    "Tags",
    "estimated_revenue",
    "log_estimated_revenue",
]

# Feature matrix X: every remaining column. errors="ignore" silently skips
# any listed column that is not present in data_encoded.
X = data_encoded.drop(labels=exclude_cols, axis="columns", errors="ignore")


In [None]:
# Persist the feature matrix for the regression step; the excluded columns
# (including estimated_revenue) are not part of this file.
X.to_pickle("../data/processed/matrix_ready_for_regression.pkl")