# Feature Engineering

### Library Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import category_encoders as ce
import warnings

warnings.filterwarnings("ignore")

### Load the dataset and take a 100k sample for efficiency

In [2]:
# Upper bound on the number of rows kept for experimentation; the full table
# is only used as the sampling source.
SAMPLE_SIZE = 100_000

df_original = pd.read_parquet("data_usage_production.parquet")

# DataFrame.sample raises ValueError when n exceeds the number of rows (with
# the default replace=False), so clamp the request to the table size. When at
# least SAMPLE_SIZE rows are available this draws exactly the same rows as a
# plain n=100000 request with the same seed.
df = df_original.sample(n=min(SAMPLE_SIZE, len(df_original)), random_state=57)

### Data Preparation and Splitting

In [3]:
# Move "telephone_number" out of the columns and into the index so it is not
# part of the feature matrix built below.
# The guard keeps this cell re-runnable: an unconditional in-place set_index
# raises KeyError on a second run because the column has already been moved.
if df.index.name != "telephone_number":
    df = df.set_index("telephone_number")

# Every column except the target is used as a feature.
X = df.drop(columns="data_compl_usg_local_m1")
y = df["data_compl_usg_local_m1"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=57)

### Preprocessing Pipeline Construction

In [4]:
# Split the feature names by dtype: numeric columns first, then object /
# category columns. Columns of any other dtype end up in neither list.
numerical_cols, categorical_cols = (
    X.select_dtypes(include=dtype_filter).columns.tolist()
    for dtype_filter in (np.number, ["object", "category"])
)

# Numeric branch: fill gaps with the median, apply a Yeo-Johnson power
# transform, then scale with RobustScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("transformer", PowerTransformer(method="yeo-johnson")),
    ("scaler", RobustScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then encode
# with CatBoostEncoder (a target-based encoder, so it needs y when fitted).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", ce.CatBoostEncoder(handle_unknown="value")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

### Model Training

In [5]:
# Route each column group through its own preprocessing branch. Columns that
# appear in neither list are not passed to any branch.
column_branches = [
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Chain preprocessing and the random forest so both are fitted together on
# the training rows only; the seed fixes the forest's randomness and
# n_jobs=-1 asks for all available cores.
forest = RandomForestRegressor(n_jobs=-1, random_state=57)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", forest),
    ]
)

# Left as the last expression so the cell displays the fitted pipeline.
model.fit(X_train, y_train)

### Evaluation

In [6]:
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

# Each metric takes (true values, predictions) in that order.
r2, mse, mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

for report_line in (
    f"R-squared: {r2}",
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
):
    print(report_line)

R-squared: 0.6855317949074238
Mean Squared Error: 15744705.304443616
Mean Absolute Error: 1104.237412415
