# Feature Engineering

## Step 1: Import necessary libraries

In [36]:
import pandas as pd
import numpy as np

## Step 2: Data Loading

In [37]:
# Read the input table, using the 'Symbol' column as the row index,
# then print a schema / null-count summary.
df = pd.read_csv(
    "../Dataset/sp500_for_engineering.csv",
    index_col="Symbol",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 499 entries, AAPL to AMTM
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Industry           499 non-null    object 
 1   Currentprice       499 non-null    float64
 2   City               499 non-null    object 
 3   Fulltimeemployees  490 non-null    float64
 4   Weight             499 non-null    float64
 5   Growth_Category    499 non-null    object 
dtypes: float64(3), object(3)
memory usage: 27.3+ KB


## Step 3: Handle Missing Values

In [38]:
# Impute missing 'Fulltimeemployees' values in two passes:
#   1. the mean headcount of the row's own industry, which is more
#      representative than the overall mean;
#   2. the overall mean, as a fallback for industries in which every row is
#      missing (their group mean is NaN, so pass 1 alone would leave the gap
#      and it would propagate into the log features computed later).
# Existing (non-null) values are never modified.
df['Fulltimeemployees'] = (
    df['Fulltimeemployees']
    .fillna(df.groupby('Industry')['Fulltimeemployees'].transform('mean'))
    .fillna(df['Fulltimeemployees'].mean())
)

## Step 4: Target Encoding with KFold (Leakage-safe)

In [39]:
def safe_target_encode(df, cat_column, target_column, n_splits=5, random_state=42):
    """Add an out-of-fold target-mean encoding of ``cat_column``.

    The rows are shuffled and divided into ``n_splits`` folds. For each fold,
    the per-category mean of ``target_column`` is computed on the *other*
    folds only and assigned to the rows of that fold, so a row's own target
    never contributes to its category mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    cat_column : str
        Name of the categorical column to encode.
    target_column : str
        Name of the numeric target column.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= len(df)``.
    random_state : int, default 42
        Seed for the shuffle that assigns rows to folds.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra float column named
        ``cat_column + '_te'``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2 or larger than the number of rows.

    Notes
    -----
    Categories that do not occur in a row's training folds fall back to the
    mean of ``target_column`` over *all* rows (which includes that row).
    """
    n_rows = len(df)
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")
    if n_splits > n_rows:
        raise ValueError(
            f"n_splits={n_splits} cannot exceed the number of rows ({n_rows})"
        )

    df = df.copy()
    encoded_column = cat_column + '_te'

    # Shuffled K-fold built with numpy only, so the function does not depend on
    # a scikit-learn import being present in the notebook.
    # NOTE(review): intended to yield the same folds as scikit-learn's
    # KFold(n_splits, shuffle=True, random_state=...) -- confirm if exact parity
    # with previously saved outputs matters.
    shuffled = np.arange(n_rows)
    np.random.RandomState(random_state).shuffle(shuffled)

    encoded = np.full(n_rows, np.nan)
    for fold_positions in np.array_split(shuffled, n_splits):
        in_val = np.zeros(n_rows, dtype=bool)
        in_val[fold_positions] = True

        # Category means come from the training rows of this fold only.
        means = df.iloc[~in_val].groupby(cat_column)[target_column].mean()

        # Write by position rather than by label so that duplicate index
        # labels cannot cause misaligned or failing assignments.
        encoded[in_val] = df.iloc[in_val][cat_column].map(means).to_numpy(dtype=float)

    df[encoded_column] = encoded

    # Fill categories unseen in the training folds with the global mean.
    # Plain reassignment is used because in-place fillna on a column selection
    # is chained assignment and is not guaranteed to update the frame.
    global_mean = df[target_column].mean()
    df[encoded_column] = df[encoded_column].fillna(global_mean)

    return df

In [40]:
# Ordinal codes for the growth buckets: position in this list is the code (0-4).
category_map = {
    label: rank
    for rank, label in enumerate(
        ['Declining', 'Low Growth', 'Moderate Growth', 'High Growth', 'Hyper Growth']
    )
}
# Labels that are not keys of category_map become NaN.
df['Growth_Category_numeric'] = df['Growth_Category'].map(category_map)

In [41]:
# Target-encode the high-cardinality categoricals against the numeric growth
# label; this adds the 'Industry_te' and 'City_te' columns, in that order.
df = (
    df
    .pipe(safe_target_encode, 'Industry', 'Growth_Category_numeric')
    .pipe(safe_target_encode, 'City', 'Growth_Category_numeric')
)

## Step 5: Log-transform numerical columns

In [42]:
# log(1 + x) versions of price and headcount
df['Log_Currentprice'] = df['Currentprice'].pipe(np.log1p)
df['Log_Employees'] = df['Fulltimeemployees'].pipe(np.log1p)

# Price divided by (headcount + 1); the +1 keeps the denominator non-zero
# when the headcount is 0. The ratio is then log(1 + x)-transformed as well.
df['Price_per_employee'] = df['Currentprice'].div(df['Fulltimeemployees'].add(1))
df['Log_PPE'] = df['Price_per_employee'].pipe(np.log1p)

## Step 6: Additional Derived Features

In [43]:
# Headcount divided by (Currentprice + 1). Note that, despite the column name,
# only the 'Currentprice' column enters the denominator.
df['Employees_per_million_usd'] = df['Fulltimeemployees'].div(df['Currentprice'].add(1))

# Binary flags: 1 when the value is strictly above the column median, else 0.
df['Is_large_employer'] = (
    df['Fulltimeemployees'].gt(df['Fulltimeemployees'].median()).astype(int)
)
df['Is_expensive_stock'] = (
    df['Currentprice'].gt(df['Currentprice'].median()).astype(int)
)

# Interaction term: product of the two target-encoded columns.
df['City_Industry_combo'] = df['City_te'].mul(df['Industry_te'])

## Step 7: Save The Data

In [44]:
# Write the engineered table to disk; the row index is kept as the first CSV column.
df.to_csv(
    "../Dataset/sp500_for_train.csv",
    index=True,
)