# Feature Engineering in Python for Data Science

Feature engineering is the process of creating new features or transforming existing ones to improve model performance. Here's a comprehensive guide with examples:

### 1. Basic Feature Transformations



In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Create sample dataset
data = {
    'age': [25, 35, 45, 55, 65],
    'income': [30000, 45000, 55000, 80000, 100000],
    'education': ['High School', 'Bachelors', 'Masters', 'PhD', 'Bachelors'],
    'purchase_date': pd.date_range(start='2023-01-01', periods=5)
}

df = pd.DataFrame(data)

# Numerical Feature Scaling
# Fit the scaler once on both columns. Re-fitting the same scaler per column
# overwrites its learned statistics, so it could no longer transform new 'age'
# values afterwards. The scaled values are the same either way because
# StandardScaler standardizes every column independently.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[['age', 'income']])
df['age_scaled'] = scaled_values[:, 0]
df['income_scaled'] = scaled_values[:, 1]

# Categorical Encoding
# Education is ordinal, so encode it by explicit rank. LabelEncoder assigns
# codes in sorted label order, which would rank 'Bachelors' below
# 'High School'. Values missing from education_levels are encoded as -1.
education_levels = ['High School', 'Bachelors', 'Masters', 'PhD']
df['education_encoded'] = pd.Categorical(
    df['education'], categories=education_levels, ordered=True
).codes



### 2. Date-based Feature Engineering



In [None]:
# Break the purchase timestamp into its calendar components
purchase_dt = df['purchase_date'].dt
df['purchase_year'] = purchase_dt.year
df['purchase_month'] = purchase_dt.month
df['purchase_day'] = purchase_dt.day
df['purchase_dayofweek'] = purchase_dt.dayofweek
df['purchase_quarter'] = purchase_dt.quarter

# Binary calendar flags (dayofweek counts from Monday=0, so 5/6 are Sat/Sun)
day_index = df['purchase_dayofweek']
df['is_weekend'] = ((day_index == 5) | (day_index == 6)).astype(int)
df['is_month_end'] = purchase_dt.is_month_end.astype(int)



### 3. Binning and Discretization



In [None]:
# Equal-width binning: three intervals of equal width over the age range
df['age_bins'] = pd.cut(df['age'],
                       bins=3,
                       labels=['Young', 'Middle', 'Senior'])

# Equal-frequency binning (quantiles)
df['income_quartiles'] = pd.qcut(df['income'],
                                q=4,
                                labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

# Custom binning
# pd.cut intervals are right-closed by default: (0, 30000], (30000, 60000], ...
# include_lowest=True also closes the first interval on the left, so an income
# of exactly 0 falls into 'Low' instead of becoming NaN.
custom_bins = [0, 30000, 60000, 100000, float('inf')]
custom_labels = ['Low', 'Medium', 'High', 'Very High']
df['income_custom_bins'] = pd.cut(df['income'],
                                 bins=custom_bins,
                                 labels=custom_labels,
                                 include_lowest=True)



### 4. Interaction Features



In [None]:
# Pairwise interaction features: product and ratio of age and income
age_values, income_values = df['age'], df['income']
df['age_income_interaction'] = age_values * income_values
df['age_income_ratio'] = age_values / income_values

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion without the constant (bias) column. The column names
# below are supplied by hand for the five generated features.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[['age', 'income']])
poly_column_names = ['age', 'income', 'age²', 'age*income', 'income²']
poly_features_df = pd.DataFrame(data=poly_features, columns=poly_column_names)



### 5. Text Feature Engineering



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Three short example documents
texts = [
    "This is a good product",
    "The service was terrible",
    "Amazing experience overall"
]

# Build both vectorizers up front, then fit each on the same documents
vectorizer = CountVectorizer()   # Bag of Words: raw token counts
tfidf = TfidfVectorizer()        # TF-IDF: counts reweighted by rarity

bow_matrix = vectorizer.fit_transform(texts)
tfidf_matrix = tfidf.fit_transform(texts)

# Densify the sparse results into DataFrames labelled by vocabulary term
bow_terms = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_terms)

tfidf_terms = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_terms)



### 6. Advanced Feature Engineering



In [None]:
# Aggregation features
def create_agg_features(df, group_col, agg_col, agg_funcs=None):
    """Compute per-group aggregate statistics of one column.

    Args:
        df: Source DataFrame.
        group_col: Column name (or list of names) to group by.
        agg_col: Name of the column to aggregate.
        agg_funcs: Aggregations to apply, as accepted by pandas ``agg``
            (e.g. ``['mean', 'sum']``). Defaults to mean, min, max and std.

    Returns:
        A new DataFrame with one row per group: the grouping column(s)
        followed by one ``{agg_col}_{aggregation}`` column per aggregation.
        ``df`` itself is not modified.
    """
    if agg_funcs is None:
        agg_funcs = ['mean', 'min', 'max', 'std']

    agg_features = df.groupby(group_col)[agg_col].agg(list(agg_funcs))
    # Prefix the names pandas produced rather than zipping against the input
    # list, so the labels always match the columns actually returned.
    agg_features.columns = [f'{agg_col}_{name}' for name in agg_features.columns]

    return agg_features.reset_index()

# Time-based features
def create_time_features(df, date_col, start_hour=9, end_hour=17):
    """Add hour, minute and business-hour columns derived from a datetime column.

    The new columns are written onto ``df`` itself (the input is modified in
    place) and the same object is returned.

    Args:
        df: DataFrame containing ``date_col``.
        date_col: Name of a column that supports the ``.dt`` accessor.
        start_hour: First hour counted as business time (inclusive).
        end_hour: Hour at which business time ends (exclusive).

    Returns:
        ``df`` with ``{date_col}_hour``, ``{date_col}_minute`` and
        ``{date_col}_is_business_hour`` (0/1) added.
    """
    hours = df[date_col].dt.hour
    df[f'{date_col}_hour'] = hours
    df[f'{date_col}_minute'] = df[date_col].dt.minute

    # Half-open window [start_hour, end_hour): with the defaults, 09:00-16:59
    # is flagged. An inclusive upper bound would also flag 17:00-17:59.
    in_business_hours = (hours >= start_hour) & (hours < end_hour)
    df[f'{date_col}_is_business_hour'] = in_business_hours.astype(int)

    return df



### 7. Feature Selection



In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Select top k features based on ANOVA F-value
def select_features(X, y, k=5):
    """Select the top ``k`` features ranked by ANOVA F-value.

    Args:
        X: Feature DataFrame; its column labels are used to report which
            features were kept.
        y: Target values passed to ``SelectKBest.fit_transform``.
        k: Number of features to keep, or ``'all'``. A number larger than
            the column count of ``X`` is reduced to that count.

    Returns:
        Tuple ``(X_selected, selected_features)``: the reduced feature matrix
        produced by the selector and the list of retained column names.
    """
    # Clamp k so that a frame with fewer than k columns (the default is 5)
    # does not trip SelectKBest's k-versus-n_features check.
    if k != 'all':
        k = min(k, X.shape[1])

    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X, y)

    # The boolean support mask maps the kept columns back to their names
    selected_features = X.columns[selector.get_support()].tolist()

    return X_selected, selected_features



### Key Points to Remember:

1. **Feature Types**
- Numerical features often benefit from scaling
- Categorical features need encoding
- Date features can be decomposed
- Text features need vectorization

2. **Best Practices**
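- Scale features for scale-sensitive models (e.g., linear models, SVMs, k-NN, neural networks); tree-based models generally do not need it. Fit scalers on the training data only to avoid data leakage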
- Handle missing values appropriately
- Document feature engineering steps
- Validate new features' importance

3. **Common Techniques**
- Scaling and normalization
- Binning and discretization
- Feature interactions
- Polynomial features
- Aggregations
- Time-based features

4. **Saving Engineered Features**


In [None]:
# Save the processed dataset
df.to_csv('engineered_features.csv', index=False)

# Save feature engineering pipeline
from sklearn.pipeline import Pipeline
from joblib import dump

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    # Add more steps as needed
])

# Fit before persisting: an unfitted pipeline stores only its configuration,
# so it could not transform new data after being loaded.
pipeline.fit(df[['age', 'income']])

dump(pipeline, 'feature_engineering_pipeline.joblib')



This comprehensive approach to feature engineering helps prepare your data for machine learning models and can significantly improve model performance.