In [2]:
import pandas as pd

# Datetime Series sampled every 10 hours between the two dates
s = pd.date_range(
    start='2020-01-06',
    end='2020-01-10',
    freq='10h',
).to_series()

# Derive calendar-based features from the datetime values.
# The plain `.dt` attributes are collected by name ...
features = {
    attr: getattr(s.dt, attr).values
    for attr in ("dayofweek", "dayofyear", "hour", "is_leap_year", "quarter")
}
# ... while the ISO week number is read from the isocalendar() table
features["weekofyear"] = s.dt.isocalendar().week.values

In [3]:
def generate_features(df):
    """Add calendar features to ``df`` and aggregate them per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime-typed ``date`` column (the ``.dt`` accessor
        is used on it) as well as ``customer_id`` and ``num1`` columns.
        The frame is modified in place: the columns ``year``,
        ``weekofyear``, ``month``, ``dayofweek`` and ``weekend`` are added
        (or overwritten).

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id``. Columns are two-level
        ``(source column, aggregate)`` pairs such as ``('num1', 'sum')``;
        the grouping key itself is the first column after ``reset_index``.
    """
    dates = df['date']

    # create a bunch of features using the date column
    df['year'] = dates.dt.year
    # `Series.dt.weekofyear` was removed in pandas 2.0; the ISO calendar
    # week is its documented replacement (nullable UInt32 dtype).
    df['weekofyear'] = dates.dt.isocalendar().week
    df['month'] = dates.dt.month
    df['dayofweek'] = dates.dt.dayofweek
    # weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday
    df['weekend'] = (dates.dt.weekday >= 5).astype(int)

    # aggregates to compute for each customer
    aggs = {
        # number of distinct months / weeks and their mean value
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        # sum, max, min and mean of the numeric column
        'num1': ['sum', 'max', 'min', 'mean'],
        # row count and unique count of the grouping key itself
        'customer_id': ['size', 'nunique'],
    }

    # we group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs).reset_index()
    return agg_df

In [4]:
# toy sequence for the statistical features below: the even numbers 2..20
x = list(range(2, 21, 2))

In [5]:
import numpy as np
# Summary statistics of x, collected in a single dictionary
feature_dict = {
    # central tendency and extremes
    'mean': np.mean(x),
    'max': np.max(x),
    'min': np.min(x),
    # spread: standard deviation, variance and peak-to-peak (max - min)
    'std': np.std(x),
    'var': np.var(x),
    'ptp': np.ptp(x),
}

# percentile features (rank given on a 0-100 scale)
feature_dict.update(
    {f'percentile_{rank}': np.percentile(x, rank) for rank in (10, 60, 90)}
)

# quantile features (rank given as a fraction between 0 and 1)
feature_dict.update({
    'quantile_5': np.quantile(x, 0.05),
    'quantile_95': np.quantile(x, 0.95),
    'quantile_99': np.quantile(x, 0.99),
})

In [6]:
from tsfresh.feature_extraction import feature_calculators as fc

# tsfresh based features: every value is stored under the name of the
# feature calculator that produced it
feature_dict.update({
    calc_name: getattr(fc, calc_name)(x)
    for calc_name in (
        'abs_energy',
        'count_above_mean',
        'count_below_mean',
        'mean_abs_change',
        'mean_change',
    )
})

In [7]:
import numpy as np
# generate a random dataframe with 2 columns (f_1, f_2) and 100 rows,
# values drawn uniformly from [0, 1).
# A seeded generator is used so that Restart & Run All reproduces the
# same data (and therefore the same downstream outputs) on every run.
df = pd.DataFrame(
    np.random.default_rng(42).random((100, 2)),
    columns=[f"f_{i}" for i in range(1, 3)]
)

In [8]:
# preview the first five rows of the random frame
df.head(n=5)

Unnamed: 0,f_1,f_2
0,0.304845,0.707643
1,0.266681,0.190058
2,0.705353,0.229466
3,0.694011,0.745163
4,0.529763,0.800369


In [10]:
from sklearn import preprocessing
import pandas as pd

# Use only the raw numeric features (f_1, f_2, ...). Helper columns that
# other cells add to df in place (e.g. f_bin_10) do not match the pattern,
# so this cell produces the same result whenever it is (re-)run.
raw_feats = df.filter(regex=r"^f_\d+$")

# Initialize PolynomialFeatures object: all terms up to degree 2
# (squares and pairwise products), without the constant bias column
pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False
)

# Fit and transform the data
poly_feats = pf.fit_transform(raw_feats)

# Get readable feature names (e.g. "f_1^2", "f_1 f_2")
feature_names = pf.get_feature_names_out(raw_feats.columns)

# Create DataFrame with meaningful names, keeping the original row index
df_transformed = pd.DataFrame(
    poly_feats,
    columns=feature_names,
    index=raw_feats.index,
)


In [11]:
# preview the first five rows of the polynomial features
df_transformed.head(n=5)

Unnamed: 0,f_1,f_2,f_1^2,f_1 f_2,f_2^2
0,0.304845,0.707643,0.09293,0.215721,0.500758
1,0.266681,0.190058,0.071119,0.050685,0.036122
2,0.705353,0.229466,0.497522,0.161854,0.052655
3,0.694011,0.745163,0.481651,0.517151,0.555268
4,0.529763,0.800369,0.280649,0.424006,0.640591


### Binning

Binning enables you to treat numerical features as categorical.

In [13]:
# Discretise f_1 into equal-width bins; labels=False stores the integer
# bin index instead of an Interval category.
# coarse version: 10 bins
df["f_bin_10"] = pd.cut(x=df["f_1"], bins=10, labels=False)
# fine version: 100 bins
df["f_bin_100"] = pd.cut(x=df["f_1"], bins=100, labels=False)

In [14]:
# preview the first five rows, now including the bin columns
df.head(n=5)

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.304845,0.707643,2,27
1,0.266681,0.190058,2,23
2,0.705353,0.229466,6,69
3,0.694011,0.745163,6,68
4,0.529763,0.800369,5,51


### Log Transformation

A log transformation applies the logarithm to the data in order to reduce skewness, compress large values, and make patterns easier to analyze.

### Missing Values 

If you ever encounter missing values in categorical features, treat them as a new category!

As for numerical values, there are many options, such as filling with 0, the mean, or the median.

A fancier way to fill missing numerical values is to use the k-nearest neighbours method.

#### KNN Imputer

In [15]:
import numpy as np
from sklearn import impute

# seeded generator so the example is reproducible across runs
rng = np.random.default_rng(42)

# create a random numpy array with 10 samples and 6 features;
# values are integers from 1 to 14 (the upper bound 15 is exclusive)
X = rng.integers(1, 15, size=(10, 6))

# convert the array to float so that it can hold NaN
X = X.astype(float)

# randomly assign 10 distinct elements to NaN (missing).
# X.flat always writes into X itself, whereas X.ravel() may return a copy
# for non-contiguous arrays, in which case the assignment would be lost.
X.flat[rng.choice(X.size, size=10, replace=False)] = np.nan

# use 2 nearest neighbours to fill na values
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[14. ,  5. ,  1. ,  6. , 14. ,  3.5],
       [ 4.5,  7. , 10. , 14. , 11. ,  7. ],
       [ 2. ,  5. ,  4. ,  2. , 12. ,  7. ],
       [ 9.5,  7. ,  2. , 13. ,  6. ,  2. ],
       [ 5. ,  2. ,  1. , 14. , 11. ,  4.5],
       [ 4. , 12. ,  5. , 10. ,  9. , 14. ],
       [10. ,  5. ,  2. ,  1. ,  8. ,  5. ],
       [ 4. ,  8. ,  9.5,  6. ,  5. , 11. ],
       [10. , 11. , 14. ,  3.5,  4. ,  7. ],
       [ 7. ,  5. , 13. ,  1. ,  2. ,  3. ]])

- Another way of imputing missing values in a column would be to train a regression
model that tries to predict the missing values in that column based on the other columns.

- Imputing values for tree-based models is often unnecessary, as many implementations
can handle missing values themselves.

**NOTE**: Always remember to scale or normalize your
features if you are using linear models like logistic regression or a model like SVM.
Tree-based models will always work fine without any normalization of features.