In [None]:
import pandas as pd

s = pd.date_range('2020-01-01', '2020-01-10', freq='10h').to_series()

features = {
    'dayofweek': s.dt.dayofweek.values,
    'dayofyear': s.dt.dayofyear.values,
    'hour': s.dt.hour.values,
    'is_leap_year': s.dt.is_leap_year.values,
    'quarter': s.dt.quarter.values,
    'weekofyear': s.dt.weekofyear.values
}

In [1]:
def generate_features(df):
    df.loc[:, 'year'] = df['date'].dt.year
    df.loc[:, 'weekofyear'] = df['date'].dt.weekofyear
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >= 5).astype(int)

    aggs = {}
    aggs['month'] = ['nunique', 'mean']
    aggs['weekofyear'] = ['nunique', 'mean']
    aggs['num1'] = ['sum', 'max', 'min', 'mean']
    aggs['customer_id'] = ['size']
    aggs['customer_id'] = ['nunique']

    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

# データフレームを作成する
data = {'customer_id': ['A', 'A', 'B', 'B', 'C', 'C'],
        'date': ['2022-01-01', '2022-02-01', '2022-01-01', '2022-02-01', '2022-01-01', '2022-02-01'],
        'month': [1, 2, 1, 2, 1, 2],
        'num1': [10, 20, 30, 40, 50, 60]}
df = pd.DataFrame(data)

# 'date'列を日付型に変換する
df['date'] = pd.to_datetime(df['date'])

# generate_features関数を呼び出して特徴量を生成する
features = generate_features(df)

# 結果を表示する
print(features)


  customer_id   month      weekofyear       num1              customer_id
              nunique mean    nunique  mean  sum max min mean     nunique
0           A       2  1.5          2  28.5   30  20  10   15           1
1           B       2  1.5          2  28.5   70  40  30   35           1
2           C       2  1.5          2  28.5  110  60  50   55           1


In [4]:
import numpy as np

df = pd.DataFrame(
    np.random.rand(100,2),
    columns=[f"f_{i}" for i in range(1,3)]
)

In [5]:
from sklearn import preprocessing

pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
pf.fit(df)
poly_feats = pf.transform(df)

num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(poly_feats, columns=[f"f_{i}" for i in range(1, num_feats+1)])

print(df_transformed.head())

        f_1       f_2       f_3       f_4       f_5
0  0.048195  0.460279  0.002323  0.022183  0.211856
1  0.024225  0.460928  0.000587  0.011166  0.212454
2  0.924425  0.550188  0.854562  0.508608  0.302707
3  0.828545  0.274025  0.686487  0.227042  0.075090
4  0.139136  0.602021  0.019359  0.083763  0.362429


In [10]:
import numpy as np
from sklearn import impute

X = np.random.randint(1, 15, (10, 6))
X = X.astype(float)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan

print(X)
knn_imputer = impute.KNNImputer(n_neighbors=2)
X = knn_imputer.fit_transform(X)

print(X)


[[ 9.  2.  2.  3. 13.  4.]
 [nan nan nan 12.  5. nan]
 [ 9. 14.  6. 14.  5. 13.]
 [ 4.  5. 13. nan nan  4.]
 [12.  6.  4. 10.  5.  3.]
 [ 4. 13.  9. nan 14.  1.]
 [nan nan  7.  9.  5.  3.]
 [ 4. 13. 12.  5.  9.  7.]
 [11.  6.  1.  1. nan  6.]
 [ 2.  3.  6.  8. 13. 14.]]
[[ 9.   2.   2.   3.  13.   4. ]
 [10.5 10.   5.  12.   5.   8. ]
 [ 9.  14.   6.  14.   5.  13. ]
 [ 4.   5.  13.   7.   7.   4. ]
 [12.   6.   4.  10.   5.   3. ]
 [ 4.  13.   9.   7.  14.   1. ]
 [ 8.   9.5  7.   9.   5.   3. ]
 [ 4.  13.  12.   5.   9.   7. ]
 [11.   6.   1.   1.   9.   6. ]
 [ 2.   3.   6.   8.  13.  14. ]]


In [8]:


knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

print(X)

[[nan  8.  4. 10.  3. nan]
 [ 3. nan  6.  7.  8. 14.]
 [ 4. 10. nan nan  9.  1.]
 [ 4. 12.  2.  8.  3.  2.]
 [ 9. 11. nan 11.  8.  1.]
 [ 6.  7.  6. nan 12. 14.]
 [ 5. 11.  2.  9.  3.  1.]
 [ 8.  4.  9. 14.  8. 11.]
 [nan nan  6. 12.  2.  3.]
 [ 1. 10. 11. nan  8.  3.]]
