In [4]:
import pandas as pd

# Read the test set: the preview shows one row per id, each carrying a list of
# dates and a matching list of values.
test_path = '/content/test.parquet'
df = pd.read_parquet(test_path)
df.head(5)

Unnamed: 0,id,dates,values
0,6125,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.85, -0.04, 0.19, -0.45, -0.75, -0.95, -2.91..."
1,26781,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.41, 0.39, -0.47, -0.9, -1.46, -0.51, 0.51,..."
2,13333,"[2016-06-01, 2016-07-01, 2016-08-01, 2016-09-0...","[-0.29, -1.26, 0.17, -1.22, 0.45, -0.94, 0.16,..."
3,53218,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.47, 1.55, -0.03, 0.57, -0.57, 0.6, 0.27, 1..."
4,84204,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[2.33, 1.39, -1.03, -2.64, 1.89, 1.77, 1.43, 1..."


In [5]:
import numpy as np
from scipy.stats import skew, kurtosis
from statsmodels.tsa.stattools import acf
from scipy.signal import find_peaks

def _acf_at_lag(series, lag):
    """Return the autocorrelation of ``series`` at ``lag``, or NaN if it is too short.

    A lag-k coefficient only exists when the series has more than k
    observations, so shorter inputs yield NaN instead of an indexing error.
    """
    if len(series) <= lag:
        return np.nan
    return acf(series, nlags=lag)[lag]


def _reduce_or_nan(array, reducer):
    """Apply ``reducer`` to ``array``, returning NaN when ``array`` is empty."""
    # np.max / np.min raise ValueError on zero-size input; a one-point series
    # has an empty first-difference array, so guard before reducing.
    return reducer(array) if len(array) else np.nan


# --- Per-series feature engineering -----------------------------------------
# Every row of `df` holds one series: 'dates' (a sequence of date-like objects)
# and 'values' (a numeric sequence). Each statement below adds one scalar or
# helper column derived from those two sequences.

# Calendar coverage of the series.
df['dates_len'] = df['dates'].apply(len)
df['min_date'] = df['dates'].apply(min)
df['max_date'] = df['dates'].apply(max)
df['period_days'] = (df['max_date'] - df['min_date']).apply(lambda x: x.days)

# Basic distribution statistics of the values.
df['values_mean'] = df['values'].apply(np.mean)
df['values_min'] = df['values'].apply(np.min)
df['values_max'] = df['values'].apply(np.max)
df['values_std'] = df['values'].apply(np.std)
df['values_median'] = df['values'].apply(np.median)

# First differences; empty for a one-point series, hence the NaN-safe reducers.
df['diff_values'] = df['values'].apply(lambda x: np.diff(x))
df['mean_diff'] = df['diff_values'].apply(lambda d: _reduce_or_nan(d, np.mean))

df['month'] = df['dates'].apply(lambda x: [d.month for d in x])

# Quartiles and the count of points outside the 1.5 * IQR fences.
df['Q1'] = df['values'].apply(lambda x: np.percentile(x, 25))
df['Q3'] = df['values'].apply(lambda x: np.percentile(x, 75))
df['IQR'] = df['Q3'] - df['Q1']
df['outliers'] = df.apply(
    lambda row: len([
        v for v in row['values']
        if v < (row['Q1'] - 1.5 * row['IQR']) or v > (row['Q3'] + 1.5 * row['IQR'])
    ]),
    axis=1,
)

df['unique_months'] = df['month'].apply(lambda x: len(set(x)))
df['unique_years'] = df['dates'].apply(lambda x: len(set([d.year for d in x])))

# Shape of the value distribution.
df['values_skew'] = df['values'].apply(lambda x: skew(x))
df['values_kurtosis'] = df['values'].apply(lambda x: kurtosis(x))

df['max_diff'] = df['diff_values'].apply(lambda d: _reduce_or_nan(d, np.max))
df['min_diff'] = df['diff_values'].apply(lambda d: _reduce_or_nan(d, np.min))

df['values_pct_10'] = df['values'].apply(lambda x: np.percentile(x, 10))
df['values_pct_90'] = df['values'].apply(lambda x: np.percentile(x, 90))

# Magnitude spectrum of the values and its summary statistics.
df['fft_values'] = df['values'].apply(lambda x: np.abs(np.fft.fft(x)))
df['fft_max'] = df['fft_values'].apply(np.max)
df['fft_mean'] = df['fft_values'].apply(np.mean)
df['fft_std'] = df['fft_values'].apply(np.std)

# Autocorrelation at lags 1 and 2; both are NaN when the series is too short
# to have that lag (previously only lag 2 was guarded).
df['autocorr_lag1'] = df['values'].apply(lambda x: _acf_at_lag(x, 1))
df['autocorr_lag2'] = df['values'].apply(lambda x: _acf_at_lag(x, 2))

# Number of local maxima reported by scipy.signal.find_peaks.
df['n_peaks'] = df['values'].apply(lambda x: len(find_peaks(x)[0]))

In [6]:
# Columns handed to the model, in the order they are passed to it.
# NOTE(review): 'Q1' is computed upstream but is not selected here — presumably
# this mirrors the feature set the model was trained on; confirm.
feature_columns = [
    'dates_len',
    'period_days',
    'values_mean',
    'values_min',
    'values_max',
    'values_std',
    'values_median',
    'mean_diff',
    'Q3',
    'IQR',
    'outliers',
    'unique_months',
    'unique_years',
    'values_skew',
    'values_kurtosis',
    'max_diff',
    'min_diff',
    'values_pct_10',
    'values_pct_90',
    'fft_max',
    'fft_mean',
    'fft_std',
    'autocorr_lag1',
    'autocorr_lag2',
    'n_peaks',
]
x = df[feature_columns]

# Keep the identifiers alongside so scores can be joined back to them.
ids = df['id']

In [7]:
import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model artifacts that come from a trusted source.
model_path = '/content/best_lgbm_model.pkl'
loaded_model = joblib.load(model_path)

# Class-probability output for the selected feature columns.
predictions = loaded_model.predict_proba(x)

In [8]:
# Column 1 of the predict_proba output is used as the score for each id.
positive_class_scores = predictions[:, 1]
result = pd.DataFrame({'id': ids, 'score': positive_class_scores})

In [9]:
# Show the id/score table as this cell's output.
result

Unnamed: 0,id,score
0,6125,0.409099
1,26781,0.109101
2,13333,0.128626
3,53218,0.076299
4,84204,0.591035
...,...,...
19995,80341,0.612555
19996,5891,0.100923
19997,29091,0.063109
19998,85877,0.490399


In [10]:
# Write the id/score table to disk, omitting the DataFrame index column.
result.to_csv(path_or_buf='result.csv', index=False)