In [63]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

In [64]:
# Load the test set from a parquet file; the path is relative to the
# notebook's working directory.
test = pd.read_parquet('data/test.parquet')
# Preview the first rows (the frame exposes `id`, `dates` and `values` columns).
test.head()

Unnamed: 0,id,dates,values
0,6125,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.85, -0.04, 0.19, -0.45, -0.75, -0.95, -2.91..."
1,26781,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.41, 0.39, -0.47, -0.9, -1.46, -0.51, 0.51,..."
2,13333,"[2016-06-01, 2016-07-01, 2016-08-01, 2016-09-0...","[-0.29, -1.26, 0.17, -1.22, 0.45, -0.94, 0.16,..."
3,53218,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.47, 1.55, -0.03, 0.57, -0.57, 0.6, 0.27, 1..."
4,84204,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[2.33, 1.39, -1.03, -2.64, 1.89, 1.77, 1.43, 1..."


In [65]:
# Load the pickled model object used for scoring below.
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

### Преобразование тестового набора данных

In [66]:
def extract_basic_stats(row):
    """Summarise one series with nine descriptive statistics.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['values']``, the sequence of observations.

    Returns
    -------
    pd.Series
        Entries, in order: ``mean``, ``std``, ``max``, ``min``, ``median``,
        ``q25``, ``q75``, ``skewness``, ``kurtosis``.
    """
    series = np.array(row['values'])
    # (label, reducer) pairs; tuple order fixes the order of the output entries.
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('max', np.max),
        ('min', np.min),
        ('median', np.median),
        ('q25', lambda arr: np.percentile(arr, 25)),
        ('q75', lambda arr: np.percentile(arr, 75)),
        ('skewness', skew),
        ('kurtosis', kurtosis),
    )
    return pd.Series({label: reducer(series) for label, reducer in reducers})

In [67]:
# Compute the summary statistics row by row, then place them next to the
# original test columns.
df_stats = test.apply(extract_basic_stats, axis='columns')
df_test = pd.concat(objs=[test, df_stats], axis='columns')

In [68]:
def extract_fourier_features(row, n_coeffs=5):
    """Return the real parts of the first ``n_coeffs`` FFT coefficients.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['values']``, a 1-D sequence of numbers.
    n_coeffs : int, default 5
        Number of leading FFT coefficients to keep.

    Returns
    -------
    pd.Series
        Always ``n_coeffs`` entries labelled ``fourier_0`` ...
        ``fourier_{n_coeffs - 1}``. Only the real part of each coefficient is
        kept; the imaginary part is discarded. When the series has fewer than
        ``n_coeffs`` points (or is empty) the missing positions are NaN.
    """
    values = np.asarray(row['values'], dtype=float)
    # Start from all-NaN so short or empty series still yield a full-length
    # result instead of a length mismatch against the index.
    features = np.full(n_coeffs, np.nan)
    if values.size:  # np.fft.fft rejects zero-length input
        coeffs = np.fft.fft(values)[:n_coeffs].real
        features[:coeffs.size] = coeffs
    return pd.Series(features, index=[f'fourier_{i}' for i in range(n_coeffs)])

In [69]:
# Compute the Fourier features row by row, then append them to the
# accumulated feature frame.
df_fourier = test.apply(extract_fourier_features, axis='columns')
df_test = pd.concat(objs=[df_test, df_fourier], axis='columns')

In [70]:
def create_lag_features(row, lags=3):
    """Return the trailing ``lags`` observations of a series as features.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['values']``, the sequence of observations.
    lags : int, default 3
        Number of trailing observations to keep.

    Returns
    -------
    pd.Series
        Entries ``lag_1`` ... ``lag_{lags}`` in chronological order, so
        ``lag_1`` is the oldest retained value and ``lag_{lags}`` the most
        recent one. Series shorter than ``lags`` are left-padded with zeros.
    """
    series = np.array(row['values'])
    shortfall = lags - len(series)
    if shortfall > 0:
        # Not enough history: fill the front with zeros.
        tail = np.pad(series, (shortfall, 0), mode='constant')
    else:
        tail = series[-lags:]
    labels = [f'lag_{k}' for k in range(1, lags + 1)]
    return pd.Series(tail, index=labels)

In [71]:
# Compute the lag features row by row, then append them to the accumulated
# feature frame.
df_lags = test.apply(create_lag_features, axis='columns')
df_test = pd.concat(objs=[df_test, df_lags], axis='columns')

In [72]:
def extract_moving_avg_ewm(row, window=3, alpha=0.5):
    """Return the final moving average and final EWM mean of a series.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['values']``, the sequence of observations.
    window : int, default 3
        Window length of the simple moving average.
    alpha : float, default 0.5
        Smoothing factor passed to ``Series.ewm``.

    Returns
    -------
    pd.Series
        ``moving_avg``: mean over the last ``window`` observations, or NaN when
        the series is shorter than ``window``.
        ``ewm``: last value of the exponentially weighted mean, or NaN when the
        series is empty.
    """
    raw = row['values']
    if len(raw) == 0:
        # Nothing to smooth; `.iloc[-1]` on an empty result would raise.
        return pd.Series({'moving_avg': np.nan, 'ewm': np.nan})
    values = pd.Series(raw)
    moving_avg = values.rolling(window).mean().iloc[-1] if len(values) >= window else np.nan
    ewm = values.ewm(alpha=alpha).mean().iloc[-1]
    return pd.Series({'moving_avg': moving_avg, 'ewm': ewm})

In [73]:
# Compute the moving-average / EWM features row by row, then append them to
# the accumulated feature frame.
df_smoothing = test.apply(extract_moving_avg_ewm, axis='columns')
df_test = pd.concat(objs=[df_test, df_smoothing], axis='columns')

In [74]:
# Remove every row that has a missing value in any column.
# NOTE(review): rows removed here receive no prediction downstream, so the
# submission may contain fewer ids than the test file; confirm this is intended.
df_test = df_test.dropna(axis=0, how='any')

In [75]:
# Feature matrix for the model: everything except the identifier and the raw
# series columns.
X = df_test.drop(labels=['id', 'dates', 'values'], axis='columns')

### Получение и сохранение результатов

In [90]:
# Score each row with column index 1 of predict_proba's output
# (presumably the positive class; confirm against model.classes_).
answers_pred = model.predict_proba(X)[:, 1]
answers_pred.shape

(19979,)

In [98]:
# Build the submission table: one row per remaining test id with its score.
# The index is reset by reassignment rather than in place, so df_test still
# ends up with a fresh 0..n-1 index without mutating the existing object.
df_test = df_test.reset_index(drop=True)
# Fail loudly if ids and scores diverge instead of emitting NaN-padded rows.
assert len(df_test) == len(answers_pred), (
    f"{len(df_test)} ids but {len(answers_pred)} scores"
)
# assign() attaches the score array positionally to the id column.
predictions = df_test[['id']].assign(score=answers_pred)
predictions

Unnamed: 0,id,score
0,6125,0.228028
1,26781,0.541191
2,13333,0.437553
3,53218,0.038625
4,84204,0.683139
...,...,...
19974,80341,0.310267
19975,5891,0.089879
19976,29091,0.022812
19977,85877,0.466163


In [100]:
# Write the id/score table to CSV; index=False keeps the row index out of the file.
predictions.to_csv('submission.csv', index=False)