In [None]:
import pandas as pd
import numpy as np
from random import choices,choice
import catboost as cb 
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
# Load the sample submission and the full competition table.
sample = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv")
df = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2022/data.csv")

# 1% of the rows (fixed random_state) - used for quick testing while writing the code.
df_toy = df.sample(frac=0.01, random_state=1)

# Raise the column display limit to 90 only for this one display call.
with pd.option_context('display.max_columns', 90):
    display(df)

### EDA Summary
Previous great notebooks have already shown the main insights about the data:
 * There are 4 main feature groups, F_1 - F_4.
 * F_2 contains discrete values with 15-35 values per feature and doesn't contain any missing values.
 * F_1 and F_3 - each feature shows no correlation with the other features.
 * F_4 features are well correlated with each other.
 * The NA's are randomly distributed in the data (both row-wise and column-wise).
 


In [None]:
# Column subsets reused by the rest of the notebook.

# Names of all float64 columns (the columns that get NA's injected and are scored later).
cont_cols = [name for name, dtype in df.dtypes.items() if str(dtype) == 'float64']
# Float columns whose name contains "F_4", and the remaining float columns.
F_4_cols = [name for name in cont_cols if "F_4" in name]
F_13_cols = [name for name in cont_cols if name not in F_4_cols]
# Every column whose name contains "F_2".
F_2_cols = [name for name in df.columns if "F_2" in name]
# NOTE: unlike the lists above, int_cols holds column *positions*, not names.
# The first int64 column is skipped - presumably row_id; verify against the data.
int_cols = [pos for pos, dtype in enumerate(df.dtypes) if str(dtype) == 'int64'][1:]
# Columns needed to build the long/submission format.
output_cols = cont_cols + ["row_id"]

In [None]:
# Create a sample format output
def return_melted_res(df, val = ""):
    '''
    Reshape a wide frame into the long layout used by the submission file.

    df  - wide DataFrame containing every column listed in output_cols (row_id included)
    val - optional suffix; when non-empty the value column is named "value_<val>"

    returns a two-column DataFrame: "row-col" ("<row_id>-<column name>") and the value column
    '''
    suffix = "_" + val if val else val
    long_df = df.loc[:, output_cols].melt(id_vars = ['row_id'])

    # key of every cell: "<row_id>-<column name>"
    cell_key = long_df.row_id.astype(str).str.cat(long_df.variable, sep = "-")

    return (long_df
            .assign(row_col = cell_key)
            .reindex(columns = ['row_col', 'value'])  # keep only these two, in this order
            .rename(columns = {"row_col": "row-col", "value": f'value{suffix}'}))
# Preview of the long format on the toy sample (shown as the cell output).
toy_melted = return_melted_res(df_toy, "Test_Toy")
toy_melted.head(3)

## Defining some helper functions
For testing and validation I created the following function, which samples the DataFrame and adds a random NA to one of the columns in each row.
Later we will be able to compare the predicted value to the original value for accuracy calculations.
Since we are adding NA's, the results are expected to be slightly worse compared to the final training, but this shouldn't change the big picture and it gives us an easy way to check our model.
For simplicity I decided to go in this direction instead of splitting the data into train/validation.

In [None]:
def return_train_vals (df_train_orig, df_train):
    '''
    Recover the original values of the cells that were blanked on purpose.

    df_train_orig - wide frame before the NA's were added
    df_train      - the same frame after the NA's were added

    returns a long/submission-format frame ("row-col", "value") with the original
    value of every cell that is NA in df_train but was not NA in df_train_orig
    '''
    orig_long = return_melted_res(df_train_orig)
    masked_long = return_melted_res(df_train)

    # cells that had a real value before the injection
    known = orig_long[orig_long['value'].notna()]
    # keys of the cells that are NA after the injection
    blanked_keys = masked_long.loc[masked_long['value'].isna(), ['row-col']]

    # the inner join keeps exactly the cells that were known and are now NA
    return known.merge(blanked_keys, on = "row-col", how = 'inner')
             
    

def get_train_subset(df, f = 0.1, sample = True, seed = None):
    ''' Take a subset of the DataFrame and add random NA's to cont_cols

        df     - wide frame containing every column listed in cont_cols
        f      - fraction of the rows to keep
        sample - True: random sample of the rows (fixed random_state=1);
                 False: the first int(len(df) * f) rows, by position
        seed   - optional seed for the NA injection. None (default) uses the
                 module-level `random` generator, so the blanked cells differ
                 from call to call; pass an int for a reproducible result.

        Rows are drawn with replacement, so some rows receive several NA's and
        others none.

        return (df_train_vals with the original values at long format,
                the train dataframe with the new random NA's)
    '''
    import random  # local import: the notebook only imports `choices`/`choice` by name

    if sample:
        subset = df.sample(frac = f, random_state=1)
    else:
        # Positional slice. `.loc[:k]` is label based: it includes label k (one row
        # too many on a default index) and fails on a shuffled or datetime index.
        subset = df.iloc[:int(df.shape[0]*f), :]
    df_ = subset.copy()

    rng = random.Random(seed) if seed is not None else random
    rows_update = rng.choices(df_.index.values, k=df_.shape[0])

    for row in rows_update:
        df_.loc[row, rng.choice(cont_cols)] = np.nan

    df_train_vals = return_train_vals(subset, df_)

    return df_train_vals, df_

# Share of missing values in the first three columns, before and after the injection.
na_before = df_toy.isna().mean().head(3)
na_after = get_train_subset(df_toy)[1].isna().mean().head(3)
print (f'Before: {na_before}, After : {na_after}')

# First rows of the ground-truth table returned next to the masked frame.
truth_preview = get_train_subset(df_toy)[0].head(3)
print (f'Long format of the original values that Will be used for accuraccy:\n {truth_preview}')

We will also add a function that calculates the MSE of the model's predictions on the generated NA's, for which we know the actual values.
Since I wanted to compare the results between different features, I normalized the error (NRMSE) by dividing the RMSE by the range of that feature: $\mathrm{NRMSE} = \mathrm{RMSE} / (\max - \min)$.

We know that there is a major difference between F_1 + F_3 and F_4, so I assumed we would need a different model for each group of features.


In [None]:
def calc_MSE(df_preds,df_train_vals):
    '''
    Receive two DataFrames in melt format
    df_preds - current prediction (read through its value_preds column)
    df_train_vals - the cells that received an added NA, with their original
                    value (read through its value column)

    return (merged DataFrame, MSE of predicted values vs. original values)
    '''
    # No join keys are given, so the merge uses the columns both frames share.
    merged = df_preds.merge(df_train_vals)

    predicted = merged.value_preds.values
    actual = merged.value.values
    return merged, mean_squared_error (predicted, actual)

def calc_MSE_group (df_, col_name):
    '''
    Calculate the NRMSE for each column
    the RMSE is normalized by the columns range

    df_      - long-format frame with the columns "row-col" ("<row_id>-<column name>"),
               "value" (original value) and "value_preds" (predicted value)
    col_name - label of the run; the result column is named "NRMSE_<col_name>"

    returns a one-column DataFrame indexed by column name (index name "col"),
    sorted by index. A column whose scored values all coincide has range 0, so
    its NRMSE is inf/NaN.
    '''
    nrmse_name = "NRMSE_" + col_name

    # Split the key once: everything after the first "-" is the column name.
    feature = df_["row-col"].str.split("-", n = 1, expand = True)[1]

    return (df_
     .assign (col = feature,
              ss = (df_.value - df_.value_preds)**2
             )
     .groupby(by = 'col')
     # string names instead of np.mean / max / min: passing those callables to
     # agg is deprecated in recent pandas and already resolved to these methods
     .agg( ss_mean =( 'ss' , 'mean' ) , val_max =( 'value' , 'max' ) ,val_min =( 'value' , 'min' ))
     .assign (**{nrmse_name : lambda g: (g.ss_mean)**0.5/(g.val_max - g.val_min)})
     .sort_index()
     .loc[:, [nrmse_name]]
           )

def get_mse_from_trained(df_train_vals,df_F4_trained,df_F13_trained, name = "Train_Res"):
    '''
    Score one imputation run against the known values of the blanked cells.

    df_train_vals  - long-format frame with the original values of the added NA's
    df_F4_trained  - the imputed F4 columns (including row id)
    df_F13_trained - the imputed F13 columns
    name           - label of the run, used as suffix of the NRMSE column

    prints the MSE of the run and returns (NRMSE per feature, MSE)
    '''
    wide_preds = pd.concat([df_F4_trained, df_F13_trained], axis = 1)
    long_preds = return_melted_res(wide_preds, "preds")

    scored_cells, mse_trained = calc_MSE (long_preds, df_train_vals)
    nrmse_per_feature = calc_MSE_group(scored_cells, name)
    print (f'MSE of Prediction is {mse_trained}')

    return nrmse_per_feature, mse_trained

## And lets add some model functions

The code for using a GBM to calculate missing values is from the great book Effective Pandas by Matt Harrison.
It shows an elegant way to impute the missing values based on the other features.
Since the F_4 features show strong correlation to each other, we will want to use this model for them.
I added an option to use either the LGBM or the CatBoost regressor.

I also added the sklearn SimpleImputer for the F_1 and F_3 features (for now).

In [None]:

def prep_for_ml(df):
    '''
    Return a copy of df prepared for the regressors:
      * numeric columns are cast to float
      * object / category columns are cast to str, with missing values as ''

    Column names must be strings (they are passed to assign as keywords).
    '''
    numeric = {col: df[col].astype(float) for col in df.select_dtypes('number')}
    # Blank the missing values *before* the cast: astype(str) turns NaN/None into
    # the text 'nan'/'None', after which a fillna('') has nothing left to fill.
    textual = {col: df[col].astype(object).where(df[col].notna(), '').astype(str)
               for col in df.select_dtypes(['object','category'])}
    return df.assign(**numeric, **textual)

def predict_col(df_,col,iters, modelType = "CatBoost"):
    '''
    Fit a regressor on the rows where `col` is known and use it to fill the
    rows where `col` is NA.

    df_       - frame holding the target column and the feature columns
    col       - name of the column to impute
    iters     - number of iterations (CatBoost) / estimators (LGBM)
    modelType - "CatBoost" selects CatBoostRegressor, anything else LGBMRegressor

    prints the column name and the score on the fitted rows;
    returns the column with its NA's replaced by the predictions
    '''
    df_ = prep_for_ml(df_)
    print(col)

    features = df_.drop(columns = [col])
    is_known = df_[col].notna()
    known_rows = df_[is_known]  # rows usable for fitting

    # Positions of the int64 feature columns, without the first one.
    int_positions = [pos for pos, dtype in enumerate(features.dtypes) if str(dtype) == 'int64']
    cat_idx = int_positions[1:]

    X = known_rows.drop(columns = [col]).values
    y = known_rows[col]

    if modelType != "CatBoost":
        model = LGBMRegressor(n_estimators=iters,metric='r2')
        model.fit (X, y)
    else:
        model = cb.CatBoostRegressor(iterations = iters, cat_features = cat_idx, logging_level= "Silent")
        model.fit (X, y, cat_features = cat_idx)

    print (f'Col: {col}, Score: {model.score(X,y)}')
    # Predict for every row; only the rows that were NA take the prediction.
    pred = model.predict(features)
    return df_[col].where (is_known, pred)

def train_GBM(df,iters,col_names, modelType):
    '''
    Iterate through col_names and update NA's with predicted values

    df        - wide frame containing col_names
    iters     - iterations / estimators, forwarded to predict_col
    col_names - columns to work on; each one is predicted from the others in the list
    modelType - forwarded to predict_col

    returns a copy of df[col_names] with the NA's replaced by predictions;
    row_id, when it is one of the columns, is returned as int64
    '''
    df_cat = df[col_names].copy(deep = True)

    for c,col in enumerate(col_names, start = 1):
        # A column without NA's (e.g. row_id) has nothing to impute, so skip the
        # model fit: predict_col would hand back the same values anyway.
        if df_cat[col].isna().any():
            df_cat.loc[:,col] = predict_col(df_cat,col,iters,modelType)
        print (f'finished col {c} out of {len(col_names)}')

    if "row_id" in df_cat.columns:
        df_cat["row_id"] = df_cat["row_id"].astype('int64')
    return df_cat
def train_impute(df,col_names, strategy = "mean"):
    '''
    Use the sklearn SimpleImputer to fill the NA's of col_names.

    df        - wide frame containing col_names
    col_names - columns to impute
    strategy  - forwarded to SimpleImputer ("mean" by default)

    returns a copy of df[col_names] holding the imputed values
    '''
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)

    df_imp = df[col_names].copy(deep = True)
    filled = imputer.fit_transform(df_imp[col_names])
    df_imp[:] = filled  # write the imputed array back into the copied frame
    return df_imp

## Finally, let's train a model and see some results

In [None]:
# Runs 1 and 2 share the same random 1% subset with injected NA's.
df_train_vals, df_train = get_train_subset(df, f=0.01)

# ---- Run 1: F_1/F_3 mean impute, F_4 CatBoost with 200 iterations ---- #
df_F4_trained = train_GBM(df_train, iters=200, col_names=F_4_cols + ["row_id"], modelType="CatBoost")
df_F13_trained = train_impute(df_train, F_13_cols, strategy="mean")

df_NRMSE_per_grop, mse_trained = get_mse_from_trained(df_train_vals, df_F4_trained, df_F13_trained, "test1_F13_MeanImpute_F4_Catboost_200iters")
df_NRMSE_per_grop.to_csv('Training_test1.csv', index=True)

# ---- Run 2: F_1/F_3 median impute, F_4 CatBoost with 2000 iterations ---- #
df_F4_trained = train_GBM(df_train, iters=2000, col_names=F_4_cols + ["row_id"], modelType="CatBoost")
df_F13_trained = train_impute(df_train, F_13_cols, strategy="median")

df_NRMSE_per_grop, mse_trained = get_mse_from_trained(df_train_vals, df_F4_trained, df_F13_trained, "test2_F13_MedianImpute_F4_Catboost_2000iters")
df_NRMSE_per_grop.to_csv('Training_test2.csv', index=True)

# ---- Run 3: F_1/F_3 forward fill, F_4 LGBM with 20 estimators ---- #
# sample=False keeps the rows in their original order, which forward fill needs.
df_train_vals, df_train = get_train_subset(df, f=0.01, sample=False)

df_F4_trained = train_GBM(df_train, iters=20, col_names=F_4_cols + ["row_id"], modelType="LGBMRegressor")
# .ffill() replaces the deprecated fillna(method='ffill'); leading NA's that have
# no previous value to copy are set to 0.
df_F13_trained = df_train[F_13_cols].ffill().fillna(0)

df_NRMSE_per_grop, mse_trained = get_mse_from_trained(df_train_vals, df_F4_trained, df_F13_trained, "test3_F13_FFill_F4_LGBM_20estimators")
df_NRMSE_per_grop.to_csv('Training_test3.csv', index=True)

# # ************************************************ #
# df_train_vals ,df_train = get_train_subset(df, f=0.01, sample = True) 

# df_F4_trained = train_GBM(df_train, iters = 20000, col_names = F_4_cols + ["row_id"], modelType = "LGBMRegressor")
# df_F13_trained = train_impute(df_train,F_13_cols, strategy = "mean")


# df_NRMSE_per_grop,mse_trained = get_mse_from_trained(df_train_vals,df_F4_trained,df_F13_trained, "test4_F13_Mean_F4_LGBM_20000estimators")
# df_NRMSE_per_grop.to_csv('Training_test4.csv',index = True)


In [None]:
# res_list = ["Training_test1.csv","Training_test2.csv","Training_test3.csv","Training_test4.csv"]
res_list = ["Training_test1.csv","Training_test2.csv","Training_test3.csv"]
# One NRMSE column per run, indexed by feature name.
res_dfs = [pd.read_csv(cur_df,index_col = "col") for cur_df in res_list]
res_df_all = pd.concat(res_dfs, axis = 1)

fig,ax = plt.subplots(dpi = 600, figsize = (10,5))
res_df_all.plot(kind = 'line', ax=ax, linewidth = 2)

# Set the ticks after plotting (the pandas plot installs its own ticks) and pass
# the names as tick labels: the old `label=` keyword of plt.xticks did not set them.
ax.set_xticks(np.arange(len(res_df_all.index)))
ax.set_xticklabels(res_df_all.index, rotation = 90)
ax.set_xlabel("feature")
ax.set_ylabel("NRMSE")
ax.set_title("NRMSE per feature and run")

plt.show()

## Conclusions 
F_1 and F_3:
* For these columns we can't see a difference between mean and median imputation.
* All the columns show similar NRMSE accuracy.
* Forward fill shows surprisingly bad results.

F_4:
* We can clearly see that the regression models find good predictions.
* We expect that the higher the regression depth/iterations, the better the prediction will be, but we don't see it on this subset of the data. We might need to use a larger subset.
* LGBM shows slightly better performance compared to CatBoost.
* Further parameter optimization will surely improve the performance of both models.

# Prepare submission

In [None]:
def prepare_submission (df_F4_trained,df_F13_trained,sample):
    '''
    Combine the imputed F_4 and F_1 + F_3 frames and return them in submission format.

    df_F4_trained  - imputed F_4 columns (including row_id)
    df_F13_trained - imputed F_1 + F_3 columns
    sample         - the sample submission frame; its 'value' column is replaced

    Reads the notebook-level `df` to find the rows that contain NA's.
    '''
    wide_preds = pd.concat([df_F4_trained, df_F13_trained], axis = 1)

    # This method works faster than a for loop but requires more memory than Kaggle provides,
    # so first drop all the rows without NA - they are not part of the submission file.
    rows_with_na = df.isna().any(axis=1)
    wide_preds = wide_preds[rows_with_na]

    long_preds = return_melted_res(wide_preds, "preds")
    print ("Finish Melting, Start merge")

    template = sample.drop(columns = ['value'])
    # No join keys are given, so the merge uses the columns both frames share.
    return template.merge(long_preds).rename(columns = {'value_preds': 'value'})

    




# Final run on the full data: CatBoost (20 iterations) for F_4, median impute for F_1 + F_3.
df_F4_trained = train_GBM(df, 20, F_4_cols + ["row_id"], "CatBoost")
print ("Finished F4")
df_F13_trained = train_impute(df, F_13_cols, "median")
print ("Finished F13")
df_sub = prepare_submission(df_F4_trained, df_F13_trained, sample)

# Write the submission file without the index column.
df_sub.to_csv('Submission.csv', index=False)

## Next Step : Time Series Prediction for F1_3

I find it hard to believe that the features in F1 and F3 are totally random. I have just started to learn time series, so I'll try to see if any technique from the Kaggle Time Series course will be useful here.

I'm sure there is a better way, but I'll try to treat the rows as a time series with a datetime index, to get all the great pandas datetime features.

In [None]:
# Give the rows an artificial datetime index: one row every 180 seconds from 2018-01-01.
# NOTE: this replaces the index of the notebook-level `df` in place.
df.index = pd.date_range("2018-01-01", periods=len(df), freq="180s")

#first let see if there is a trend
# df.loc[:,["F_1_0"]].plot()
daily_mean_f_1_0 = df.loc[:, ["F_1_0"]].resample('D').mean()
daily_mean_f_1_0.plot(figsize=(10, 4), alpha=.5, linewidth=1, label='Daily')


Clearly there is no global trend in the data.

Let's check if there are any cycles in the data.

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation of the daily mean of F_1_1, up to 8 lags.
# Trailing ';': plot_pacf returns the Figure, and leaving it as the cell's output
# value makes the notebook render the same figure a second time.
plot_pacf(df.F_1_1.resample('D').mean(), lags=8);

Again, there isn't any auto-correlation signal.
Let's check for seasonality:

In [None]:


def plot_periodogram(ts, detrend='linear', ax=None):
    '''
    Draw the periodogram of a daily series with the frequency axis in cycles per year.

    ts      - the series to analyse (one sample per day is assumed by the axis labels)
    detrend - forwarded to scipy.signal.periodogram
    ax      - matplotlib Axes to draw on; a new one is created when None

    returns the Axes that was drawn on
    '''
    from scipy.signal import periodogram

    # Samples per year for daily data. Given as a plain number of days because
    # pd.Timedelta("1Y") is rejected by pandas >= 2.0; 365.2425 days is the
    # length older pandas versions used for the "Y" unit.
    fs = 365.2425
    freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(freqencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Ticks at well-known numbers of cycles per year.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

# Periodogram of the daily mean of F_1_0; the ';' hides the returned Axes repr.
daily_f_1_0 = df.F_1_0.resample('D').mean()
plot_periodogram(daily_f_1_0);

As I wrote above, I just took the Kaggle Time Series course, so I'm not sure how to interpret this graph. I think that since we see a peak at every period, there is no clear seasonality.
Let's assume the graph drops between Bimonthly and Monthly and add the Fourier features.

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# order=8 -> 8 sin/cos pairs for "A"nnual seasonality
fourier = CalendarFourier(freq="A", order=8)

dp = DeterministicProcess(
    index=df.index,
    additional_terms=[fourier],  # annual seasonality (fourier)
    constant=True,               # dummy feature for bias (y-intercept)
    order=1,                     # trend (order 1 means linear)
    seasonal=False,              # no seasonal indicator columns
    drop=True,                   # drop terms to avoid collinearity
)

# Features for the dates in df.index.
X = dp.in_sample()
freq_cols = X.columns.tolist()
# Original columns and the new deterministic features, side by side.
df_freq = pd.concat([df, X], axis=1)



def train_GBM_freq(df,iters,col_names,freq_cols,modelType = "CatBoost"):
    '''
    Fill the NA's of every column in col_names with a model that sees only the
    freq_cols features.

    df        - wide frame containing col_names and freq_cols
    iters     - iterations / estimators, forwarded to predict_col
    col_names - columns to impute, one model per column
    freq_cols - feature columns given to every model
    modelType - forwarded to predict_col

    returns the imputed col_names columns only; no row_id handling is done here
    '''
    work = df[col_names + freq_cols].copy(deep = True)

    for target in col_names:
        model_input = work[freq_cols + [target]]
        work.loc[:, target] = predict_col(model_input, target, iters, modelType)

    return work[col_names]

In [None]:
# Run 5: F_1/F_3 predicted from the trend/Fourier features, F_4 mean impute.
df_train_vals, df_train = get_train_subset(df_freq, f=0.01)

df_F4_trained = train_impute(df_train, F_4_cols + ["row_id"], "mean")
# row_id must be int64 again before the "<row_id>-<column>" keys are built.
df_F4_trained["row_id"] = df_F4_trained["row_id"].astype('int64')

df_F13_trained_freq = train_GBM_freq(df_train, 20, F_13_cols, freq_cols)

df_NRMSE_per_grop, mse_trained = get_mse_from_trained(df_train_vals, df_F4_trained, df_F13_trained_freq, "test5_F13_Freq8_F4_mean")
df_NRMSE_per_grop.to_csv('Training_test5.csv', index=True)


In [None]:
res_list = ["Training_test1.csv","Training_test2.csv","Training_test3.csv","Training_test5.csv"]
# One NRMSE column per run, indexed by feature name.
res_dfs = [pd.read_csv(cur_df,index_col = "col") for cur_df in res_list]
res_df_all = pd.concat(res_dfs, axis = 1)

# Only the first 20 F_1/F_3 features are plotted.
res_df_plotted = res_df_all.loc[F_13_cols[:20],:]

fig,ax = plt.subplots(dpi = 600, figsize = (10,5))
res_df_plotted.plot(kind = 'line', ax=ax, linewidth = 2)

# One tick per plotted feature, set after plotting (the pandas plot installs its
# own ticks) and passed as tick labels: the old `label=` keyword of plt.xticks
# did not set them, and the tick count came from the unfiltered frame.
ax.set_xticks(np.arange(len(res_df_plotted.index)))
ax.set_xticklabels(res_df_plotted.index, rotation = 90)
ax.set_xlabel("feature")
ax.set_ylabel("NRMSE")
ax.set_title("NRMSE per feature and run (first 20 F_1/F_3 features)")

plt.show()


## Bottom Line
* F_4 features can be easily modeled with GBM regression models.
* There doesn't appear to be any clear trend/seasonality/cycles in features F_1 and F_3.
* Since the F_2 features aren't correlated with any of the features that have missing data, it is also not clear how they can be used.

