In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
import lightgbm as lgb

import os, gc, glob
import itertools
from tqdm import tqdm_notebook
import warnings
warnings.simplefilter('ignore')

In [2]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains every value of the column (bounds are inclusive, so
    e.g. a column spanning -128..127 becomes int8). Floating-point columns are
    cast to float32 when their values fit in the float32 range, otherwise to
    float64. ``object`` columns are converted to ``category``.

    Every other dtype (datetime, categorical, bool, unsigned integer, pandas
    extension dtypes, ...) is left untouched: casting those to a float type
    would change their meaning and usually *increase* memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 for float columns whose values fit its range. Off by
        default because the feather format does not support float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int ('i') and float ('f') columns are
        # downcast; anything else is skipped on purpose (see docstring).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        # For an empty column min()/max() are NaN, every comparison below is
        # False, and the column falls through to the widest type of its kind.
        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
['/kaggle/input/ashrae-catboost-infer/submission.csv',
 '/kaggle/input/ashrae-catboostiii-prediction/submission.csv',
 '/kaggle/input/ashrae-kfold-lightgbm/submission.csv',
 '/kaggle/input/ashrae-lgb-drop/submission.csv',
 '/kaggle/input/ashrae-no-folds/submission.csv',
 '/kaggle/input/ashrae-stratified-kfold/submission.csv',
 '/kaggle/input/ashrae-gkfold-lgb/submission.csv',
 '/kaggle/input/ashrae-2folds-all/submission.csv',
 '/kaggle/input/ashrae-embednn/submission.csv']

['/kaggle/input/ashrae-catboost-infer/submission.csv',
 '/kaggle/input/ashrae-catboostiii-prediction/submission.csv',
 '/kaggle/input/ashrae-kfold-lightgbm/submission.csv',
 '/kaggle/input/ashrae-lgb-drop/submission.csv',
 '/kaggle/input/ashrae-no-folds/submission.csv',
 '/kaggle/input/ashrae-stratified-kfold/submission.csv',
 '/kaggle/input/ashrae-gkfold-lgb/submission.csv',
 '/kaggle/input/ashrae-2folds-all/submission.csv',
 '/kaggle/input/ashrae-embednn/submission.csv']

In [4]:
#p0 cat
#p1 R
#p2 kfold
#p3 drop
#p4 skfold
#p5 kfold without b
#p6 gkfold
#p7 all
#p8 nn

In [5]:
#choice 1: p6, p1, p4, p7, p8

In [6]:
# Input files: the per-model predictions (feather) and the leaked readings (CSV).
SUBS_PATH = '/kaggle/input/ashrae-models-preds/subs.feather'
LEAK_PATH = '/kaggle/input/ashrae-models-preds/leak.csv'

leak = pd.read_csv(LEAK_PATH)
data = pd.read_feather(SUBS_PATH)

In [7]:
# Left-join the leak table onto the predictions: the index of `data` is matched
# against `leak['row_id']`, so every prediction row is kept and rows without a
# match get NaN in the columns coming from `leak`. The join leaves a
# non-sequential index behind, hence the reset to a fresh 0..n-1 index.
data = (
    pd.merge(data, leak, how='left', left_index=True, right_on=['row_id'])
    .reset_index(drop=True)
)

In [8]:
# Rows with a known `meter_reading` form the training set of the blender;
# rows where it is missing are the ones that still need a prediction.
has_reading = data['meter_reading'].notnull()
train = data[has_reading].reset_index(drop=True)
test = data[~has_reading].reset_index(drop=True)

In [9]:
# Blend the base-model predictions with a K-fold linear regression fitted on
# the rows that have a known reading. The target is modelled as
# log1p(meter_reading).
# NOTE: the order of `feature` (p8 before p7) is the order of `lr.coef_`.
feature = ['p0', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p8', 'p7']
NFOLD = 2
kf = KFold(n_splits=NFOLD)
preds = np.zeros(len(test))   # fold-averaged prediction for the test rows
oof = np.zeros(len(train))    # out-of-fold prediction for the train rows
models = []

# Transform the target once, vectorised, instead of mapping np.log1p
# element-wise inside every fold and again for the final score.
log_target = np.log1p(train['meter_reading'])

for train_index, val_index in kf.split(train):
    # KFold.split yields positional indices, so select rows with .iloc;
    # .loc would only be right while the index is exactly 0..n-1.
    train_features = train[feature].iloc[train_index]
    train_target = log_target.iloc[train_index]
    val_features = train[feature].iloc[val_index]

    lr = LinearRegression(n_jobs=-1)
    lr.fit(train_features, train_target)
    models.append(lr)

    preds += lr.predict(test[feature]) / NFOLD
    oof[val_index] = lr.predict(val_features)

    # Release the per-fold copies before the next fold is materialised.
    del train_features, train_target, val_features
    gc.collect()

# RMSE in log1p space over the out-of-fold predictions.
print('lr oof cv:', np.sqrt(mean_squared_error(log_target, oof)))

lr oof cv: 0.9510658018056783


In [10]:
# Fill the rows without a known reading with the blended prediction, mapped
# back from log1p space. The linear blend can go below 0 in log space, which
# would yield a negative meter reading, so predictions are clipped at 0.
# NOTE(review): indexing `data` by `test['row_id']` assumes the index labels of
# `data` equal `row_id` -- confirm if the merge/reset step ever changes.
data.loc[test['row_id'].values, 'meter_reading'] = np.clip(np.expm1(preds), 0, None)
data.sort_values(['row_id'], inplace=True)

# Fail loudly instead of silently writing a submission with missing readings.
n_missing = int(data['meter_reading'].isnull().sum())
assert n_missing == 0, '{} rows still have no meter_reading'.format(n_missing)

data[['row_id', 'meter_reading']].to_csv('submission.csv', index=False)

In [11]:
# Show the final frame (base-model predictions, row_id and the filled
# meter_reading) as a visual sanity check of what was written out.
data

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,row_id,meter_reading
0,5.594895,5.283822,5.070209,5.171024,5.051921,5.188169,5.105268,5.006188,5.170560,0,173.370300
1,4.471448,4.157661,4.332657,4.346334,4.378818,4.444156,4.350743,4.441184,4.584682,1,53.512720
2,2.505161,2.199738,2.731363,2.425687,2.632862,2.084031,2.134552,2.332160,2.419872,2,6.143042
3,5.782856,5.789957,5.624600,5.394196,5.417315,5.702923,5.720874,5.574283,5.599818,3,101.701470
4,6.803643,7.040636,7.082393,6.892032,6.862860,7.305004,7.260651,7.049832,6.688233,4,1141.240700
...,...,...,...,...,...,...,...,...,...,...,...
41697595,2.212548,2.115184,2.051520,2.041869,1.966108,2.083446,1.988594,1.988963,2.245174,41697595,6.927099
41697596,1.954034,2.177629,1.711465,1.749200,1.635521,1.727292,1.662957,1.656605,2.037146,41697596,6.099778
41697597,1.236636,1.972442,1.348969,1.242713,1.370035,2.158276,2.130444,2.139587,1.771997,41697597,4.461183
41697598,5.205480,5.215278,5.191372,5.117185,5.196886,5.191440,5.218607,5.196023,5.093464,41697598,165.555772


In [12]:
# Coefficients of the linear blender fitted in the second fold, in the order
# of the `feature` list (note: p8 comes before p7 there).
models[1].coef_

array([-0.02191364,  0.48070002, -0.17384599,  0.40715542,  0.18679708,
        0.11762328,  0.41006768,  0.02123486, -0.46739283], dtype=float32)

In [13]:
# Sum of the first fold's blending coefficients (the intercept is not part of
# coef_); a value near 1 means the blend is close to a weighted average.
sum(models[0].coef_)

1.006444739177823

In [14]:
# Coefficients of the linear blender fitted in the first fold, in the order
# of the `feature` list (note: p8 comes before p7 there).
models[0].coef_

array([ 0.10045822,  0.47960997, -0.10574937,  0.12499895,  0.18430434,
        0.02294191,  0.49303854,  0.15306675, -0.44622457], dtype=float32)