In [1]:
#Initially forked from Bojan's kernel here: https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lb-0-2242/code
#That kernel was forked from Nick Brook's kernel here: https://www.kaggle.com/nicapotato/bow-meta-text-and-dense-features-lgbm?scriptVersionId=3493400
#Used oof method from Faron's kernel here: https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867
#Used some text cleaning method from Muhammad Alfiansyah's kernel here: https://www.kaggle.com/muhammadalfiansyah/push-the-lgbm-v19
import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc


# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string

%matplotlib inline
# Number of cross-validation folds; read by get_oof and by the KFold splitter
# built right after the data load. The LightGBM/XGBoost CV cells further down
# reassign it to 10.
NFOLDS = 5
# Seed handed to the KFold shuffles and to the Ridge model.
SEED = 42




In [2]:
# Instantiate with no arguments so the cell echoes the estimator's default hyper-parameters.
lgb.LGBMRegressor()

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
       max_bin=255, max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_split_gain=0, n_estimators=10, nthread=-1, num_leaves=31,
       objective='regression', reg_alpha=0, reg_lambda=0, seed=0,
       silent=True, subsample=1, subsample_for_bin=50000, subsample_freq=1)

In [3]:
class SklearnWrapper(object):
    """Thin adapter that gives an estimator class a uniform train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory). It is called as ``clf(**params)``.
    seed : int, default 0
        Value stored under ``params['random_state']`` when seeding is enabled.
    params : dict or None, default None
        Keyword arguments for ``clf``. ``None`` is treated as an empty dict.
        The dict passed in is copied, so the caller's object is never modified.
    seed_bool : bool, default True
        Whether seeding should be attempted at all.
    lgbm : bool, default False
        When true (and ``seed_bool`` is true) no ``random_state`` is injected;
        a reminder message is printed instead.
    """

    def __init__(self, clf, seed=0, params=None, seed_bool = True,lgbm=False):
        # Work on a private copy: the original wrote 'random_state' into the
        # caller's dict and crashed outright when params was left as None.
        params = dict(params) if params is not None else {}
        if seed_bool:
            if lgbm:
                print('with no random state need to check that')
            else:
                params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)
        
def get_oof(clf, x_train, y, x_test):
    """Compute out-of-fold predictions for stacking.

    For every (train, holdout) index pair yielded by the module-level ``kf``
    the model is refit on the training rows and then used to predict both the
    holdout rows and the full test matrix. Test predictions are averaged over
    the folds.

    Parameters
    ----------
    clf : object exposing ``train(x, y)`` and ``predict(x)``
        e.g. a ``SklearnWrapper`` instance.
    x_train : sparse matrix exposing ``tocsr()``
        Training features; rows are selected with the fold indices.
    y : array-like
        Training targets, indexed positionally with the fold indices.
    x_test : matrix
        Test features passed unchanged to ``clf.predict``.

    Returns
    -------
    tuple of ndarray
        ``(oof_train, oof_test)`` reshaped to column vectors.

    Notes
    -----
    Reads the module-level names ``ntrain``, ``ntest``, ``NFOLDS`` and ``kf``.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    # Convert once up front; the original rebuilt the CSR matrix twice per fold.
    x_train_csr = x_train.tocsr()
    # Fold indices are row positions, so drop any pandas index (y is indexed by
    # item_id in this notebook) and slice a plain array instead.
    y_values = np.asarray(y)

    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train_csr[train_index]
        y_tr = y_values[train_index]
        x_te = x_train_csr[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
# Anything that is not a Unicode letter: non-word characters, digits and "_".
# Python's `re` has no POSIX classes, so the earlier pattern '[^[:alpha:]]' was
# parsed as the set [^[:alpha:] followed by a literal ']' and stripped nothing.
_NON_LETTER_RE = re.compile(r'[\W\d_]+')


def cleanName(text):
    """Normalise a free-text field to lower-case words made of letters only.

    Every run of non-letter characters (punctuation, digits, symbols,
    underscores, whitespace) becomes a single space; leading and trailing
    whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. float NaN for a missing field)
        yields the sentinel below instead of raising.

    Returns
    -------
    str
        The cleaned text, or ``"name error"`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Same sentinel the previous bare `except:` produced for non-text input.
        return "name error"
    textProc = _NON_LETTER_RE.sub(" ", text.lower())
    return " ".join(textProc.split())
    
def rmse(y, y0):
    """Root-mean-squared difference between two equal-length sequences.

    Raises ``AssertionError`` when the lengths differ.
    """
    assert len(y) == len(y0)
    residual = y - y0
    mean_squared = np.mean(np.power(residual, 2))
    return np.sqrt(mean_squared)

# Column dtypes forced at read time.
dtypes = dict(
    category_name='category',
    parent_category_name='category',
    region='category',
    item_seq_number='uint32',
    user_type='category',
    image_top_1='float32',
    price='float32',
    deal_probability='float32',
)


def _read_split(path):
    """Read one zipped CSV, indexed by item_id, with activation_date parsed as a date."""
    return pd.read_csv(
        path,
        compression='zip',
        index_col="item_id",
        parse_dates=["activation_date"],
        dtype=dtypes,
    )


print("\nData Load Stage")
training = _read_split('../data/train.csv.zip')
traindex = training.index

testing = _read_split('../data/test.csv.zip')
testdex = testing.index

ntrain, ntest = training.shape[0], testing.shape[0]

# Keep the target aside before it is removed from the feature frame.
y = training.deal_probability.copy()

# Fold definition consumed by get_oof.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

training = training.drop("deal_probability", axis=1)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

# Stack train on top of test so feature engineering is applied to both at once;
# traindex / testdex are used later to split the rows again.
print("Combine Train and Test")
df = pd.concat([training, testing], axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))



Data Load Stage
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns
Combine Train and Test

All Data shape: 2011862 Rows, 16 Columns


In [4]:
# Preview the first rows of the combined train+test frame.
df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0
2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0
ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0
02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0
7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0


In [5]:
print("Feature Engineering")
# Log-scale the price (small offset added before the log); missing prices and
# missing image_top_1 codes get the sentinel -999.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which does not update the parent frame under pandas
# copy-on-write.
df["price"] = np.log(df["price"]+0.001).fillna(-999)
df["image_top_1"] = df["image_top_1"].fillna(-999)

# Calendar features derived from the activation date.
df["Weekday"] = df['activation_date'].dt.weekday
df["Weekd of Year"] = df['activation_date'].dt.week
df["Day of Month"] = df['activation_date'].dt.day

# Attach the per-user aggregates. merge() discards the item_id index, so it is
# saved first and restored afterwards.
dfdex = df.index
agg_df = pd.read_csv('./aggregated_features.csv')
if agg_df['user_id'].duplicated().any():
    # A duplicated key would multiply rows and break the index restore below.
    raise ValueError("aggregated_features.csv must contain one row per user_id")
df = df.merge(agg_df,on='user_id', how='left')
df.index = dfdex

# Remove columns that are not used as features.
df.drop(["activation_date","image"],axis=1,inplace=True)

print("\nEncode Variables")
categorical = ["user_id","region","city","user_type","image_top_1"]
print("Encoding :",categorical)

# Replace every categorical column by integer codes; values are stringified
# first so missing entries become their own level.
lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col] = lbl.fit_transform(df[col].astype(str))
    
print("\nText Features")

# Group the param features into one text field. Vectorised string concatenation
# gives the same text as the former row-wise df.apply (missing values still
# render as 'nan') without a Python-level loop over every row.
df['text_feat'] = df['param_1'].astype(str) + ' ' + df['param_2'].astype(str)

df.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

# Meta Text Features
textfeats = ["description","text_feat", "title","parent_category_name","category_name"]

df['title'] = df['title'].apply(cleanName)
df["description"] = df["description"].apply(cleanName)

# Translation table that deletes ASCII digits; built once, not once per column.
remove_digits = str.maketrans('', '', string.digits)

for cols in textfeats:
    # astype(str) already turns missing values into text, so the former
    # .fillna('.') that followed it could never fill anything and is dropped.
    # Lower-casing keeps capitalised words from being treated differently.
    df[cols] = df[cols].astype(str).str.lower().str.translate(remove_digits)
    df[cols + '_num_chars'] = df[cols].str.len() # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(comment.split())))
    # Share of unique words in percent; a text with no words gives 0/0 = NaN here.
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100

print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
# Remaining NaNs in the frame are replaced by a sentinel before vectorising.
df=df.fillna(-9999)
tfidf_para = {
    "stop_words": None,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    "min_df":50,
    "max_df":.7,
    "smooth_idf":False
}


def get_col(col_name):
    """Return a preprocessor that picks ``col_name`` out of a record dict."""
    return lambda x: x[col_name]


# Each vectorizer receives whole record dicts and extracts its own column
# through the preprocessor hook.
# Raising max_features for the description did not change the score much but
# may be worth investigating.
vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 1),
            max_features=1000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('text_feat',CountVectorizer(
            ngram_range=(1, 1),
            max_features=1000,
            preprocessor=get_col('text_feat'))),
        ('parent_category_name',CountVectorizer(
            ngram_range=(1, 1),
            max_features=1000,
            preprocessor=get_col('parent_category_name'))),
        ('category_name',CountVectorizer(
            ngram_range=(1, 1),
            max_features=1000,
            preprocessor=get_col('category_name'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 1),
            **tfidf_para,
            max_features=1000,
            preprocessor=get_col('title')))
    ])

start_vect=time.time()

# The vectorizer is fitted on the entire dataset (train + test rows) rather
# than on the training rows only; that improved the score by .0001.
# The record list is materialised once and fit_transform is used, so the frame
# is neither converted to dicts twice nor tokenised twice.
records = df.to_dict('records')
ready_df = vectorizer.fit_transform(records)
del records
gc.collect()

tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# Drop Text Cols
df.drop(textfeats, axis=1,inplace=True)

print("df shape :", df.shape)
print("ready_df shape :", ready_df.shape)
print('len vocab feature :', len(tfvocab) )

Feature Engineering

Encode Variables
Encoding : ['user_id', 'region', 'city', 'user_type', 'image_top_1']

Text Features

[TF-IDF] Term Frequency Inverse Document Frequency Stage
Vectorization Runtime: 7.72 Minutes
df shape : (2011862, 33)
ready_df shape : (2011862, 2927)
len vocab feature : 2927


In [None]:
# NOTE(review): scratch arithmetic; what 23 and 18529 stand for is not evident
# from this notebook — consider deleting the cell.
23+18529

18552

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Combine dense features with the sparse bag-of-words features.
# df holds the train rows followed by the test rows, so the sparse matrix is
# split at traindex.shape[0]. format='csr' builds row-sliceable matrices
# directly, so the fold loop in get_oof does not have to convert them.
X = hstack([csr_matrix(df.loc[traindex,:].values),ready_df[0:traindex.shape[0]]], format='csr')
testing = hstack([csr_matrix(df.loc[testdex,:].values),ready_df[traindex.shape[0]:]], format='csr')
for shape in [X,testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))

ridge_params = {'alpha':30.0, 'fit_intercept':True, 'normalize':False, 'copy_X':True,
                'max_iter':None, 'tol':0.0025, 'solver':'auto', 'random_state':SEED}

# Ridge out-of-fold predictions (method from Faron's kernel). Originally used
# to analyse the vectorization; feeding the result back in as a feature does
# not add much to the score but helps LightGBM converge faster.
ridge = SklearnWrapper(clf=Ridge, seed = SEED, params = ridge_params)
ridge_oof_train, ridge_oof_test = get_oof(ridge, X, y, testing)

rms = sqrt(mean_squared_error(y, ridge_oof_train))
print('Ridge OOF RMSE: {}'.format(rms))

print("Modeling Stage")

# Train OOF predictions followed by averaged test predictions, matching df's row order.
ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])
print(ridge_preds.shape)
# get_oof returns column vectors; flatten so a 1-D array is stored in the new column.
df['ridge_preds'] = ridge_preds.ravel()

# Four LightGBM regressors with different capacity settings. Each one
# contributes a column of out-of-fold predictions to df. The configurations
# were previously four copy-pasted stanzas; they now share one loop, and the
# target column name is the only thing besides the parameters that varies.
LGBM_STACK_CONFIGS = [
    ('lgb_preds', {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 100,
        'num_leaves': 450,
        'max_depth': 15,
        'subsample' : 0.8,
        'learning_rate': 0.02,
        'colsample_bytree': 0.65
    }),
    ('lgb_preds_2', {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 300,
        'num_leaves': 650,
        'max_depth': 20,
        'subsample' : 0.8,
        'learning_rate': 0.02,
        'colsample_bytree': 0.9
    }),
    ('lgb_preds_3', {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 200,
        'num_leaves': 850,
        #'max_depth': 15,
        'subsample' : 0.8,
        'learning_rate': 0.02,
        'colsample_bytree': 0.5
    }),
    ('lgb_preds_4', {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 700,
        'num_leaves': 320,
        'max_depth': 15,
        'subsample' : 0.95,
        'learning_rate': 0.02,
        'colsample_bytree': 0.95
    }),
]

for pred_col, lgbm_pa in LGBM_STACK_CONFIGS:
    print("Light Gradient Boosting Regressor")
    # lgbm=True makes the wrapper skip the random_state injection.
    lgbm = SklearnWrapper(clf=lgb.LGBMRegressor, seed = SEED, params = lgbm_pa,lgbm=True)

    lgbm_oof_train, lgbm_oof_test = get_oof(lgbm, X, y, testing)

    rms = sqrt(mean_squared_error(y, lgbm_oof_train))
    print('lgbm OOF RMSE: {}'.format(rms))

    print("Modeling Stage")

    # Train OOF predictions followed by averaged test predictions, matching df's row order.
    lgb_preds = np.concatenate([lgbm_oof_train, lgbm_oof_test])
    print(lgb_preds.shape)
    # get_oof returns column vectors; flatten so a 1-D array is stored in the new column.
    df[pred_col] = lgb_preds.ravel()


1503424 Rows and 2960 Cols
508438 Rows and 2960 Cols
Feature Names Length:  2927

Fold 0

Fold 1

Fold 2


In [None]:
#df['preds_diff'] = df['lgb_preds'] - df['ridge_preds']
#df['preds_sum'] = 0.8*df['lgb_preds'] + 0.2*df['ridge_preds']
#df['predslgb_sum'] = 0.25*df['lgb_preds'] + 0.25*df['lgb_preds_2']+0.25*df['lgb_preds_3']+0.25*df['lgb_preds_4']

In [None]:
# Inspect the frame again now that the stacked prediction columns have been added.
df.head()

In [None]:
# Rebuild the model matrices so they include the prediction columns added to df
# above. df holds train rows followed by test rows, so ready_df is split at
# traindex.shape[0]. CSR is requested directly because the CV loops slice rows.
X = hstack([csr_matrix(df.loc[traindex,:].values),ready_df[0:traindex.shape[0]]], format='csr')
testing = hstack([csr_matrix(df.loc[testdex,:].values),ready_df[traindex.shape[0]:]], format='csr')

# Feature names = dense column names followed by the vectorizer vocabulary.
# They are taken from the fitted vectorizer instead of prepending onto the
# existing tfvocab, so running this cell twice no longer duplicates the dense
# names.
tfvocab = df.columns.tolist() + vectorizer.get_feature_names()
assert len(tfvocab) == X.shape[1], "feature names must line up with the columns of X"

for shape in [X,testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ",len(tfvocab))

In [None]:
#del df
#gc.collect();
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=230)
#for shape in [X_train, X_valid]:
#    print("{} Rows and {} Cols".format(*shape.shape))

In [None]:
# List the dense feature columns currently in df.
df.columns.tolist()

In [None]:
# Show the size and the first entries only; echoing the full feature-name list
# floods the notebook output.
len(tfvocab), tfvocab[:20]

In [None]:
# K-fold LightGBM on the combined dense + bag-of-words matrix; the test
# predictions of all folds are averaged.
print("Light Gradient Boosting Regressor")
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 15,
    #'num_leaves': 32,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.9,
    'bagging_freq': 2,
    'learning_rate': 0.02,
    'verbose': 0
}

pred_test_full =0
cv_score = []
NFOLDS=10
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

# Loop-invariant work hoisted out of the fold loop: one CSR view of X instead
# of two conversions per fold, and a plain array for y because the fold indices
# are row positions while y is indexed by item_id.
X_csr = X.tocsr()
y_values = np.asarray(y)

# Started once before the loop so the "Model Runtime" printed by the submission
# cells covers the whole CV run; it used to be reset at the start of every fold.
modelstart = time.time()

for i, (train_index, test_index) in enumerate(kf):
    print('\nFold {}'.format(i))

    xtr,xvl = X_csr[train_index],X_csr[test_index]
    ytr,yvl = y_values[train_index],y_values[test_index]

    lgtrain = lgb.Dataset(xtr, ytr,
                feature_name=tfvocab,
                categorical_feature = categorical)
    lgvalid = lgb.Dataset(xvl, yvl,
                feature_name=tfvocab,
                categorical_feature = categorical)

    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=16000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=150,
        verbose_eval=100
    )
    pred_test = lgb_clf.predict(testing)
    pred_test_full += pred_test
    cv_score.append(lgb_clf.best_score)

del X_csr

# Average of the per-fold test predictions.
pred_lgb = pred_test_full/NFOLDS

In [None]:
import xgboost as xgb

# Booster settings. 'nrounds' is not a booster parameter; it is kept in this
# dict only as the boosting-round budget and is filtered out before training.
Dparam = {'objective' : "reg:logistic",
          'booster' : "gbtree",
          'eval_metric' : "rmse",
          'nthread' : 8,
          'eta':0.07,
          'max_depth':18,
          'min_child_weight': 2,
          'gamma' :0,
          'subsample':0.7,
          'colsample_bytree':0.7,
          'alpha':0,  # was misspelled 'aplha', so the setting never reached XGBoost
          'lambda':0,
          'nrounds' : 1700}
xgb_train_params = {key: value for key, value in Dparam.items() if key != 'nrounds'}

pred_test_full_xgb=0
cv_score_xgb = []
NFOLDS=10
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

# Loop-invariant work hoisted out of the fold loop: one CSR view of X, a plain
# array for y (fold indices are row positions while y is indexed by item_id),
# and a single DMatrix for the test set.
X_csr = X.tocsr()
y_values = np.asarray(y)
dtest = xgb.DMatrix(data = testing)

for i, (train_index, test_index) in enumerate(kf):
    print('\nFold {}'.format(i))
    xtr,xvl = X_csr[train_index],X_csr[test_index]
    ytr,yvl = y_values[train_index],y_values[test_index]
    dtrain =xgb.DMatrix(data = xtr, label = ytr)
    dval =xgb.DMatrix(data = xvl, label = yvl)
    watchlist = [(dval, 'eval')]
    print("Training Model")
    m_xgb=xgb.train(params=xgb_train_params,dtrain=dtrain,
                    num_boost_round=Dparam['nrounds'],
                    early_stopping_rounds=100,evals=watchlist)
    # After early stopping the booster still holds the rounds trained past the
    # best one. Where the attribute exists, limit prediction to the best
    # iteration; otherwise fall back to the library's default behaviour.
    best_ntree_limit = getattr(m_xgb, 'best_ntree_limit', None)
    if best_ntree_limit:
        pred_test_xgb = m_xgb.predict(dtest, ntree_limit=best_ntree_limit)
    else:
        pred_test_xgb = m_xgb.predict(dtest)
    pred_test_full_xgb += pred_test_xgb
    cv_score_xgb.append(m_xgb.best_score)

del X_csr

# Average of the per-fold test predictions.
pred_xgb = pred_test_full_xgb/NFOLDS

In [None]:
# Per-fold best_score entries collected by the LightGBM CV loop.
cv_score

In [None]:
print("Model Evaluation Stage")
# Fold-averaged LightGBM test predictions.
lgpred = pred_lgb
# Idea not yet evaluated: mix LightGBM with ridge, e.g.
#blend = 0.95*lgpred + 0.05*ridge_oof_test[:,0]
lgsub = pd.DataFrame(lgpred,columns=["deal_probability"],index=testdex)
# Keep predictions between 0 and 1. The clipped column is assigned back rather
# than clipped in place on a column selection, which does not update the frame
# under pandas copy-on-write.
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)
lgsub.to_csv("lgsub_Stacking_CV_5.csv",index=True,header=True)
# modelstart is set in the LightGBM CV cell.
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

In [None]:
# Write the LightGBM Dataset objects to binary files.
# NOTE(review): lgtrain / lgvalid are whatever the last iteration of the CV loop
# left behind, and they were built with feature_name=tfvocab although the file
# names say "withouttxt" — confirm this is the data meant to be saved.
lgtrain.save_binary('trainwithouttxt.bin')
lgvalid.save_binary('validwithouttxt.bin')

In [None]:
print("Model Evaluation Stage")
# Fold-averaged XGBoost test predictions.
xgpred = pred_xgb
xgsub = pd.DataFrame(xgpred,columns=["deal_probability"],index=testdex)
# Keep predictions between 0 and 1. The clipped column is assigned back rather
# than clipped in place on a column selection, which does not update the frame
# under pandas copy-on-write.
xgsub['deal_probability'] = xgsub['deal_probability'].clip(0.0, 1.0)
xgsub.to_csv("xgp_Stacking_CV_5.csv",index=True,header=True)
# NOTE(review): modelstart is set in the LightGBM CV cell, so this figure is not
# an XGBoost-only runtime.
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

In [None]:
print("Model Evaluation Stage")
# Equal-weight blend of the XGBoost and LightGBM fold averages. pred_lgb is read
# directly (lgpred is only an alias for it), so this cell no longer depends on
# the LightGBM submission cell having been run.
xgpred = 0.5*pred_xgb + 0.5*pred_lgb
xgsub = pd.DataFrame(xgpred,columns=["deal_probability"],index=testdex)
# Keep predictions between 0 and 1. The clipped column is assigned back rather
# than clipped in place on a column selection, which does not update the frame
# under pandas copy-on-write.
xgsub['deal_probability'] = xgsub['deal_probability'].clip(0.0, 1.0)
xgsub.to_csv("xgp_lgb_Stacking_CV_5.csv",index=True,header=True)
# NOTE(review): modelstart is set in the LightGBM CV cell, so this figure does
# not describe the blend itself.
print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

In [None]:
# Completion marker printed by the last cell of the notebook.
print("ALL done ....")