In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
train_import = pd.read_csv("/input/train.csv")

test_import = pd.read_csv("/input/test.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../input/train.csv'

In [None]:
X = train_import.drop(["target"], axis=1)
y = train_import["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Need to encode the categorical data. First, split categorical data from numerical, then encode, then recombine

# Split data 
# Categorical first
holder = (X_train.dtypes == 'object')
object_cols = list(holder[holder].index)

# Now Numerical Columns
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Encode split data
OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_columns_train = pd.DataFrame(OHE.fit_transform(X_train[object_cols]))
OH_columns_test = pd.DataFrame(OHE.transform(X_test[object_cols]))

OH_columns_train.index = X_train.index
OH_columns_test.index = X_test.index

# Now recombine the encoded categorical data with numerical data
OH_X_train = pd.concat([OH_columns_train, num_X_train], axis = 1)
OH_X_test = pd.concat([OH_columns_test, num_X_test], axis = 1)

In [None]:
# Build params dictionary
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample':1,
    'colsample_bytree':1,
    'objective': 'reg:linear',
    'eval_metric': 'mae'
}

# Instantiate models, fit and set predictions
model_xgb = XGBRegressor(n_estimators=999, 
                         random_state=42,
                         max_depth = 4,
                         min_child_weight = 1,
                         eta=.05,
                         subsample=1,
                         colsample_bytree=1)
model_xgb.fit(OH_X_train, y_train,
             early_stopping_rounds= 10,
             eval_set=[(OH_X_test, y_test)],
             verbose=False)
predictions_xgb = model_xgb.predict(OH_X_test)

In [None]:
# Instantiate second model for testing against
model_sgdc = SGDRegressor(random_state=42)
OH_X_train.head()
y_train.head()

model_sgdc.fit(OH_X_train, y_train)
predictions_sgdc = model_sgdc.predict(OH_X_test)


In [None]:
# Evaluate using RMSE against test data
rms_xgb = mean_squared_error(y_test, predictions_xgb, squared=False)
rms_sgdc = mean_squared_error(y_test, predictions_sgdc, squared = False)
print(f"XGBoost RMSE Score: {rms_xgb}")
print(f"SGDClassifier RMSE Score: {rms_sgdc}")

In [None]:
# Evaluate against full training set

# First encode full data set
# Need to encode the categorical data. First, split categorical data from numerical, then encode, then recombine

# Split data 
# Categorical first
holder = (X.dtypes == 'object')
object_cols = list(holder[holder].index)

# Now Numerical Columns
num_X_train = X.drop(object_cols, axis=1)

# Encode split data
OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_columns_train = pd.DataFrame(OHE.fit_transform(X[object_cols]))

OH_columns_train.index = X.index

# Now recombine the encoded categorical data with numerical data
OH_X = pd.concat([OH_columns_train, num_X_train], axis = 1)

In [None]:
# Now fit model on full training data
xgb_on_full_data = XGBRegressor(n_estimators=999, 
                         random_state=42,
                         max_depth = 4,
                         min_child_weight = 1,
                         eta=.05,
                         subsample=1,
                         colsample_bytree=1)

xgb_on_full_data.fit(OH_X, y)

In [None]:
# Evaluate on test data from separate CSV
# First encode test data

holder = (test_import.dtypes == 'object')
object_cols = list(holder[holder].index)

# Now Numerical Columns
num_X_train = test_import.drop(object_cols, axis=1)

# Encode split data
OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_columns_train = pd.DataFrame(OHE.fit_transform(test_import[object_cols]))

OH_columns_train.index = test_import.index

# Now recombine the encoded categorical data with numerical data
OH_test = pd.concat([OH_columns_train, num_X_train], axis = 1)


In [None]:
# Predict using encoded test data
test_preds = xgb_on_full_data.predict(OH_test)

In [None]:
# Generate submission CSV
submission = pd.DataFrame({"ID": OH_test.id,
                         "target": test_preds})
submission.to_csv("30dayscompsub.csv", index=False)