In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Read the training and test tables; both are indexed by their 'id' column.
read_options = {'index_col': 'id'}
X_full = pd.read_csv('train_modified.csv', **read_options)
X_test_full = pd.read_csv('test_modified.csv', **read_options)

In [3]:
# Discard rows that have no target value, then split the target off the
# feature frame. `pop` returns the column and removes it from X_full in place.
# NOTE: this cell mutates X_full, so re-running it without reloading the data
# raises a KeyError because 'target' is already gone.
X_full.dropna(axis=0, subset=['target'], inplace=True)
y = X_full.pop('target')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Partition the training columns in a single pass:
#   num_cols - columns whose dtype is int64 or float64
#   cat_cols - object-dtype columns with fewer than 10 distinct values
# Columns matching neither rule are left out of both lists.
num_cols = []
cat_cols = []
for col in X_train_full.columns:
    if X_train_full[col].dtypes in ['int64', 'float64']:
        num_cols.append(col)
    if X_train_full[col].nunique() < 10 and X_train_full[col].dtype == 'object':
        cat_cols.append(col)

In [7]:
# Keep only the selected numeric and categorical columns, numeric first.
my_cols = [*num_cols, *cat_cols]

# Work on copies so later edits do not touch the *_full frames.
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()
X_test = X_test_full.loc[:, my_cols].copy()

In [8]:
# One-hot encode each split independently. The resulting column sets can
# differ between splits when a category is absent from one of them.
X_train, X_valid, X_test = map(pd.get_dummies, (X_train, X_valid, X_test))

In [9]:
# Give the validation and test frames exactly the training columns, in the
# training order (join='left' keeps X_train's columns and drops extras).
# A dummy column that exists in training but not in the other frame means the
# category never occurred there, so it is filled with 0 rather than the
# default NaN, which the model would otherwise treat as a missing value.
X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [11]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

In [12]:
# NOTE(review): no later cell visible here uses `preprocessor`; the model is
# fitted on the get_dummies frames instead. Confirm whether this is still needed.

# Numeric columns: replace missing values with the column mean.
num_trans = SimpleImputer(strategy='mean')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_trans = Pipeline(steps=cat_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, num_cols),
        ('cat', cat_trans, cat_cols),
    ]
)

In [14]:
# Gradient-boosted regressor with a fixed random_state.
my_model = XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=3)

my_model.fit(X_train, y_train)

preds = my_model.predict(X_valid)

pre = my_model.predict(X_train)

# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every version.
# First line printed: error on the training rows.
print('RMSE:', np.sqrt(mean_squared_error(y_train, pre)))

# Second line printed: error on the held-out validation rows.
print('RMSE:', np.sqrt(mean_squared_error(y_valid, preds)))



RMSE: 0.6715707072919757
RMSE: 0.7188082421704451


In [15]:
# Predict targets for the test features with the fitted model.
preds_test = my_model.predict(X_test)



In [16]:
# Build the submission table: one row per test id with its predicted target.
output = pd.DataFrame(dict(id=X_test.index, target=preds_test))

# Write it without the positional index so the file has only 'id' and 'target'.
output.to_csv('submission_3.csv', index=False)

In [None]:
# NOTE(review): these look like RMSE figures pasted from a different run
# (presumably train, then validation) - the settings that produced them are
# not recorded here; confirm or remove. As Python, each line is a value-less
# annotated name, so running this cell defines no variable.
RMSE: 0.6705650031716106
RMSE: 0.7229278409086914

In [23]:
# Preview the first rows of the encoded training features; left as a bare
# expression so the notebook renders the table.
X_train.head()

Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat7_F,cat7_G,cat7_I,cat8_A,cat8_B,cat8_C,cat8_D,cat8_E,cat8_F,cat8_G
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
298173,0.558652,0.364985,0.953177,0.261481,0.285042,0.366598,0.552537,0.24217,0.427087,0.413445,...,0,0,0,0,0,1,0,0,0,0
299785,0.542071,0.309224,0.736795,0.455526,0.475618,0.386943,0.334639,0.256481,0.441711,0.423982,...,0,0,0,0,0,1,0,0,0,0
141896,0.398638,0.619891,1.003411,0.543077,0.458128,0.361442,-0.179812,0.649192,0.408889,0.282222,...,0,0,0,0,0,1,0,0,0,0
214015,0.079594,0.216732,0.34316,0.426862,0.274832,0.648311,0.568285,0.768072,0.569046,0.781905,...,0,0,0,1,0,0,0,0,0,0
82082,0.514264,0.495396,0.602715,0.45514,0.964888,0.49595,0.432111,0.380077,0.264304,0.280083,...,0,0,0,0,0,0,0,1,0,0
