In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Read the training and test tables, using the `id` column as the row index.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Discard rows without a target, then separate the target from the predictors.
X_full = X_full.dropna(subset=['target'], axis=0)
y = X_full['target']
X_full = X_full.drop(columns=['target'])

In [5]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Show the column labels of X_full.
X_full.columns

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
       'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [12]:
# Compare the distinct categories of 'cat9' in the training and validation splits.
# Uses the X_train / X_valid frames produced by train_test_split; the previous
# names (X_train_full / X_valid_full) are not defined anywhere in this notebook.
train_cat9 = X_train['cat9'].unique()
valid_cat9 = X_valid['cat9'].unique()
print("Unique values in 'cat9' column in training data:", train_cat9, len(train_cat9))
print("\nUnique values in 'cat9' column in validation data:", valid_cat9, len(valid_cat9))

Unique values in 'Condition2' column in training data: ['B' 'K' 'I' 'J' 'F' 'N' 'G' 'H' 'L' 'O' 'D' 'E' 'M' 'A' 'C'] 15

Unique values in 'Condition2' column in validation data: ['I' 'O' 'F' 'L' 'J' 'B' 'N' 'A' 'G' 'H' 'K' 'M' 'C' 'D' 'E'] 15


In [14]:
# All categorical columns (those whose dtype is "object")
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value that occurs in the
# validation split also occurs in the training split.
good_label_cols = [
    col for col in object_cols
    if set(X_valid[col]).issubset(set(X_train[col]))
]

# Problematic columns that will be dropped from the dataset.
# Derived in column order (rather than via a set difference) so the list is
# the same on every run.
safe_cols = set(good_label_cols)
bad_label_cols = [col for col in object_cols if col not in safe_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

Categorical columns that will be dropped from the dataset: []


In [16]:
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Ordinal-encode the categorical features.
# Columns flagged as problematic (bad_label_cols) are dropped and only the
# columns whose validation categories all occur in training (good_label_cols)
# are encoded, matching what the previous cell reports. `drop` returns a new
# frame, so X_train / X_valid themselves are left untouched.
OE = OrdinalEncoder()

label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Fit the category -> integer mapping on the training split only, then reuse it.
label_X_train[good_label_cols] = OE.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = OE.transform(X_valid[good_label_cols])

In [19]:
# Random forest baseline: 100 trees, fixed seed for reproducible fits.
my_model = RandomForestRegressor(
    n_estimators=100,
    random_state=1,
)

In [21]:
# Train the random forest on the encoded training split.
my_model.fit(X=label_X_train, y=y_train)

RandomForestRegressor(random_state=1)

# ...

In [22]:
# Score the random forest on the data it was fitted on and on the hold-out split.
pr = my_model.predict(label_X_train)
preds = my_model.predict(label_X_valid)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version.
# First line: training RMSE; second line: validation RMSE.
print('RMSE:', np.sqrt(mean_squared_error(y_train, pr)))
print('RMSE:', np.sqrt(mean_squared_error(y_valid, preds)))

RMSE: 0.2755117608474084
RMSE: 0.7335278384292354


In [25]:
# Apply the same preprocessing to the test set as to the training/validation
# splits: drop the columns flagged as problematic and encode the remaining
# categorical columns with the encoder fitted on the training data.
# NOTE(review): the test set is not checked for categories unseen in training;
# OE.transform will raise if any are present - confirm against test.csv.
label_X_test = X_test_full.drop(columns=bad_label_cols)
label_X_test[good_label_cols] = OE.transform(X_test_full[good_label_cols])

In [27]:
# Random forest predictions for the encoded test set.
preds_test = my_model.predict(X=label_X_test)

In [29]:
# Pair each test id with its random forest prediction and write the submission file.
output = pd.DataFrame(
    data={
        'id': X_test_full.index,
        'target': preds_test,
    }
)
output.to_csv(path_or_buf='submission_2.csv', index=False)

In [30]:
from xgboost import XGBRegressor

In [31]:
# Gradient-boosted trees: 500 boosting rounds, learning rate 0.1, fixed seed.
my_model_1 = XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=1)

my_model_1.fit(label_X_train, y_train)

pred = my_model_1.predict(label_X_valid)

# Validation RMSE, computed as sqrt(MSE) instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE:', np.sqrt(mean_squared_error(y_valid, pred)))



RMSE: 0.7190911750303717


In [32]:
# XGBoost predictions for the encoded test set.
pred_test = my_model_1.predict(X=label_X_test)



In [34]:
# Pair each test id with its XGBoost prediction and write the submission file.
output1 = pd.DataFrame(
    data={
        'id': X_test_full.index,
        'target': pred_test,
    }
)
output1.to_csv(path_or_buf='submission_3.csv', index=False)