In [206]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.optimize import minimize

In [207]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [208]:
train_data = train_data.set_index("Id")
test_data = test_data.set_index("Id")
leakage = ['SaleCondition', 'MoSold', 'YrSold']
#useless = ['Electrical', 'SaleType']

train_data = train_data.dropna(axis = 0, subset=['SalePrice'])#scot liniile cu valori nule la SalePrice
y = train_data.SalePrice #label la date
train_data = train_data.drop(columns=['SalePrice']) #scot label-ul
train_data = train_data.drop(columns=leakage + useless)#scot scurgerile

test_data = test_data.drop(columns = leakage)

# train_data = train_data.fillna(value=0)
# test_data = test_data.fillna(value=0)

In [209]:
categorical_data = [x for x in train_data.columns if train_data[x].dtype =='object']
numerical_data = [x for x in train_data.columns if train_data[x].dtype != 'object']
ordinal_data = ['Alley', 'LotShape', 'Utilities', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual',
                'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence']

categorical_data = [x for x in categorical_data if x not in ordinal_data]

In [210]:
ordinal_data_dict = dict()
ordinal_data_dict.update({'Alley': ['NA', 'Grvl', 'Pave']})
ordinal_data_dict.update( {'LotShape':['IR3', 'IR2', 'IR1', 'Reg']})
ordinal_data_dict.update({'Utilities':['ELO', 'NoSeWa', 'NoSewr', 'AllPub']})
ordinal_data_dict.update({'ExterQual':['Po', 'Fa', 'TA', 'Gd', 'Ex']})
ordinal_data_dict.update({
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NA'],
    'BsmtCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NA'],
    'BsmtExposure': ['No', 'Mn', 'Av', 'Gd', 'NA'],
    'BsmtFinType1': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
})
ordinal_data_dict.update({
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    'FireplaceQu': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
    'GarageCond': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['NA', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    'GarageQual': ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
})

In [211]:
ordinal_encoder_dict = dict()
for x in ordinal_data:
    ordinal_encoder_dict.update({x: OrdinalEncoder(categories = [ordinal_data_dict[x]])})


In [212]:
categorical_transformer = Pipeline(steps= [('imputer', SimpleImputer(strategy="most_frequent")), ("onehotencoder", OneHotEncoder(handle_unknown='ignore'))])
ordinal_transformer_dict = dict()
for col in ordinal_data:
    ordinal_transformer_dict.update({col: Pipeline(steps=[('imputer', SimpleImputer(strategy="most_frequent")), ("ordinalencoder", ordinal_encoder_dict[col])])})
numerical_transformer = SimpleImputer(strategy='mean')

trsf = []
for col in ordinal_data:
    trsf.append((col + '_ord', ordinal_transformer_dict[col], [col]))
preprocessor = ColumnTransformer(
            transformers = trsf + [('cat', categorical_transformer, categorical_data), ('num', numerical_transformer, numerical_data)])
            #transformers = [('cat', categorical_transformer, categorical_data), ('num', numerical_transformer, numerical_data)])

In [225]:
n_estimators = 5000
learning_rate = 0.02

X_train, X_test, y_train, y_test = train_test_split(train_data, y)

my_pipeline = Pipeline(steps= [("preprocessor", preprocessor), ("model", XGBRegressor(n_estimators = n_estimators, learning_rate = learning_rate))])

In [229]:
# from sklearn.feature_selection import SequentialFeatureSelector

# X_train_processed = my_pipeline.named_steps['preprocessor'].transform(X_train)

# estimator = my_pipeline.named_steps['model']

# sfs = SequentialFeatureSelector(estimator=estimator, n_features_to_select=10, n_jobs = -1)

# sfs.fit(X_train_processed, y_train)


In [205]:
def score_pipeline(params):
    n_estimators = int(params[0])
    learning_rate = params[1]
    if n_estimators < 0 or learning_rate < 0:
        return 100000
    pipeline = Pipeline(steps= [("preprocessor", preprocessor), ("model", XGBRegressor(n_estimators = n_estimators, learning_rate = learning_rate))])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    error = mean_absolute_error(y_test, predictions)
    return error

initial_guess = (5000, 0.02)
score_pipeline(initial_guess)

15044.472816780823

In [199]:
my_pipeline.fit(X_train, y_train)
predictions = my_pipeline.predict(X_test)

In [200]:
error = mean_absolute_error(y_test, predictions)
error

15044.472816780823

In [201]:
my_pipeline.fit(train_data, y)

In [202]:
predictions_test = my_pipeline.predict(test_data)

In [203]:
raspuns = pd.Series(predictions_test, test_data.index, name="SalePrice")

In [204]:
raspuns.to_csv("raspuns.csv")