In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# Load the training and test sets from the project repository.
data = pd.read_csv('https://raw.githubusercontent.com/NuttakitDW/DeepWork_216/master/train_data.csv')
test_X = pd.read_csv('https://raw.githubusercontent.com/NuttakitDW/DeepWork_216/master/test_data.csv')

# Target is the `income` column; the features are every remaining column
# except the `id` row identifier.
y = data['income']
X = data.drop(columns=['income', 'id'])

# Separate feature columns by type: the three listed columns are treated as
# categorical, and every other feature column is treated as numerical.
categorical_cols = ['gender', 'ocp_cd', 'age']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Numerical features: impute missing values with SimpleImputer's 'constant'
# strategy (no explicit fill_value is supplied, so the imputer's default applies).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: impute with the most frequent value, then one-hot
# encode with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Gradient-boosted regressor for the income target.
# The objective is spelled "reg:squarederror": the previous "reg:linear"
# spelling is a deprecated alias for the same squared-error loss and makes
# XGBoost emit a deprecation warning.
last_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,  # fixed seed so the row subsampling below is repeatable
    learning_rate=0.01,
    gamma=0.3,
    max_depth=10,
    colsample_bytree=1,
    subsample=0.7,
    reg_alpha=0,
    reg_lambda=3,
    min_child_weight=7,
    n_estimators=250,
)

# Chain preprocessing and the model so that fit/predict apply the same
# column transformations to whatever frame is passed in.
last_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', last_model),
])

# Fit the full pipeline on the training data, then predict on the test set.
# Pipeline.fit returns the fitted pipeline, so the two steps can be chained.
preds = last_pipeline.fit(X, y).predict(test_X)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [3]:
# Pair each test-row id with its predicted income.
output = test_X[['id']].assign(income=preds)
output

Unnamed: 0,id,income
0,50001,32257.798828
1,50002,24290.777344
2,50003,35694.187500
3,50004,22414.953125
4,50005,32482.945312
...,...,...
14995,64996,21468.076172
14996,64997,25508.445312
14997,64998,28111.898438
14998,64999,39637.992188


In [0]:
from google.colab import files

# Write the predictions to a CSV file (without the index column) and hand the
# file to the Colab download helper.
submission_file = 'O_216.csv'
output.to_csv(submission_file, index=False)
files.download(submission_file)