In [3]:
import torch
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Loading data

In [4]:
train_data = pd.read_csv("C://Users/н/Downloads/train.csv")
test_data = pd.read_csv("C://Users/н/Downloads/test.csv")

In [5]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Preprocessing

In [6]:
for data in train_data, test_data:
    for col in data.columns:
        if data[col].isna().sum() / data.shape[0] > 0.8:
            data.drop(columns=col, inplace=True)

In [7]:
y_train = train_data['SalePrice'].astype('float64')
train_data.drop(columns=['SalePrice', 'Id'], inplace=True)

In [8]:
id_col = test_data['Id']
test_data.drop(columns='Id', inplace=True)

In [9]:
X_train = pd.get_dummies(train_data, drop_first=True, dtype='float64').fillna(0)
X_test = pd.get_dummies(test_data, drop_first=True, dtype='float64').fillna(0)

In [10]:
for col in X_train.columns:
    if col not in X_test.columns:
        X_train.drop(columns=col, inplace=True)
        

## Simple RandomForestRegressor


In [38]:
from sklearn.ensemble import RandomForestRegressor

### Fitting & getting results

In [17]:
rf = RandomForestRegressor(n_estimators=300, max_depth=220, n_jobs=-1)
rf.fit(X_train, y_train)
rf_predict = rf.predict(X_test)

In [18]:
pd.DataFrame({'Id': id_col, 'SalePrice': rf_predict}).to_csv('submission.csv', index=None, header=True)

## CatBoost

In [11]:
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split

In [12]:
X_train_bst, X_test_bst, y_train_bst, y_test_bst = train_test_split(X_train, y_train, train_size=0.7)

### Fitting & getting results

In [13]:
model = CatBoostRegressor(random_state=1,
                         learning_rate=0.1,
                         use_best_model=True)

In [15]:
model.fit(np.array(X_train, dtype='float'), 
          np.array(y_train, dtype='float'), 
          eval_set=(X_test_bst, y_test_bst),
          verbose=10) 

0:	learn: 74306.2653485	test: 74684.6597798	best: 74684.6597798 (0)	total: 189ms	remaining: 3m 8s
10:	learn: 43953.2640387	test: 46025.4352840	best: 46025.4352840 (10)	total: 317ms	remaining: 28.5s
20:	learn: 31404.7233269	test: 33888.7622726	best: 33888.7622726 (20)	total: 437ms	remaining: 20.4s
30:	learn: 25457.7730602	test: 27688.1770245	best: 27688.1770245 (30)	total: 560ms	remaining: 17.5s
40:	learn: 22591.5133995	test: 24453.0816761	best: 24453.0816761 (40)	total: 699ms	remaining: 16.4s
50:	learn: 20759.4587279	test: 22542.3055800	best: 22542.3055800 (50)	total: 769ms	remaining: 14.3s
60:	learn: 19297.1354211	test: 20639.7803704	best: 20639.7803704 (60)	total: 900ms	remaining: 13.8s
70:	learn: 18245.9995742	test: 19318.3217709	best: 19318.3217709 (70)	total: 1.02s	remaining: 13.3s
80:	learn: 17441.3928733	test: 18432.8278538	best: 18432.8278538 (80)	total: 1.19s	remaining: 13.5s
90:	learn: 16858.2205792	test: 17819.9019508	best: 17819.9019508 (90)	total: 1.33s	remaining: 13.3s
10

830:	learn: 3150.3886676	test: 3171.0943607	best: 3171.0943607 (830)	total: 10s	remaining: 2.04s
840:	learn: 3083.2808754	test: 3102.2667951	best: 3102.2667951 (840)	total: 10.2s	remaining: 1.93s
850:	learn: 3036.6450812	test: 3063.8202713	best: 3063.8202713 (850)	total: 10.3s	remaining: 1.8s
860:	learn: 2969.7670229	test: 2999.8324619	best: 2999.8324619 (860)	total: 10.4s	remaining: 1.68s
870:	learn: 2911.1547836	test: 2927.3753167	best: 2927.3753167 (870)	total: 10.6s	remaining: 1.57s
880:	learn: 2865.7332161	test: 2879.0682098	best: 2879.0682098 (880)	total: 10.7s	remaining: 1.44s
890:	learn: 2817.7941153	test: 2837.5295091	best: 2837.5295091 (890)	total: 10.8s	remaining: 1.32s
900:	learn: 2771.9883098	test: 2783.0197647	best: 2783.0197647 (900)	total: 10.9s	remaining: 1.2s
910:	learn: 2715.1916961	test: 2725.3256496	best: 2725.3256496 (910)	total: 11.1s	remaining: 1.08s
920:	learn: 2668.7685409	test: 2676.8233653	best: 2676.8233653 (920)	total: 11.2s	remaining: 959ms
930:	learn: 26

<catboost.core.CatBoostRegressor at 0x273ebfc9090>

In [16]:
y_pred = model.predict(np.array(X_test, dtype='float'))

In [17]:
pd.DataFrame({'Id': id_col, 'SalePrice': y_pred}).to_csv('submission.csv', index=None, header=True)