In [29]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pylab import rcParams
from scipy import stats
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [30]:
# Read the feature table and the target table from the two comma-separated
# files that sit next to the notebook (features first, then targets).
X, y = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("2019_08_19 train_X.csv", "2019_08_19 train_y.csv")
)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Baseline: a random forest fitted on every column of X. The target is passed
# as the first column of y_train, flattened to a 1-D array.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=12, random_state=42
).fit(X_train, y_train.values[:, 0])
y_pred = model.predict(X_test)

# NOTE: despite the name, this holds the hold-out R^2 score (higher is better).
full_set_err = r2_score(y_test, y_pred)
full_set_err

0.7145469713569776

In [32]:
# Summarise the feature table: column names, dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8869 entries, 0 to 8868
Data columns (total 18 columns):
Id               8869 non-null int64
DistrictId       8869 non-null int64
Rooms            8869 non-null float64
Square           8869 non-null float64
LifeSquare       8869 non-null float64
KitchenSquare    8869 non-null float64
Floor            8869 non-null int64
HouseFloor       8869 non-null float64
HouseYear        8869 non-null int64
Ecology_1        8869 non-null float64
Ecology_2        8869 non-null int64
Ecology_3        8869 non-null int64
Social_1         8869 non-null int64
Social_2         8869 non-null int64
Social_3         8869 non-null int64
Helthcare_2      8869 non-null int64
Shops_1          8869 non-null int64
Shops_2          8869 non-null int64
dtypes: float64(6), int64(12)
memory usage: 1.2 MB


In [33]:
def feature_test(X, y, best_r2):
    """Greedy backward feature elimination scored by hold-out R^2.

    Each round refits a random forest once per remaining column, leaving that
    column out, and records how much the hold-out R^2 changes compared with
    the current best score. The column whose removal gives the largest
    strictly positive gain is dropped, the best score is updated, and the
    procedure repeats until no single removal improves the score.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is MODIFIED IN PLACE: every eliminated column is
        removed from the caller's frame.
    y : pandas.DataFrame
        Target table; only its first column is used for fitting.
    best_r2 : float
        R^2 obtained with the full set of columns currently in ``X``.

    Returns
    -------
    float
        The best R^2 reached. This equals ``best_r2`` when no column was
        dropped.

    Notes
    -----
    The previous recursive version did not return the result of the recursive
    call, so it returned ``None`` whenever at least one column was dropped.
    This iterative version always returns the final score.
    """
    # Stop before the frame would be emptied: a model cannot be fitted on zero
    # features, so the last remaining column is never a drop candidate.
    while X.shape[1] > 1:
        best_gain = 0
        best_column = None
        round_best_r2 = best_r2

        for position, column in enumerate(list(X.keys()), start=1):
            X_drop = X.drop(column, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(
                X_drop, y, test_size=0.3, random_state=42
            )
            model = RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)
            model.fit(X_train, y_train.values[:, 0])
            temp_r2 = r2_score(y_test, model.predict(X_test))

            # Positive value: the model got better without this column.
            feature_score = temp_r2 - best_r2
            # Strict comparison keeps the first column on ties.
            if feature_score > best_gain:
                best_gain = feature_score
                best_column = column
                round_best_r2 = temp_r2
            print("{} {}: {} r2: {}".format(position, column, feature_score, temp_r2))

        if best_column is None:
            # No single removal improved the score; elimination is finished.
            break

        # The value printed after "r2:" is the gain over the previous best,
        # not the absolute score (message kept as in the original).
        print("drop: {}  r2: {}".format(best_column, best_gain))
        X.drop(best_column, axis=1, inplace=True)
        best_r2 = round_best_r2

    return best_r2


In [34]:
# Run the elimination starting from the baseline score of the full feature set.
# Side effect: feature_test drops columns from X in place, so X may have fewer
# columns after this cell than before it.
new_r2 = feature_test(X, y, full_set_err)
print(new_r2)

1 Id: 0.0016768647803856451 r2: 0.7162238361373633
2 DistrictId: -0.009161515314151991 r2: 0.7053854560428257
3 Rooms: -0.0015116012188902683 r2: 0.7130353701380874
4 Square: -0.017321666255808 r2: 0.6972253051011696
5 LifeSquare: 0.001378084033108351 r2: 0.715925055390086
6 KitchenSquare: -0.0015854447750178746 r2: 0.7129615265819598
7 Floor: -0.0015002723453709788 r2: 0.7130466990116067
8 HouseFloor: -0.0009616165371592045 r2: 0.7135853548198184
9 HouseYear: -0.0012940774868724914 r2: 0.7132528938701052
10 Ecology_1: -0.010995197205564633 r2: 0.703551774151413
11 Ecology_2: -0.0007677372292871354 r2: 0.7137792341276905
12 Ecology_3: -0.0004819407797096442 r2: 0.714065030577268
13 Social_1: -0.0009338736965572858 r2: 0.7136130976604204
14 Social_2: -0.0027553984282279176 r2: 0.7117915729287497
15 Social_3: -0.010729043271868965 r2: 0.7038179280851087
16 Helthcare_2: -0.000912137894621079 r2: 0.7136348334623566
17 Shops_1: -0.0007752348340235304 r2: 0.7137717365229541
18 Shops_2: -0.00

In [35]:
# Re-inspect X: feature_test removes columns in place, so this shows which
# columns are left after the elimination run.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8869 entries, 0 to 8868
Data columns (total 16 columns):
DistrictId       8869 non-null int64
Rooms            8869 non-null float64
Square           8869 non-null float64
KitchenSquare    8869 non-null float64
Floor            8869 non-null int64
HouseFloor       8869 non-null float64
HouseYear        8869 non-null int64
Ecology_1        8869 non-null float64
Ecology_2        8869 non-null int64
Ecology_3        8869 non-null int64
Social_1         8869 non-null int64
Social_2         8869 non-null int64
Social_3         8869 non-null int64
Helthcare_2      8869 non-null int64
Shops_1          8869 non-null int64
Shops_2          8869 non-null int64
dtypes: float64(5), int64(11)
memory usage: 1.1 MB


In [36]:
# Repeat the same 70/30 split (same seed) on the current contents of X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Same forest configuration as before, so the score is comparable.
model = RandomForestRegressor(
    n_estimators=1000, max_depth=12, random_state=42
).fit(X_train, y_train.values[:, 0])
y_pred = model.predict(X_test)

# Hold-out R^2 for the current feature set.
r2 = r2_score(y_test, y_pred)
r2

0.7176874530046737