# Черновик - содержит различные варианты

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

### 1) Читаем данные

In [2]:
data = pd.read_csv('input/train.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [4]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### 2) Разбиваем на трейн и валид

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [7]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price'],
      dtype='object')

### 3) Пишем функции для обработки данных

In [8]:
def clean_rooms(df):
    df.loc[df['Rooms'] > 5, 'Rooms'] = 5
    return df

In [9]:
def clean_square(df):
    df.loc[df['Square'] < 15, 'Square'] = 15
    df['LifeSquare'] = df['LifeSquare'].fillna(df['Square'] - df['KitchenSquare'])
    df.loc[df['Square'] < df['LifeSquare'] + df['KitchenSquare'], 'Square'] = df['LifeSquare'] + df['KitchenSquare']
    return df

In [10]:
def clean_year(df):
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = 2020
    return df

In [11]:
stat1 = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_dr'})
stat2 = train.groupby(['Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_r'})
mean_price = train['Price'].mean()

In [12]:
def add_mean_price(df, stat1, stat2, mean_price):
    df = pd.merge(df, stat1, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, stat2, on=['Rooms'], how='left')
    df['mean_price_r'] = df['mean_price_r'].fillna(mean_price)
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    return df

In [13]:
def remove_strings(df):
    df = pd.get_dummies(df, columns = ['Ecology_2', 'Ecology_3', 'Shops_2'])
    return df

In [24]:
def floors_checking(df):
    df['HouseFloor'] = df['HouseFloor'].astype(int)
    df.loc[df['HouseFloor'] <= 0, 'HouseFloor'] = df['Floor'] + 1 # Не на последнем
    to_change = df['Floor'] > df['HouseFloor']
    df.loc[to_change, 'Floor'], df.loc[to_change, 'HouseFloor'] = df.loc[to_change, 'HouseFloor'], df.loc[to_change,'Floor']
    df['LastFloor'] = 0
    df.loc[df['Floor'] == df['HouseFloor'], 'LastFloor'] = 1
    return df

In [15]:
def prepare_data(df, stat1=stat1, stat2=stat2, mean_price=mean_price):
    df = clean_rooms(df)
    df = clean_square(df)
    df = clean_year(df)
    df = add_mean_price(df, stat1, stat2, mean_price)
    df = remove_strings(df)
    df = floors_checking(df)
    df = df.drop(columns='Healthcare_1')
    return df

### 4) Чистка данных

Я решил не удалять строки - всё равно из Теста их нельзя удалять

### 5) Применяем функции на трейне и валиде

In [17]:
train = prepare_data(train)
valid = prepare_data(valid)

### 6) Строим модель, делаем предсказание на трейне и валиде

In [18]:
from sklearn.ensemble import RandomForestRegressor as RF

In [19]:
model = RF(n_estimators=25, max_depth=10, random_state=42)

In [20]:
feats = ['Rooms', 'Square', 'HouseYear', 'mean_price_dr']

In [21]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

### 7) Проверяем качество модели

In [22]:
from sklearn.metrics import r2_score

In [23]:
def pred_evaluate_model(df, model=model, feats=feats, calculate_r2='Yes'):
    pred = model.predict(df.loc[:, feats])
    if calculate_r2 == 'Yes':
        r2 = r2_score(df['Price'], pred)
        print(f'R2: {r2}')
    return pred

In [24]:
def evaluate_model(df, model=model, feats=feats):
    pred = model.predict(df.loc[:, feats])
    return r2_score(df['Price'], pred)

In [25]:
pred_train = pred_evaluate_model(train)

R2: 0.8686079229633344


In [26]:
pred_valid = pred_evaluate_model(valid)

R2: 0.6619841359378945


### 7.5) Исследование, что лучше подойдёт

#### Линейная регрессия

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [28]:
def check_regression(feats, train=train, valid=valid, do_scalar=True):
    lr = LinearRegression()
    scaler = StandardScaler()
    X_train = train.loc[:, feats]
    y_train = train['Price']
    X_valid = valid.loc[:, feats]
    y_valid = valid['Price']
    if do_scalar:
        X_train = pd.DataFrame(data=scaler.fit_transform(X_train), columns=feats)
        X_valid = pd.DataFrame(data=scaler.transform(X_valid), columns=feats)
    lr.fit(X_train, y_train)
    X_train['Price'] = y_train
    X_valid['Price'] = y_valid
    return f"{evaluate_model(X_train, model=lr, feats=feats)}" \
           f"\t{evaluate_model(X_valid, model=lr, feats=feats)}" \
           f"\t{feats}\t{do_scalar}"

In [29]:
print(check_regression(['Rooms', 'Square', 'HouseYear', 'mean_price_r']))

0.3352179935572549	0.326627168546189	['Rooms', 'Square', 'HouseYear', 'mean_price_r']	True


In [30]:
from sklearn.ensemble import RandomForestRegressor as RF

In [31]:
def check_forest(feats, n_estimators, max_depth, train=train, valid=valid):
    model = RF(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(train.loc[:, feats], train['Price'])
    return f"{evaluate_model(train, model=model, feats=feats)}" \
           f"\t{evaluate_model(valid, model=model, feats=feats)}" \
           f"\t{n_estimators}\t{max_depth}"

In [32]:
def check_forests(feats, train=train, valid=valid):
    for i in range(50):
        for j in range(30):
            try:
                model = RF(n_estimators=i+1, max_depth=j+1, random_state=42)
                model.fit(train.loc[:, feats], train['Price'])
                print(f"{evaluate_model(train, model=model, feats=feats)}" \
                      f"\t{evaluate_model(valid, model=model, feats=feats)}" \
                      f"\t{i+1}\t{j+1}")
            except:
                print(f"Cant\t{i+1}\t{j+1}")
    

In [33]:
import itertools

In [34]:
possible_feats = list(train.columns.values)
possible_feats.remove('Price')
possible_feats.remove('Id')
possible_feats.remove('Healthcare_1')

In [65]:
train.corr()['Price']

Id               0.019332
DistrictId       0.264105
Rooms            0.570405
Square           0.107811
LifeSquare       0.068150
KitchenSquare    0.020301
Floor            0.126098
HouseFloor       0.089321
HouseYear        0.035236
Ecology_1       -0.064129
Social_1         0.265278
Social_2         0.240089
Social_3         0.076234
Helthcare_2      0.253601
Shops_1          0.183545
Price            1.000000
mean_price_dr    0.813599
mean_price_r     0.575864
Ecology_2_A     -0.030568
Ecology_2_B      0.030568
Ecology_3_A      0.051992
Ecology_3_B     -0.051992
Shops_2_A       -0.061129
Shops_2_B        0.061129
Name: Price, dtype: float64

In [131]:
possible_feats = train.corr()['Price'][abs(train.corr()['Price']) > 0.25].index
possible_feats = list(possible_feats)
possible_feats.remove('Price')

In [132]:
print(possible_feats)

['DistrictId', 'Rooms', 'Social_1', 'Helthcare_2', 'mean_price_dr', 'mean_price_r']


In [None]:
for i in range(len(possible_feats)):
    for j in list(itertools.combinations(possible_feats, i+1)):
        try:
            print(check_regression(j))
            print(check_regression(j, do_scalar=False))
        except:
            pass
    

Результат сохранил в эксель-файле

### Select K Best

In [76]:
from sklearn.feature_selection import SelectKBest, f_regression

In [77]:
train.drop(columns='Price').shape

(7000, 23)

In [78]:
train['Price'].shape

(7000,)

In [79]:
kbest_model = SelectKBest(score_func=f_regression, k=4)

In [80]:
kbest = kbest_model.fit(train.drop(columns='Price'), train['Price'])

In [83]:
scores = pd.DataFrame(kbest.scores_)
scores['Name'] = train.drop(columns='Price').columns.values

In [84]:
scores

Unnamed: 0,0,Name
0,2.616245,Id
1,524.722434,DistrictId
2,3374.97057,Rooms
3,82.296195,Square
4,32.653477,LifeSquare
5,2.885139,KitchenSquare
6,113.071648,Floor
7,56.281015,HouseFloor
8,8.699437,HouseYear
9,28.898702,Ecology_1


### RFE

In [31]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

In [32]:
model = LogisticRegression()

In [33]:
rfe = RFE(model, 6)

In [32]:
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(train.loc[:, 'Price'])

In [None]:
fit = rfe.fit(train.drop(columns='Price'), training_scores_encoded)

### PCA

In [33]:
from sklearn.decomposition import PCA

In [35]:
X_train = train.drop(columns='Price')
y_train = train.loc[:, 'Price']

In [34]:
def do_pca(pca_comp, train=train, valid=valid):
    pca_feats = list('abcdefghijklmnopqrstuvwxyz'[0:pca_comp])
    pca = PCA(n_components=pca_comp)
    fit = pca.fit(train.drop(columns='Price'))
    X_train_pca = fit.transform(train.drop(columns='Price'))
    X_train_pca = pd.DataFrame(X_train_pca, columns=pca_feats)
    X_train_pca['Price'] = train['Price']
    X_valid_pca = fit.transform(valid.drop(columns='Price'))
    X_valid_pca = pd.DataFrame(X_valid_pca, columns=pca_feats)
    X_valid_pca['Price'] = valid.loc[:, 'Price']
    check_forests(pca_feats, train=X_train_pca, valid=X_valid_pca)    

In [None]:
for i in range(15):
    do_pca(i)

В Excel файле на листе RFPCA

### 8) Применяем функции обработки данных, чистим тестовые данные

In [None]:
test = pd.read_csv('input/test.csv')
test = prepare_data(test)

### 9) Делаем предсказание на тесте

In [None]:
test['Price'] = pred_evaluate_model(test, calculate_r2='No')

### 10) Записываем предсказания в файл

In [None]:
test.loc[:, ['Id', 'Price']].to_csv('VVashchenkov_predictions.csv', index=None)