Задание:
Используя данные из train.csv, построить
модель для предсказания цен на недвижимость (квартиры).
С помощью полученной модели предсказать
цены для квартир из файла test.csv.

Целевая переменная:
Price

Основная метрика:
R2 - коэффициент детерминации (sklearn.metrics.r2_score)

Вспомогательная метрика:
MSE - средняя квадратичная ошибка (sklearn.metrics.mean_squared_error)

### Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as r2, mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import GridSearchCV

import warnings

warnings.filterwarnings('ignore')

### Initialization

In [2]:
data_train = pd.read_csv('train.csv')
train, valid = train_test_split(data_train, test_size=0.1, random_state=128)

In [3]:
train.shape, valid.shape

((9000, 20), (1000, 20))

Анализируем наличие незаполненных значениях

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9000 entries, 8097 to 7251
Data columns (total 20 columns):
Id               9000 non-null int64
DistrictId       9000 non-null int64
Rooms            9000 non-null float64
Square           9000 non-null float64
LifeSquare       7117 non-null float64
KitchenSquare    9000 non-null float64
Floor            9000 non-null int64
HouseFloor       9000 non-null float64
HouseYear        9000 non-null int64
Ecology_1        9000 non-null float64
Ecology_2        9000 non-null object
Ecology_3        9000 non-null object
Social_1         9000 non-null int64
Social_2         9000 non-null int64
Social_3         9000 non-null int64
Healthcare_1     4682 non-null float64
Helthcare_2      9000 non-null int64
Shops_1          9000 non-null int64
Shops_2          9000 non-null object
Price            9000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.4+ MB


Смотрим корреляцию признаков с целевой переменной

In [5]:
train.corr()['Price']

Id               0.013966
DistrictId       0.266921
Rooms            0.567005
Square           0.541523
LifeSquare       0.076603
KitchenSquare    0.048317
Floor            0.128285
HouseFloor       0.092363
HouseYear        0.004493
Ecology_1       -0.059476
Social_1         0.264162
Social_2         0.238851
Social_3         0.073613
Healthcare_1     0.137576
Helthcare_2      0.253654
Shops_1          0.180100
Price            1.000000
Name: Price, dtype: float64

In [6]:
train.describe()


Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,9000.0,9000.0,9000.0,9000.0,7117.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,4682.0,9000.0,9000.0,9000.0
mean,8389.974111,50.582,1.888556,56.231602,37.180653,6.126778,8.527333,12.631889,4213.011,0.118603,24.767333,5366.418667,7.975333,1145.282785,1.318778,4.229222,214379.326346
std,4867.567914,43.738639,0.822591,20.34051,90.293949,21.327909,5.233892,6.78149,211345.8,0.118734,17.53374,4008.725416,23.587475,1017.840296,1.488125,4.782503,93233.241122
min,0.0,0.0,0.0,1.136859,0.641822,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4176.5,20.0,1.0,41.774881,22.812328,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153778.470631
50%,8391.5,35.5,2.0,52.462026,32.752586,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192511.807794
75%,12605.25,75.0,2.0,65.846466,45.054036,9.0,12.0,17.0,2001.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0,249970.954618
max,16798.0,209.0,10.0,604.705972,7480.592129,1970.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


### Function prepare data

Обработка Rooms и подготовка признаков со средними значениями цены и площадей по комнатам

In [7]:
# Формируем статистику средних цен, площадей в зависимости от комнат
# Отсекаем квартиры с 0 комнатами и больше 5 комнат
def get_stat_Rooms_mean(df):
    parameters = {'group_columns': ['Rooms'], 
               'mean_columns': ['Square', 'LifeSquare', 'KitchenSquare', 'Price'],
               'mean_columns_rename': {'Square': 'mean_Square_r', 'LifeSquare': 'mean_LifeSquare_r',
                                       'KitchenSquare': 'mean_KitchenSquare_r', 'Price': 'mean_price_r' }
             }
    spam = df.groupby(['Rooms'], as_index=False)['Square', 'LifeSquare', 'KitchenSquare', 'Price'].mean().rename(
                                        columns = {'Square': 'mean_Square_r', 'LifeSquare': 'mean_LifeSquare_r',
                                       'KitchenSquare': 'mean_KitchenSquare_r', 'Price': 'mean_price_r' })
    spam = spam.loc[(spam['Rooms'] > 0)&(spam['Rooms'] < 6)]

    return spam
 

In [8]:
# Все квартиры с 0 комнат относим по площади к квартирам с большой количеством комнат
# Если количестов комнат больше 5, а площадь малая, то относим по площади к квартирам с меньшим количеством комнат
# Оставшиеся квартиры с 0 комнат относим к 1-комнатным
# Оставшиеся квартиры с количестовом комнат больше 5 оставляем как есть
def prepare_Rooms(df,  stat_r_mean):
    spam = df.loc[(df['Rooms'] == 0) |(df['Rooms'] > 5)]
    for eggs in stat_r_mean.itertuples():
        spam.loc[(
                    (spam['Square']>eggs[2]) &(spam['Rooms']<eggs[1])
                )|(
                    (spam['Square']<eggs[2]) &(spam['Rooms']>eggs[1])
                )
            , 'Rooms'] = eggs[1]


    df.loc[(df['Rooms'] == 0) | (df['Rooms'] > 5)] = spam  
    df.loc[(df['Rooms'] == 0), 'Rooms'] = 1
    df.loc[(df['Rooms']  > 5), 'Rooms'] = 5
    return df

Функции, обрабатывающие Square, LifeSquare, KitchenSquare, убирающие выбросы больших, малых значений

In [9]:
### Заполняем незаполненную жилую площадь
def fillna_LifeSqure(df):
    df['LifeSquare'] = df['LifeSquare'].fillna(df['Square'] - df['KitchenSquare'])
    return df

In [10]:
# Готовим данные по площади  
def clean_Square(df):
    # Меняем местмами жилую и общую площадь, если жилая больше общей
    df[['Square','LifeSquare']] = df[['LifeSquare','Square']].where(
            df['LifeSquare'] > df['Square'], df[['Square','LifeSquare']].values)
    
    # задаем средние значения для небольших площадей
    df.loc[df['Square'] < 15, 'Square'] = df.loc[df['Square'] < 15, 'mean_Square_r'] 
    df.loc[df['KitchenSquare'] < 4, 'KitchenSquare'] = df.loc[df['KitchenSquare'] < 4, 'mean_KitchenSquare_r']
    df.loc[df['LifeSquare'] < 10, 'LifeSquare'] = df.loc[df['LifeSquare'] < 10, 'mean_LifeSquare_r']
    
    # задаем средние значения для больших площадей
    df.loc[df ['Square'] > 400,'Square'] =  df.loc[df ['Square'] > 400, 'mean_Square_r']
    df.loc[df ['LifeSquare'] > 400,'Square'] =  df.loc[df ['LifeSquare'] > 400, 'mean_Square_r']
    df.loc[df ['KitchenSquare'] + df ['LifeSquare'] > df['Square'], 'KitchenSquare'] = df.loc[df ['KitchenSquare'] + df ['LifeSquare']  > df['Square'], 'mean_KitchenSquare_r']
    
    return df

Функции добавления признака средней цены по району и количеству комнат

In [11]:
# Получаем статистиу о средней цене по району_комнатам
def get_stat_District_Rooms_means(df):
    parameters = {'group_columns': ['DistrictId', 'Rooms'], 
               'mean_columns': ['Price'],
               'mean_columns_rename': {'Price': 'mean_price_dr' }
             }
    spam = df.groupby(['DistrictId', 'Rooms'], as_index=False)['Price'].mean().rename(columns={'Price': 'mean_price_dr' })

    return spam

In [12]:
# Добавляем среднее значение цены по району и количеству комнат, а также mean Square
def prepare_mean_price_square(df, stat_dr_mean, stat_r_mean, stat):
    df = pd.merge(df, stat_dr_mean, on = ['DistrictId', 'Rooms'] , how='left')
    df = pd.merge(df, stat_r_mean, on = ['Rooms'], how='left')
    df['mean_price_r'] = df['mean_price_r'].fillna(stat['mean_price'])
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    
    
    df['mean_Square_r'] = df['mean_Square_r'].fillna(stat['mean_Square'])
    df['mean_LifeSquare_r'] = df['mean_LifeSquare_r'].fillna(stat['mean_LifeSquare'])
    df['mean_KitchenSquare_r'] = df['mean_KitchenSquare_r'].fillna(stat['mean_KitchenSquare'])
    return df

Функция подготовки признака Year

In [13]:
# Убираем явные выбросы после 2020 года
def prepare_year_after2020(df):
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = df.loc[df['HouseYear'] > 2020, 'HouseYear'] % 2021
    return df

Функции подготовки HouseFloor, Floor

In [14]:
# Приводит этажность дома равную 0 к этажу квартиры
def prepare_HouseFloor(df):
    df.loc[df['HouseFloor'] == 0, 'HouseFloor'] = df['Floor'] 
    return df

In [15]:
# Если этаж квартиры меньше этажности дома, то меняем местами
def swap_HouseFloor_Floor(df):
    df[['HouseFloor','Floor']] = df[['Floor','HouseFloor']].where(
        df['Floor'] > df['HouseFloor'], df[['HouseFloor','Floor']].values) 
    return df

Работа с категориальными признаками

In [16]:
### Заполнение пропущенных Healthcare_1
def fillna_Healthcare_1(df):
    df['Healthcare_1'] = df['Healthcare_1'].fillna(0)
    return df

In [17]:
# Преобразование категориальных признаков в бинарные
# При анализе сильно ухудшили метрики и были исключены
def create_dummies(df, train_columns):
    df = pd.get_dummies(df)
    missing_cols = set( train_columns ) - set( df.columns )
    for c in missing_cols:
        df[c] = 0
    return df

In [18]:
# Формирование статистики для каждого категориального признака частоты в тренировочной выборке
def get_stat_ratio(df):
    spam = {}
    for item in categorical_columns():  
        distr_info = df[item].value_counts(normalize=True).reset_index().rename(columns={'index':item, 
                                                                               item:'ratio_'+item})
        spam[item] = {'group_columns': item, 
                      'ratio_columns': 'ratio_'+item,
                      'ratio_value':distr_info}
                      
      
    return spam

In [19]:
# Формирование статистики для каждого категориального признака средней цены в тренировочной выборке
def get_stat_mean(df):
    spam = {}
    for item in categorical_columns():  
        distr_info = df.groupby([item], as_index=False)['Price'].mean().rename(columns={'Price': 'mean_price_'+item})
        item_mean = df[item].mean()
        spam[item] = {'group_columns': item, 
                      'mean_columns': 'mean_price_'+item,
                      'mean_value':distr_info,
                       'mean_item': item_mean}
                      
      
    return spam

In [20]:
# Добавление признака частоты категориальных признаков
def add_ratio(df, stat):
    ratio_stat = stat['stat_ratio'] 
    for eggs in categorical_columns():    
        spam_stat = ratio_stat[eggs] 
        df = pd.merge(df, spam_stat['ratio_value'], on=spam_stat['group_columns'], how='left')
        df[spam_stat['ratio_columns']] = df[spam_stat['ratio_columns']].fillna(0)
        df[spam_stat['ratio_columns']] = df[spam_stat['ratio_columns']].astype(float)
    return df

In [21]:
# Добавление признака средней цены категориальных признаков
def add_mean_price_cat_columns(df, stat):
    ratio_stat = stat['stat_mean'] 
    for eggs in categorical_columns():    
        spam_stat = ratio_stat[eggs] 
        df = pd.merge(df, spam_stat['mean_value'], on=spam_stat['group_columns'], how='left')
        df[spam_stat['mean_columns']] = df[spam_stat['mean_columns']].fillna(spam_stat['mean_item'])
    return df

In [22]:
# Возвращает список категориальных признаков, найденых при анализе
def categorical_columns():
     return ['DistrictId', 'Social_1', 'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1']

In [23]:
# Возвращает список бинарных категориальных признаков, найденых при анализе
def binary_columns():
     return ['Ecology_2', 'Ecology_3', 'Shops_2']

In [24]:
# Возвращает список некатегориальных столбцов
def get_noncategorical_columns(df):
    return [c for c in df.columns if df[c].dtype.name != 'object']

In [25]:
# Приводит значения в столбцах из числового в категориальный
def type_to_categorical(df):
    for column_name in categorical_columns():
        df[column_name] = df[column_name].astype(object)
    return df

In [26]:
# Преобразует значения в бинарных категориальных столбцах в [0,1]
def prepare_binary_columns(df, cat_fts=binary_columns()):
    for col in cat_fts:
        df[col] = (df[col] == 'B').astype(int)
    return df

### Prepare data

Формируем статистики на тренировочной выборке

In [27]:
stat = {}
stat_r_mean = get_stat_Rooms_mean(train)
stat_dr_mean = get_stat_District_Rooms_means(train)
stat['mean_price'] = train['Price'].mean()
stat['mean_Square'] = train['Square'].mean()
stat['mean_LifeSquare'] = train['LifeSquare'].mean()
stat['mean_KitchenSquare'] = train['KitchenSquare'].mean()
stat['stat_ratio'] = get_stat_ratio(train)
stat['stat_mean'] = get_stat_mean(train)

Функция подготовки данных

In [28]:
def prepare_data(df, stat_r_mean = stat_r_mean, stat_dr_mean = stat_dr_mean, stat = stat, train_columns = train.columns):
    df = fillna_LifeSqure(df)
    df = fillna_Healthcare_1(df)
    
    df = prepare_Rooms(df, stat_r_mean)
    df = prepare_mean_price_square(df, stat_dr_mean, stat_r_mean, stat)
    df = prepare_year_after2020(df)
    df = clean_Square(df)
    df = prepare_HouseFloor(df)
    df = swap_HouseFloor_Floor(df)
    
    df = add_ratio(df, stat)
    df = add_mean_price_cat_columns(df, stat)
    df = type_to_categorical(df)
    df = prepare_binary_columns(df)
    df['Social_2'] = df['Social_2'].astype(int)
    # df = create_dummies(df, train_columns)
    return df

In [29]:
train = prepare_data(train)
valid = prepare_data(valid)

### Create Model

Формирование широкого списка признаков для расчета модели

In [30]:
feats = get_noncategorical_columns(train)
feats.remove('Price')
feats.remove('Id')

Проверка, что подготовка данных не удалили строки и не появились неопределенные значения

In [31]:
train[feats].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9000 entries, 0 to 8999
Data columns (total 29 columns):
Rooms                      9000 non-null float64
Square                     9000 non-null float64
LifeSquare                 9000 non-null float64
KitchenSquare              9000 non-null float64
Floor                      9000 non-null float64
HouseFloor                 9000 non-null int64
HouseYear                  9000 non-null int64
Ecology_1                  9000 non-null float64
Ecology_2                  9000 non-null int32
Ecology_3                  9000 non-null int32
Social_2                   9000 non-null int32
Shops_2                    9000 non-null int32
mean_price_dr              9000 non-null float64
mean_Square_r              9000 non-null float64
mean_LifeSquare_r          9000 non-null float64
mean_KitchenSquare_r       9000 non-null float64
mean_price_r               9000 non-null float64
ratio_DistrictId           9000 non-null float64
ratio_Social_1         

In [32]:
valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 37 columns):
Id                         1000 non-null int64
DistrictId                 1000 non-null object
Rooms                      1000 non-null float64
Square                     1000 non-null float64
LifeSquare                 1000 non-null float64
KitchenSquare              1000 non-null float64
Floor                      1000 non-null float64
HouseFloor                 1000 non-null int64
HouseYear                  1000 non-null int64
Ecology_1                  1000 non-null float64
Ecology_2                  1000 non-null int32
Ecology_3                  1000 non-null int32
Social_1                   1000 non-null object
Social_2                   1000 non-null int32
Social_3                   1000 non-null object
Healthcare_1               1000 non-null object
Helthcare_2                1000 non-null object
Shops_1                    1000 non-null object
Shops_2                    100

In [33]:
# Получение списка признаков, отсортированных по важности 
def get_importance(model, feats):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    return [feats[i] for i in indices]

In [34]:
# Проверка модели
# train, valid - DataFrames
# feats - columns
# isUseBestFindModel: нужно ли использовать ранее найденную модель или искать новую с помощью GridSearchCV

def check_model(train, valid, feats, isUseBestFindModel = True):
    if isUseBestFindModel:
        model = RF(random_state=128, n_estimators=200, max_depth=11, max_features= 6, min_samples_leaf=2)
        model.fit(train.loc[:, feats], train['Price'])
    else:
        parameters = [{'n_estimators': [100, 200, 300], 
               'max_features': np.arange(5, 9),
               'max_depth': np.arange(10, 14),
               'min_samples_leaf': np.arange(2, 5)
             }]
        model = GridSearchCV(estimator=RF(random_state=128),
                   param_grid=parameters,
                   cv=5)
        model.fit(train.loc[:, feats], train['Price'])
        print(model.best_params_)                                
        model = model.best_estimator_                                 
    
    pred_train = model.predict(train.loc[:, feats])
    pred_valid = model.predict(valid.loc[:, feats])
    metric_train = r2(train['Price'], pred_train), mse(train['Price'], pred_train)
    metric_valid = r2(valid['Price'], pred_valid), mse(valid['Price'], pred_valid)
    
    return metric_train, metric_valid, model

Лучшие параметры:
{'max_depth': 11,
 'max_features': 6,
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [35]:
# Поиск наиболее подходящих признаков для модели. Критерий r2 на valid
def find_best_feats(train, valid, feats, isUseBestFindModel = True):
    best_feats = []
    best_metric_train = None
    best_metric_valid = None
    best_model = None
    
    cur_feats = feats
    cur_metric_train = None
    cur_metric_valid = None
    cur_model = None
    
    cur_metric_train, cur_metric_valid, cur_model = check_model(train, valid, cur_feats, isUseBestFindModel) 
    best_metric_train, best_metric_valid, best_model = cur_metric_train, cur_metric_valid, cur_model
    importance_feats = get_importance(best_model, feats)
    for item in importance_feats:        
        cur_feats.remove(item)
        if len(cur_feats) > 0:
            cur_metric_train, cur_metric_valid, cur_model = check_model(train, valid, cur_feats, isUseBestFindModel) 
            if (best_metric_valid[0] < cur_metric_valid[0]):
                # print(f'{cur_metric_valid}:{cur_feats}')
                best_metric_train, best_metric_valid, best_model = cur_metric_train, cur_metric_valid, cur_model
                best_feats = cur_feats
            else:
                cur_feats.append(item)
    return best_metric_train, best_metric_valid, best_model, best_feats

In [36]:
metric_train, metric_valid, model, feats = find_best_feats(train, valid, feats, True)

Отсортированные по важности признаки лучшей модели

In [37]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature importances:")
for f, idx in enumerate(indices):
    print("{:2d}. feature '{:5s}' ({:.4f})".format(f + 1, feats[idx], importances[idx]))

Feature importances:
 1. feature 'Rooms' (0.2513)
 2. feature 'LifeSquare' (0.1261)
 3. feature 'ratio_DistrictId' (0.0988)
 4. feature 'mean_price_Social_1' (0.0935)
 5. feature 'HouseYear' (0.0854)
 6. feature 'KitchenSquare' (0.0739)
 7. feature 'mean_Square_r' (0.0533)
 8. feature 'mean_price_Healthcare_1' (0.0482)
 9. feature 'Ecology_1' (0.0347)
10. feature 'HouseFloor' (0.0271)
11. feature 'ratio_Social_3' (0.0237)
12. feature 'ratio_Social_1' (0.0234)
13. feature 'ratio_Shops_1' (0.0157)
14. feature 'ratio_Healthcare_1' (0.0155)
15. feature 'Shops_2' (0.0150)
16. feature 'mean_price_Helthcare_2' (0.0135)
17. feature 'Ecology_2' (0.0010)
18. feature 'Square' (0.0000)


Метрики модели на train и valid

In [38]:
metric_train, metric_valid

((0.8633819053943148, 1187412265.2237215),
 (0.7383523085489845, 2097252069.321771))

Размеры не изменились

In [39]:
train.shape, valid.shape

((9000, 37), (1000, 37))

In [40]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9000 entries, 0 to 8999
Data columns (total 37 columns):
Id                         9000 non-null int64
DistrictId                 9000 non-null object
Rooms                      9000 non-null float64
Square                     9000 non-null float64
LifeSquare                 9000 non-null float64
KitchenSquare              9000 non-null float64
Floor                      9000 non-null float64
HouseFloor                 9000 non-null int64
HouseYear                  9000 non-null int64
Ecology_1                  9000 non-null float64
Ecology_2                  9000 non-null int32
Ecology_3                  9000 non-null int32
Social_1                   9000 non-null object
Social_2                   9000 non-null int32
Social_3                   9000 non-null object
Healthcare_1               9000 non-null object
Helthcare_2                9000 non-null object
Shops_1                    9000 non-null object
Shops_2                    90

In [41]:
valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 37 columns):
Id                         1000 non-null int64
DistrictId                 1000 non-null object
Rooms                      1000 non-null float64
Square                     1000 non-null float64
LifeSquare                 1000 non-null float64
KitchenSquare              1000 non-null float64
Floor                      1000 non-null float64
HouseFloor                 1000 non-null int64
HouseYear                  1000 non-null int64
Ecology_1                  1000 non-null float64
Ecology_2                  1000 non-null int32
Ecology_3                  1000 non-null int32
Social_1                   1000 non-null object
Social_2                   1000 non-null int32
Social_3                   1000 non-null object
Healthcare_1               1000 non-null object
Helthcare_2                1000 non-null object
Shops_1                    1000 non-null object
Shops_2                    100

##### Result:

In [42]:
test = pd.read_csv('test.csv')

In [43]:
test = prepare_data(test)

In [44]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 36 columns):
Id                         5000 non-null int64
DistrictId                 5000 non-null object
Rooms                      5000 non-null float64
Square                     5000 non-null float64
LifeSquare                 5000 non-null float64
KitchenSquare              5000 non-null float64
Floor                      5000 non-null float64
HouseFloor                 5000 non-null int64
HouseYear                  5000 non-null int64
Ecology_1                  5000 non-null float64
Ecology_2                  5000 non-null int32
Ecology_3                  5000 non-null int32
Social_1                   5000 non-null object
Social_2                   5000 non-null int32
Social_3                   5000 non-null object
Healthcare_1               5000 non-null object
Helthcare_2                5000 non-null object
Shops_1                    5000 non-null object
Shops_2                    50

In [45]:
test['Price'] = model.predict(test.loc[:, feats])

In [46]:
test.loc[:, ['Id', 'Price']].to_csv('SSurname.csv', index=None)