In [6]:
from catboost import  CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from tsfresh import extract_relevant_features
from tsfresh.examples.robot_execution_failures import load_robot_execution_failures
import geopandas as gpd
import numpy as np
import tqdm
import pandas as pd
import datetime
import requests
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.metrics import mean_squared_error

In [7]:
def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """ Проходит по всем столбцам DataFrame и изменяет тип данных
        для уменьшения использования памяти.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype.name
        
        # Пропускаем столбцы с типами object, category и datetime
        if col_type not in ['object', 'category', 'datetime64[ns]']:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Проверка на float
                if str(col_type)[:5] == 'float':
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
train = pd.read_csv('/kaggle/input/london-house-price-prediction-advanced-techniques/train.csv')
train = reduce_mem_usage(train)

Memory usage of dataframe is 34.54 MB
Memory usage after optimization is: 20.07 MB
Decreased by 41.9%


In [9]:
train = train.drop(columns=['ID'])
train

Unnamed: 0,fullAddress,postcode,country,outcode,latitude,longitude,bathrooms,bedrooms,floorAreaSqM,livingRooms,tenure,propertyType,currentEnergyRating,sale_month,sale_year,price
0,"38 Adelina Grove, London, E1 3AD",E1 3AD,England,E1,51.53125,-0.053253,,3.0,80.0,1.0,Freehold,Semi-Detached House,C,1,1995,77000
1,"6 Cleveland Grove, London, E1 4XL",E1 4XL,England,E1,51.53125,-0.053375,2.0,4.0,110.0,1.0,Leasehold,Terrace Property,D,1,1995,89995
2,"65 Sanderstead Road, London, E10 7PW",E10 7PW,England,E10,51.56250,-0.034882,1.0,3.0,84.0,1.0,Freehold,Terrace Property,D,1,1995,59000
3,"5 Queenswood Gardens, London, E11 3SE",E11 3SE,England,E11,51.56250,0.026291,,2.0,72.0,1.0,Leasehold,Purpose Built Flat,,1,1995,51500
4,"12 Woodlands Road, London, E11 4RW",E11 4RW,England,E11,51.56250,0.006260,1.0,3.0,104.0,1.0,Freehold,Mid Terrace House,D,1,1995,63500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266320,"Flat 5, 4 Acton Street, London, WC1X 9NA",WC1X 9NA,England,WC1X,51.53125,-0.116089,2.0,2.0,83.0,1.0,Leasehold,Flat/Maisonette,E,12,2023,800000
266321,"7 Spring House, Margery Street, London, WC1X 0HT",WC1X 0HT,England,WC1X,51.53125,-0.112000,1.0,2.0,69.0,1.0,Leasehold,Converted Flat,C,12,2023,550000
266322,"Flat 2, Goldsmith Court, Stukeley Street, Lond...",WC2B 5LF,England,WC2B,51.53125,-0.123596,1.0,2.0,50.0,1.0,Leasehold,Purpose Built Flat,C,12,2023,730000
266323,"Unit 205, 25 Floral Street, London, WC2E 9DS",WC2E 9DS,England,WC2E,51.50000,-0.125366,2.0,2.0,130.0,1.0,Leasehold,Flat/Maisonette,B,12,2023,3275000


In [10]:
train.fillna(-1, inplace=True)
y = train['price']
train = train.drop(columns=['price'])

In [11]:
train

Unnamed: 0,fullAddress,postcode,country,outcode,latitude,longitude,bathrooms,bedrooms,floorAreaSqM,livingRooms,tenure,propertyType,currentEnergyRating,sale_month,sale_year
0,"38 Adelina Grove, London, E1 3AD",E1 3AD,England,E1,51.53125,-0.053253,-1.0,3.0,80.0,1.0,Freehold,Semi-Detached House,C,1,1995
1,"6 Cleveland Grove, London, E1 4XL",E1 4XL,England,E1,51.53125,-0.053375,2.0,4.0,110.0,1.0,Leasehold,Terrace Property,D,1,1995
2,"65 Sanderstead Road, London, E10 7PW",E10 7PW,England,E10,51.56250,-0.034882,1.0,3.0,84.0,1.0,Freehold,Terrace Property,D,1,1995
3,"5 Queenswood Gardens, London, E11 3SE",E11 3SE,England,E11,51.56250,0.026291,-1.0,2.0,72.0,1.0,Leasehold,Purpose Built Flat,-1,1,1995
4,"12 Woodlands Road, London, E11 4RW",E11 4RW,England,E11,51.56250,0.006260,1.0,3.0,104.0,1.0,Freehold,Mid Terrace House,D,1,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266320,"Flat 5, 4 Acton Street, London, WC1X 9NA",WC1X 9NA,England,WC1X,51.53125,-0.116089,2.0,2.0,83.0,1.0,Leasehold,Flat/Maisonette,E,12,2023
266321,"7 Spring House, Margery Street, London, WC1X 0HT",WC1X 0HT,England,WC1X,51.53125,-0.112000,1.0,2.0,69.0,1.0,Leasehold,Converted Flat,C,12,2023
266322,"Flat 2, Goldsmith Court, Stukeley Street, Lond...",WC2B 5LF,England,WC2B,51.53125,-0.123596,1.0,2.0,50.0,1.0,Leasehold,Purpose Built Flat,C,12,2023
266323,"Unit 205, 25 Floral Street, London, WC2E 9DS",WC2E 9DS,England,WC2E,51.50000,-0.125366,2.0,2.0,130.0,1.0,Leasehold,Flat/Maisonette,B,12,2023


In [12]:
import re
def extract_street(address):
    # Используем регулярное выражение для поиска названия улицы
    match = re.search(r'(\d+\s+)(.*?)(,\s+.*)', address)
    if match:
        return match.group(2)  # Возвращаем только название улицы
    else:
        return None  # Если не найдено, возвращаем None

street_name = 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

def train_and_compare_models(X, y):
    """
    Обучает три модели (LightGBM, XGBoost и CatBoost) и сравнивает их по трем метрикам.

    :param X: Признаки (DataFrame или NumPy массив)
    :param y: Целевая переменная (Series или NumPy массив)
    :return: DataFrame с результатами сравнения моделей по метрикам
    """
    
    # Разделяем данные на обучающую и тестовую выборки
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Инициализация моделей
    models = {
        'LightGBM': lgb.LGBMClassifier(),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'CatBoost': CatBoostClassifier(silent=True)
    }

    # Словарь для хранения результатов
    results = {}

    # Обучение и оценка моделей
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Вероятности для ROC AUC

        # Вычисляем метрики
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # Для RMSE используем вероятности предсказаний
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_proba))

        # Сохраняем результаты в словаре
        results[name] = {
            'Accuracy': accuracy,
            'F1 Score': f1,
            'ROC AUC': roc_auc,
            'RMSE': rmse
        }

    # Преобразуем результаты в DataFrame для удобного отображения
    results_df = pd.DataFrame(results).T

    return results_df

# Пример использования функции
# Предположим, что у вас есть DataFrame X с признаками и Series y с целевой переменной.
# results = train_and_compare_models(X, y)
# print(results)

In [4]:
Test = pd.read_csv('/kaggle/input/london-house-price-prediction-advanced-techniques/test.csv')
Test

Unnamed: 0,ID,fullAddress,postcode,country,outcode,latitude,longitude,bathrooms,bedrooms,floorAreaSqM,livingRooms,tenure,propertyType,currentEnergyRating,sale_month,sale_year
0,266325,"Flat 7, Philip House, Heneage Street, London, ...",E1 5LW,England,E1,51.518783,-0.071003,1.0,2.0,54.0,1.0,Leasehold,Flat/Maisonette,D,1,2024
1,266326,"17 Bowmans Mews, London, E1 8RY",E1 8RY,England,E1,51.511624,-0.068236,1.0,1.0,48.0,1.0,Leasehold,Purpose Built Flat,D,1,2024
2,266327,"Flat 30, Everard House, Boyd Street, London, E...",E1 1LY,England,E1,51.512737,-0.066502,1.0,,32.0,,Leasehold,Flat/Maisonette,C,1,2024
3,266328,"Flat 1, Wilton Court, Cavell Street, London, E...",E1 2BN,England,E1,51.516165,-0.057334,1.0,3.0,85.0,1.0,Leasehold,Purpose Built Flat,D,1,2024
4,266329,"45 Musbury Street, London, E1 0PJ",E1 0PJ,England,E1,51.515115,-0.051008,1.0,2.0,82.0,1.0,Leasehold,Flat/Maisonette,C,1,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16542,282867,"Flat 479, Russell Court, 3–16 Woburn Place, Lo...",WC1H 0NL,England,WC1H,51.523417,-0.126119,1.0,,24.0,1.0,Leasehold,Purpose Built Flat,C,8,2024
16543,282868,"Flat 479, Russell Court, 3–16 Woburn Place, Lo...",WC1H 0NL,England,WC1H,51.523417,-0.126119,1.0,,24.0,1.0,Leasehold,Purpose Built Flat,C,8,2024
16544,282869,"Flat 93, Clare Court, Judd Street, London, WC1...",WC1H 9QW,England,WC1H,51.526629,-0.123421,1.0,1.0,42.0,1.0,Leasehold,Flat/Maisonette,D,8,2024
16545,282870,"Flat 1, 28 King's Mews, London, WC1N 2JB",WC1N 2JB,England,WC1N,51.521686,-0.114010,,,,,,,B,8,2024
