In [88]:
import pandas as pd

df_train = pd.read_csv('data/train.tsv', sep='\t')
df_test = pd.read_csv('data/test.tsv', sep='\t')
reviews = pd.read_csv('data/reviews.tsv', sep='\t')

In [89]:
pd.set_option('display.float_format', '{:.5f}'.format)

# Preprocessing

In [90]:
df_train = df_train[df_train['target'] > 0.9]

In [91]:
from ast import literal_eval

def parse_cords(coord_str):
    coords = literal_eval(coord_str)
    return float(coords[0]), float(coords[1])

def create_cords_features(df):
    df['longitude'] = df['coordinates'].apply(lambda x: parse_cords(x)[0])
    df['latitude'] = df['coordinates'].apply(lambda x: parse_cords(x)[1])
    df = df.drop('coordinates', axis=1)
    return df

df_test = create_cords_features(df_test)
df_train = create_cords_features(df_train)

In [92]:
df_train = df_train[
    ~(
        (df_train['latitude'] > 56.25) |   # —à–∏—Ä–æ—Ç–∞ –≤—ã—à–µ –ú–æ—Å–∫–≤—ã
        (df_train['longitude'] > df_test['longitude'].max())    # –¥–æ–ª–≥–æ—Ç–∞ –∑–∞–ø–∞–¥–Ω–µ–µ –ú–æ—Å–∫–≤—ã
    )
]

In [93]:
reviews['text_len'] = reviews['text'].apply(lambda x: len(x))
reviews.head(3)

Unnamed: 0,id,text,text_len
0,43591,"–ú–∞—Ä–º–µ–ª–∞–¥ –≤ —Ü–µ–ª–æ–º –Ω–µ–ø–ª–æ—Ö–æ–π, –Ω–æ —Ü–µ–Ω—ã –∑–∞–≤—ã—à–µ–Ω—ã, –º...",83
1,43591,"–ù–µ –Ω—Ä–∞–≤–∏—Ç—Å—è, —á—Ç–æ —Ç–æ–≤–∞—Ä –≤—ã–ª–æ–∂–µ–Ω –æ—Ç–∫—Ä—ã—Ç–æ, —Å–ª–∏—à–∫–æ...",99
2,43591,"–ß–∞—Å—Ç–æ –ø–æ–ø–∞–¥–∞–µ—Ç—Å—è —Å—É—Ö–æ–π –º–∞—Ä–º–µ–ª–∞–¥, –¥—É–±–æ–≤—ã–π –≤–ø–µ—Ä–µ...",84


In [94]:
reviews_by_id = reviews.groupby('id')['text_len'].agg(['count', 'mean', 'std']).reset_index()\
.rename(columns={'count': 'reviews_count', 'mean': 'reviews_mean', 'std': 'reviews_std'})
reviews_by_id.head(3)

Unnamed: 0,id,reviews_count,reviews_mean,reviews_std
0,1,20,132.7,75.40215
1,2,10,172.7,78.26174
2,3,10,179.7,110.97652


In [95]:
base_names = (
    df_train.columns
    .str.replace(r'_300m$', '', regex=True)
    .str.replace(r'_1000m$', '', regex=True)
)
unique_base_names = base_names.drop_duplicates().tolist()

In [96]:
import pandas as pd
import geopandas as gpd
import osmnx as ox
from shapely.geometry import Point
from sklearn.neighbors import NearestNeighbors
import numpy as np

def add_geo_features(df):
    gdf = gpd.GeoDataFrame(df, geometry=[Point(xy) for xy in zip(df['longitude'], df['latitude'])], crs='EPSG:4326')
    gdf_m = gdf.to_crs('EPSG:32637')  # –≤ –º–µ—Ç—Ä–∞—Ö
    X_points = gdf_m.geometry.apply(lambda p: [p.x, p.y]).tolist()

    center = gpd.GeoDataFrame(geometry=[Point(37.6173, 55.7539)], crs='EPSG:4326').to_crs('EPSG:32637')
    center_point = center.geometry.iloc[0]
    df['dist_to_center'] = gdf_m.distance(center_point)

    transport = pd.concat([
        ox.features_from_place('Moscow, Russia', tags={'aeroway': 'aerodrome'}),
        ox.features_from_place('Moscow, Russia', tags={'railway': 'station'}),
        ox.features_from_place('Moscow, Russia', tags={'amenity': 'ferry_terminal'})
    ], ignore_index=True).to_crs('EPSG:32637')

    transport['geometry'] = transport.geometry.apply(
        lambda g: g.centroid if g.geom_type in ['Polygon', 'MultiPolygon'] else g
    )
    if len(transport) > 0:
        X_trans = transport.geometry.apply(lambda p: [p.x, p.y]).tolist()
        nbrs = NearestNeighbors(n_neighbors=1).fit(X_trans)
        df['dist_to_transport_hub'] = nbrs.kneighbors(X_points)[0].flatten()
    else:
        df['dist_to_transport_hub'] = np.nan

    roads = ox.graph_from_place('Moscow, Russia', network_type='drive', custom_filter='["highway"~"primary|secondary|trunk"]')
    intersections = ox.graph_to_gdfs(roads, nodes=True, edges=False).to_crs('EPSG:32637')
    major_inters = intersections[intersections['street_count'] >= 4]
    if len(major_inters) > 0:
        X_inter = major_inters.geometry.apply(lambda p: [p.x, p.y]).tolist()
        nbrs = NearestNeighbors(n_neighbors=1).fit(X_inter)
        df['dist_to_major_intersection'] = nbrs.kneighbors(X_points)[0].flatten()
    else:
        df['dist_to_major_intersection'] = np.nan

    industrial = ox.features_from_place('Moscow, Russia', tags={'landuse': 'industrial'}).to_crs('EPSG:32637')
    if len(industrial) > 0:
        industrial_union = industrial.unary_union
        df['is_industrial'] = gdf_m.geometry.apply(lambda p: industrial_union.contains(p))
    else:
        df['is_industrial'] = False

    try:
        moscow_boundary = ox.geocode_to_gdf('Moscow, Russia').to_crs('EPSG:32637').geometry.unary_union
        df['is_outside_mkad'] = ~gdf_m.geometry.apply(lambda p: moscow_boundary.contains(p))
    except:
        df['is_outside_mkad'] = df['dist_to_center'] > 15000  # fallback

    try:
        subway = ox.features_from_place('Moscow, Russia', tags={'station': 'subway'}).to_crs('EPSG:32637')
        subway = subway[~subway.geometry.isna()]  # —É–±–∏—Ä–∞–µ–º –ø—É—Å—Ç—ã–µ –≥–µ–æ–º–µ—Ç—Ä–∏–∏
        subway = subway[subway.geometry.type == 'Point']  # —Ç–æ–ª—å–∫–æ —Ç–æ—á–∫–∏
    except:
        subway = gpd.GeoDataFrame(geometry=[], crs='EPSG:32637')

    if len(subway) > 0:
        X_subway = subway.geometry.apply(lambda p: [p.x, p.y]).tolist()
        nbrs_subway = NearestNeighbors(n_neighbors=1).fit(X_subway)
        df['dist_to_subway'] = nbrs_subway.kneighbors(X_points)[0].flatten()

        nbrs_count = NearestNeighbors(radius=500).fit(X_subway)
        counts = nbrs_count.radius_neighbors(X_points, return_distance=False)
        df['count_subway_500m'] = [len(c) for c in counts]
    else:
        df['dist_to_subway'] = np.nan
        df['count_subway_500m'] = 0

    return df

In [97]:
df_train_with_geo = add_geo_features(df_train)
df_test_with_geo = add_geo_features(df_test)

  industrial_union = industrial.unary_union
  moscow_boundary = ox.geocode_to_gdf('Moscow, Russia').to_crs('EPSG:32637').geometry.unary_union
  industrial_union = industrial.unary_union
  moscow_boundary = ox.geocode_to_gdf('Moscow, Russia').to_crs('EPSG:32637').geometry.unary_union


In [98]:
def add_subway_features(df):
    gdf = gpd.GeoDataFrame(df, geometry=[Point(xy) for xy in zip(df['longitude'], df['latitude'])], crs='EPSG:4326')
    gdf_m = gdf.to_crs('EPSG:32637')  # –≤ –º–µ—Ç—Ä–∞—Ö
    X_points = gdf_m.geometry.apply(lambda p: [p.x, p.y]).tolist()

    try:
        subway = ox.features_from_place('Moscow, Russia', tags={'station': 'subway'}).to_crs('EPSG:32637')
        subway = subway[~subway.geometry.isna()]  # —É–±–∏—Ä–∞–µ–º –ø—É—Å—Ç—ã–µ –≥–µ–æ–º–µ—Ç—Ä–∏–∏
        subway = subway[subway.geometry.type == 'Point']  # —Ç–æ–ª—å–∫–æ —Ç–æ—á–∫–∏
    except Exception:
        subway = gpd.GeoDataFrame(geometry=[], crs='EPSG:32637')

    if len(subway) > 0:
        X_subway = subway.geometry.apply(lambda p: [p.x, p.y]).tolist()
        nbrs_subway = NearestNeighbors(n_neighbors=1).fit(X_subway)
        df['dist_to_subway'] = nbrs_subway.kneighbors(X_points)[0].flatten()

        nbrs_500 = NearestNeighbors(radius=500).fit(X_subway)
        counts_500 = nbrs_500.radius_neighbors(X_points, return_distance=False)
        df['count_subway_500m'] = [len(c) for c in counts_500]

        nbrs_1000 = NearestNeighbors(radius=1000).fit(X_subway)
        counts_1000 = nbrs_1000.radius_neighbors(X_points, return_distance=False)
        df['count_subway_1000m'] = [len(c) for c in counts_1000]
    else:
        df['dist_to_subway'] = np.nan
        df['count_subway_500m'] = 0
        df['count_subway_1000m'] = 0

    return df

In [99]:
df_train_geo = add_subway_features(df_train_with_geo)
df_test_geo = add_subway_features(df_test_with_geo)

In [100]:
import osmnx as ox
print("OSMnx version:", ox.__version__)

OSMnx version: 2.0.6


In [101]:
df_train = df_train_geo.copy()
df_test = df_test_geo.copy()

df_train = df_train.merge(reviews_by_id, on='id', how='left')
df_test = df_test.merge(reviews_by_id, on='id', how='left')

df_train.loc[:, reviews_by_id.columns.to_list()] = df_train.loc[:, reviews_by_id.columns.to_list()].fillna(value=0)
df_test.loc[:, reviews_by_id.columns.to_list()] = df_test.loc[:, reviews_by_id.columns.to_list()].fillna(value=0)

df_test.head(3)

Unnamed: 0,id,name,category,address,traffic_300m,homes_300m,works_300m,female_300m,train_ticket_order_300m,mortgage_300m,...,dist_to_transport_hub,dist_to_major_intersection,is_industrial,is_outside_mkad,dist_to_subway,count_subway_500m,count_subway_1000m,reviews_count,reviews_mean,reviews_std
0,21472,–°—á–∞—Å—Ç—å–µ,candy_shop,"–•–æ–¥—ã–Ω—Å–∫–∏–π –±—É–ª., 4, –ú–æ—Å–∫–≤–∞",62672,4709.11052,4298.1253,38987.0,961.0,38.0,...,410.24409,1744.53489,False,False,410.24409,1,1,10.0,150.0,104.41477
1,9837,O'STIN,baby_clothes,"–ù–æ—Å–æ–≤–∏—Ö–∏–Ω—Å–∫–æ–µ —à., 45, –†–µ—É—Ç–æ–≤",110226,12987.98926,15235.25666,96081.0,1346.0,88.0,...,1620.1185,3393.72031,False,True,1620.1185,0,0,10.0,94.7,70.5991
2,41791,–î—Ä–æ–≤–æ—Å–µ–∫,barbershop,"–ë–µ–ª–æ–º–æ—Ä—Å–∫–∞—è —É–ª., 18–ê, –∫–æ—Ä–ø. 2, –ú–æ—Å–∫–≤–∞",81080,9575.24857,9463.3229,57147.0,1506.0,83.0,...,160.1347,748.56641,False,False,160.1347,1,2,0.0,0.0,0.0


## TF-IDF + SVD

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

grouped = reviews.groupby('id')['text'].apply(lambda x: ' '.join(x)).reset_index()
grouped.columns = ['id', 'combined_text']

vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(grouped['combined_text'])

n_components = 100
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_lsa = svd.fit_transform(X_tfidf)

In [103]:
lsa_df = pd.DataFrame(X_lsa, columns=[f'topic_{i+1}' for i in range(n_components)])
lsa_df.head()

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_91,topic_92,topic_93,topic_94,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100
0,0.2764,-0.04239,0.16839,0.00086,-0.04856,-0.19697,0.05968,0.11204,0.11691,-0.08721,...,0.00389,0.00117,-0.0062,-0.00106,-0.00707,-0.0146,0.03193,0.00246,0.01948,-0.01363
1,0.229,-0.04959,-0.04423,-0.08137,-0.06309,0.05374,-0.05059,-0.05519,-0.00918,0.00709,...,0.01174,-0.03462,0.02274,0.03869,0.00526,-0.02217,-0.01364,-0.00295,-0.02868,-0.03318
2,0.19791,0.04981,-0.01125,-0.06053,0.05184,0.01373,0.0227,0.05743,-0.0069,-0.01861,...,0.02743,0.00475,-0.05894,0.01766,0.01773,0.02396,0.05848,1e-05,0.00607,0.01585
3,0.07965,-0.01836,-0.00399,-0.00792,-0.00887,0.00438,0.01345,-0.00034,-0.0151,-0.01397,...,-0.01545,-0.00434,-0.01183,0.00816,0.00668,0.00472,-0.0003,-0.02147,-0.01127,0.0163
4,0.10056,-0.0185,-0.01293,-0.02666,-0.01516,0.01964,0.01642,0.01411,0.02648,-0.02108,...,-0.00773,0.05709,-0.02073,-0.01374,-0.01819,0.0043,0.02837,-0.02793,0.00119,-0.0287


In [104]:
result_df = pd.concat([grouped[['id']], lsa_df], axis=1)
result_df.head()

Unnamed: 0,id,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_91,topic_92,topic_93,topic_94,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100
0,1,0.2764,-0.04239,0.16839,0.00086,-0.04856,-0.19697,0.05968,0.11204,0.11691,...,0.00389,0.00117,-0.0062,-0.00106,-0.00707,-0.0146,0.03193,0.00246,0.01948,-0.01363
1,2,0.229,-0.04959,-0.04423,-0.08137,-0.06309,0.05374,-0.05059,-0.05519,-0.00918,...,0.01174,-0.03462,0.02274,0.03869,0.00526,-0.02217,-0.01364,-0.00295,-0.02868,-0.03318
2,3,0.19791,0.04981,-0.01125,-0.06053,0.05184,0.01373,0.0227,0.05743,-0.0069,...,0.02743,0.00475,-0.05894,0.01766,0.01773,0.02396,0.05848,1e-05,0.00607,0.01585
3,4,0.07965,-0.01836,-0.00399,-0.00792,-0.00887,0.00438,0.01345,-0.00034,-0.0151,...,-0.01545,-0.00434,-0.01183,0.00816,0.00668,0.00472,-0.0003,-0.02147,-0.01127,0.0163
4,6,0.10056,-0.0185,-0.01293,-0.02666,-0.01516,0.01964,0.01642,0.01411,0.02648,...,-0.00773,0.05709,-0.02073,-0.01374,-0.01819,0.0043,0.02837,-0.02793,0.00119,-0.0287


In [105]:
df_train = df_train.merge(result_df, on='id', how='left')
df_test = df_test.merge(result_df, on='id', how='left')

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

df_train.head()

Unnamed: 0,id,name,category,address,target,traffic_300m,homes_300m,works_300m,female_300m,train_ticket_order_300m,...,topic_91,topic_92,topic_93,topic_94,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100
0,1365,"–ì–æ—Ä–æ–¥—Å–∫–∞—è –ø–æ–ª–∏–∫–ª–∏–Ω–∏–∫–∞ ‚Ññ 109, —Ñ–∏–ª–∏–∞–ª ‚Ññ 2",health,"–ì—Ä–∞–π–≤–æ—Ä–æ–Ω–æ–≤—Å–∫–∞—è —É–ª., 18, –∫–æ—Ä–ø. 1, –ú–æ—Å–∫–≤–∞",4.1,75429,16113.58247,15756.24644,51316.0,734.0,...,-0.02682,0.00059,-0.01053,-0.02581,-0.01712,-0.00446,0.01777,-0.00136,-0.01386,-0.02538
1,8230,Wellness Club Nebo,swimming_pool,"–ü—Ä–µ—Å–Ω–µ–Ω—Å–∫–∞—è –Ω–∞–±., 12, –ú–æ—Å–∫–≤–∞",3.6,246535,8578.45874,31315.67279,192547.0,4701.0,...,-0.01483,0.02213,0.03195,0.00969,0.01073,-0.00211,-0.01096,-0.02347,0.01733,-0.00302
2,29071,–ü–µ—Ä–≤—ã–π –ú–ì–ú–£ –∏–º–µ–Ω–∏ –°–µ—á–µ–Ω–æ–≤–∞,higher_education_institutions,"—É–ª. –ü–ª—é—â–∏—Ö–∞, 57, —Å—Ç—Ä. 1, –ú–æ—Å–∫–≤–∞",3.5,83490,12650.4924,12490.09678,56045.0,1204.0,...,-0.01271,0.02311,-0.03478,-0.01119,-0.00999,0.01287,-0.00299,0.0065,0.0072,-0.01751
3,22591,your SPAce,cosmetology,"–ø—Ä–æ—Å–ø. –ú–∞—Ä—à–∞–ª–∞ –ñ—É–∫–æ–≤–∞, 59, –ú–æ—Å–∫–≤–∞",4.0,43421,6413.27922,4963.05805,28705.0,540.0,...,0.00726,0.02097,0.01992,-0.00284,-0.02569,-0.00788,0.0252,0.02798,0.00655,-0.01398
4,27621,–°–æ—é–∑–¶–≤–µ—Ç–¢–æ—Ä–≥,flower_delivery,"–ü—Ä–µ–æ–±—Ä–∞–∂–µ–Ω—Å–∫–∞—è –ø–ª–æ—â–∞–¥—å, 6, –ú–æ—Å–∫–≤–∞",4.2,155094,18638.41013,19868.19057,123492.0,2411.0,...,-0.02368,0.01272,0.03803,-0.00579,-0.00203,0.04122,0.05117,0.02213,-0.00947,0.0341


In [106]:
def city_convertation(df):
    df['city_1'] = df['address'].str.split(',').str[-1].str.strip()
    df['city_2'] = df['address'].str.split(',').str[0].str.strip()
    
    valid_city_patterns = {
        'single_word': r'^[–ê-–Ø–Å][–∞-—è—ë]*$',
        'd_with_word': r'^–¥\.\s+[–ê-–Ø–Å][–∞-—è—ë]*$',
        'poselok': r'^–ø–æ—Å[–µ—ë]–ª–æ–∫',
        'selo': r'^—Å–µ–ª–æ\s+[–ê-–Ø–Å][–∞-—è—ë]*(\s+[–ê-–Ø–Å][–∞-—è—ë]*)*$',  # <-- –¥–æ–±–∞–≤–ª–µ–Ω–æ
    }
    combined_pattern = '|'.join(f"({pattern})" for pattern in valid_city_patterns.values())
    mask = df['city_2'].str.contains(combined_pattern, case=True, regex=True, na=False)
    
    df['city_1'] = df['city_1'].where(~mask, df['city_2'])

    valid_city_patterns = {
        'poselok': r'–ø–æ—Å[–µ—ë]–ª–æ–∫',  # –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç "–ø–æ—Å–µ–ª–æ–∫" –∏ "–ø–æ—Å—ë–ª–æ–∫"
        'selo': r'—Å–µ–ª–æ\s+',
        'd_with_word': r'^–¥\.\s+[–ê-–Ø–Å][–∞-—è—ë]*$',  # –æ—Å—Ç–∞–≤–ª—è–µ–º –∫–∞–∫ –µ—Å—Ç—å
    }

    # –°–æ–∑–¥–∞—ë–º –±—É–ª–µ–≤—ã–µ –∫–æ–ª–æ–Ω–∫–∏
    for col_name, pattern in valid_city_patterns.items():
        df[col_name] = df['city_1'].str.contains(pattern, case=True, regex=True, na=False)
    
    df.drop(columns=['city_2'], inplace=True)
    return df

In [107]:
df_train = city_convertation(df_train)
df_test = city_convertation(df_test)

  mask = df['city_2'].str.contains(combined_pattern, case=True, regex=True, na=False)
  mask = df['city_2'].str.contains(combined_pattern, case=True, regex=True, na=False)


In [108]:
print(len(df_test.city_1.value_counts()))
print(len(df_train.city_1.value_counts()))

258
565


# Model Selection

In [109]:
from sklearn.model_selection import train_test_split

df_train.dropna(axis=0, subset='city_1', inplace=True)

X = df_train.copy()
X_test = df_test.copy()

X_ctb = X.drop(columns=['target', 'id', 'latitude', 'longitude'])
X_test_ctb = X_test.drop(columns=['id', 'latitude', 'longitude'])

In [110]:
valid_cities = X.city_1.value_counts().head(108).index.to_list()
valid_names = X_test.name.value_counts().head(180).index.to_list()

def one_hot_top_cities_and_names(df, city_col, top_cities, name_col, top_names):
    df = df.copy()
    
    for city in top_cities:
        df[f'city_{city}'] = (df[city_col] == city).astype(int)

    for name in top_names:
        df[f'name_{name}'] = (df[name_col] == name).astype(int)
    
    df = df.drop(columns=[city_col, name_col])    
    return df

X_ctb_encoded = one_hot_top_cities_and_names(X_ctb, 'city_1', valid_cities, 'name', valid_names)
X_ctb_test_encoded = one_hot_top_cities_and_names(X_test_ctb, 'city_1', valid_cities, 'name', valid_names)

assert list(X_ctb_encoded.columns) == list(X_ctb_test_encoded.columns), "–ö–æ–ª–æ–Ω–∫–∏ –Ω–µ —Å–æ–≤–ø–∞–¥–∞—é—Ç!"
print(f"‚úÖ –°–æ–∑–¥–∞–Ω–æ {len(valid_cities) + 180} one-hot –∫–æ–ª–æ–Ω–æ–∫.")
print(f"–§–æ—Ä–º–∞ train: {X_ctb_encoded.shape}, test: {X_ctb_test_encoded.shape}")

  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{n

‚úÖ –°–æ–∑–¥–∞–Ω–æ 288 one-hot –∫–æ–ª–æ–Ω–æ–∫.
–§–æ—Ä–º–∞ train: (37084, 683), test: (9276, 683)


  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'city_{city}'] = (df[city_col] == city).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{name}'] = (df[name_col] == name).astype(int)
  df[f'name_{n

In [111]:
X = X.drop(columns=['target', 'id', 'latitude', 'longitude', 'address'])
X_test = X_test.drop(columns=['id', 'latitude', 'longitude', 'address'])

y = df_train['target']

In [112]:
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    for col in X_train.select_dtypes(exclude='object').columns.to_list():
        scaler = StandardScaler()
        X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1)).flatten()
        X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1)).flatten()    
    return X_train, X_test

X_scaled, X_test_scaled = scale_features(X, X_test)

In [113]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=52)

## CatBoost

In [114]:
X_scaled.head()

Unnamed: 0,name,category,traffic_300m,homes_300m,works_300m,female_300m,train_ticket_order_300m,mortgage_300m,recipes_300m,online_shops_300m,...,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100,city_1,poselok,selo,d_with_word
0,"–ì–æ—Ä–æ–¥—Å–∫–∞—è –ø–æ–ª–∏–∫–ª–∏–Ω–∏–∫–∞ ‚Ññ 109, —Ñ–∏–ª–∏–∞–ª ‚Ññ 2",health,-0.31911,0.48871,0.25006,-0.38091,-0.52527,-0.53153,-0.3725,-0.42404,...,-0.84118,-0.2273,0.88185,-0.06534,-0.71824,-1.27557,–ú–æ—Å–∫–≤–∞,-0.16767,-0.05931,-0.11595
1,Wellness Club Nebo,swimming_pool,1.99889,-0.56439,2.17826,1.98362,2.33581,3.62922,2.47606,2.79141,...,0.52927,-0.11072,-0.54978,-1.17735,0.85993,-0.14783,–ú–æ—Å–∫–≤–∞,-0.16767,-0.05931,-0.11595
2,–ü–µ—Ä–≤—ã–π –ú–ì–ú–£ –∏–º–µ–Ω–∏ –°–µ—á–µ–Ω–æ–≤–∞,higher_education_institutions,-0.2099,0.00471,-0.15469,-0.30174,-0.1863,-0.25415,-0.319,-0.32324,...,-0.49044,0.63357,-0.15249,0.32976,0.34732,-0.87846,–ú–æ—Å–∫–≤–∞,-0.16767,-0.05931,-0.11595
3,your SPAce,cosmetology,-0.75273,-0.867,-1.08748,-0.75947,-0.66519,-0.42366,-0.74175,-0.68696,...,-1.26305,-0.39716,1.25243,1.4104,0.3141,-0.70079,–ú–æ—Å–∫–≤–∞,-0.16767,-0.05931,-0.11595
4,–°–æ—é–∑–¶–≤–µ—Ç–¢–æ—Ä–≥,flower_delivery,0.76013,0.84158,0.75964,0.82748,0.68421,0.39308,0.85167,0.93034,...,-0.09871,2.04177,2.54644,1.1162,-0.49604,1.72469,–ú–æ—Å–∫–≤–∞,-0.16767,-0.05931,-0.11595


In [115]:
# from catboost import CatBoostRegressor

# model = CatBoostRegressor(
#     n_estimators=8000,
#     learning_rate=0.022,
#     max_depth=6,
#     eval_metric='MAE',
#     verbose=100,
#     task_type='GPU',
#     random_seed=42,
# )

# model.fit(X_scaled, y, cat_features=[0, 1, 393])

with test

In [116]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, shuffle=True
)

In [117]:
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

In [118]:
model = CatBoostRegressor(
    n_estimators=4000,
    learning_rate=0.022,
    max_depth=6,
    eval_metric='MAE',
    verbose=100,
    task_type='GPU',
    early_stopping_rounds=400,
    random_seed=42,
)

model.fit(
    X_train,
    y_train,
    cat_features=[0, 1, 393],
    eval_set=(X_test, y_test),
    use_best_model=True
)

y_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.4f}")

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3257514	test: 0.3266063	best: 0.3266063 (0)	total: 72.3ms	remaining: 4m 49s
100:	learn: 0.2558123	test: 0.2542312	best: 0.2542312 (100)	total: 5.99s	remaining: 3m 51s
200:	learn: 0.2454491	test: 0.2452362	best: 0.2452362 (200)	total: 11.7s	remaining: 3m 42s
300:	learn: 0.2406604	test: 0.2419197	best: 0.2419197 (300)	total: 17.7s	remaining: 3m 37s
400:	learn: 0.2374548	test: 0.2400147	best: 0.2400147 (400)	total: 23.4s	remaining: 3m 29s
500:	learn: 0.2347632	test: 0.2386927	best: 0.2386846 (499)	total: 29.3s	remaining: 3m 24s
600:	learn: 0.2324054	test: 0.2376820	best: 0.2376820 (600)	total: 35.3s	remaining: 3m 19s
700:	learn: 0.2302786	test: 0.2369822	best: 0.2369811 (699)	total: 41.1s	remaining: 3m 13s
800:	learn: 0.2281876	test: 0.2362856	best: 0.2362856 (800)	total: 47.3s	remaining: 3m 8s
900:	learn: 0.2264414	test: 0.2357359	best: 0.2357359 (900)	total: 53.2s	remaining: 3m 2s
1000:	learn: 0.2247812	test: 0.2352979	best: 0.2352979 (1000)	total: 59.1s	remaining: 2m 57s
11

In [119]:
feat_imp = model.get_feature_importance(prettified=True)
low_imp = feat_imp[feat_imp['Importances'] < 0.1]['Feature Id'].tolist()
len(low_imp)

276

In [120]:
X_reshaped = X_ctb_encoded.drop(columns=low_imp)
X_test_reshaped = X_ctb_test_encoded.drop(columns=low_imp)

In [121]:
X_reshaped.head()

Unnamed: 0,category,address,homes_300m,works_300m,preschool_300m,computer_games_300m,economics_300m,game_consoles_300m,baby_food_300m,premium_class_cars_300m,...,name_–ù–∏–∫–∞,name_–ë–∏–∫–æ,name_–°—á–∞—Å—Ç–ª–∏–≤—ã–π –≤–∑–≥–ª—è–¥,name_–¶–≤–µ—Ç–æ—á–Ω—ã–π —Å–∫–ª–∞–¥,name_–ú–æ–π–∫–∞,name_–ü–æ–¥—Ä—É–∂–∫–∏,name_Familia,name_–ü–µ—Ä–µ–∫—Ä—ë—Å—Ç–æ–∫,name_–ì–∞—Ä–º–æ–Ω–∏—è,name_–†–æ—Å–≥–æ—Å—Å—Ç—Ä–∞—Ö
0,health,"–ì—Ä–∞–π–≤–æ—Ä–æ–Ω–æ–≤—Å–∫–∞—è —É–ª., 18, –∫–æ—Ä–ø. 1, –ú–æ—Å–∫–≤–∞",16113.58247,15756.24644,929.0,64.0,2148.0,37.0,74.0,1436.0,...,0,0,0,0,0,0,0,0,0,0
1,swimming_pool,"–ü—Ä–µ—Å–Ω–µ–Ω—Å–∫–∞—è –Ω–∞–±., 12, –ú–æ—Å–∫–≤–∞",8578.45874,31315.67279,3005.0,198.0,17755.0,387.0,190.0,6436.0,...,0,0,0,0,0,0,0,0,0,0
2,higher_education_institutions,"—É–ª. –ü–ª—é—â–∏—Ö–∞, 57, —Å—Ç—Ä. 1, –ú–æ—Å–∫–≤–∞",12650.4924,12490.09678,922.0,56.0,3229.0,78.0,84.0,1495.0,...,0,0,0,0,0,0,0,0,0,0
3,cosmetology,"–ø—Ä–æ—Å–ø. –ú–∞—Ä—à–∞–ª–∞ –ñ—É–∫–æ–≤–∞, 59, –ú–æ—Å–∫–≤–∞",6413.27922,4963.05805,455.0,17.0,1842.0,31.0,75.0,987.0,...,0,0,0,0,0,0,0,0,0,0
4,flower_delivery,"–ü—Ä–µ–æ–±—Ä–∞–∂–µ–Ω—Å–∫–∞—è –ø–ª–æ—â–∞–¥—å, 6, –ú–æ—Å–∫–≤–∞",18638.41013,19868.19057,1723.0,179.0,6680.0,234.0,130.0,2996.0,...,0,0,0,0,0,0,0,0,0,0


# Prediction

In [122]:
y_test = model.predict(X_test_scaled.fillna('–ú–æ—Å–∫–≤–∞'))

In [None]:
submit = pd.DataFrame({'id': df_test['id'], 'target': y_test})
submit.to_csv('cshka-pobeda-plz-5.csv', index=False)