In [1]:
from fuzzywuzzy import fuzz, process
import numpy as np
import pandas as pd
import tqdm

In [2]:
pd.set_option("display.max_columns", None)
%precision 3

'%.3f'

In [71]:
rsmp = pd.read_csv("rsmp/csv/data.csv")
settlements = pd.read_csv("settlements_12032021/data.csv")
cities_base = pd.read_csv("city.csv")
cities_additional = pd.read_csv("cities_additional.csv")
socr = pd.read_csv("socrbase.csv")

cities = pd.concat((cities_base, cities_additional))
rsmp = rsmp.reset_index(names="id")

In [6]:
socr_full_to_full = socr[["name_full", "name_full"]]
socr_full_to_full.columns = ("name", "name_full")
socr_full_to_full.head(3)

Unnamed: 0,name,name_full
0,Автономный округ,Автономный округ
1,Автономная область,Автономная область
2,Город,Город


In [7]:
socr_without_dot = socr.loc[~socr["name"].str.endswith("."), ["name", "name_full"]]
socr_without_dot["name"] = socr_without_dot["name"] + "."
socr_without_dot.head(3)

Unnamed: 0,name,name_full
0,АО.,Автономный округ
1,Аобл.,Автономная область
2,г.,Город


In [8]:
abbr_to_full = pd.concat(
    (socr[["name", "name_full"]], socr_full_to_full, socr_without_dot)
)
abbr_to_full = abbr_to_full.apply(lambda x: x.str.upper())
abbr_to_full.drop_duplicates("name", inplace=True)
abbr_to_full.head(10)

Unnamed: 0,name,name_full
0,АО,АВТОНОМНЫЙ ОКРУГ
1,АОБЛ,АВТОНОМНАЯ ОБЛАСТЬ
2,Г,ГОРОД
3,КРАЙ,КРАЙ
4,ОБЛ,ОБЛАСТЬ
5,РЕСП,РЕСПУБЛИКА
6,ОКРУГ,ОКРУГ
7,ЧУВАШИЯ,ЧУВАШИЯ
8,А.ОБЛ.,АВТОНОМНАЯ ОБЛАСТЬ
9,А.ОКР.,АВТОНОМНЫЙ ОКРУГ


In [72]:
address_cols = [
    "region_name",
    "region_type",
    "district_name",
    "district_type",
    "city_name",
    "city_type",
    "settlement_name",
    "settlement_type",
]
addresses = rsmp.loc[:, ["id"] + address_cols]
addresses.head(2)

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type
0,0,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,
1,1,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,


In [14]:
addr_count = len(addresses)
addr_count

172095

In [73]:
addresses.isna().sum()

id                      0
region_name            20
region_type            20
district_name      143985
district_type      143985
city_name           57279
city_type           57279
settlement_name    151398
settlement_type    151398
dtype: int64

In [74]:
def join_name_and_type(name, type_):
    if pd.isna(name) or pd.isna(type_):
        return np.nan
    
    prepend_types = (
        "ГОРОД", "РЕСПУБЛИКА", "ПОСЕЛОК", "ПОСЕЛОК ГОРОДСКОГО ТИПА", "РАБОЧИЙ ПОСЕЛОК"
    )
    prepend = type_ in prepend_types
    if prepend:
        return f"{type_} {name}"
    
    return f"{name} {type_}"

In [75]:
def preprocess_text_column(c):
    return c.str.upper().replace("Ё", "Е")

In [76]:
for option in ("region", "district", "city", "settlement"):
    target_col = f"{option}_type"
    addresses = addresses.merge(
        abbr_to_full,
        how="left",
        left_on=target_col,
        right_on="name",
    )
    addresses[target_col] = addresses["name_full"]
    addresses.drop(columns=abbr_to_full.columns, inplace=True)
    assert len(addresses) == addr_count, (
        f"Number of addresses must not change, but for {target_col} "
        f"the size has changed: {addr_count} -> {len(addresses)}"
    )
    
    parts = [f"{option}_name", f"{option}_type"]
    addresses[option] = addresses[parts].apply(
        lambda row: join_name_and_type(row[parts[0]], row[parts[1]]),
        axis=1
    )
addresses.isna().sum()

id                      0
region_name            20
region_type            20
district_name      143985
district_type      143985
city_name           57279
city_type           57279
settlement_name    151398
settlement_type    151403
region                 20
district           143985
city                57279
settlement         151403
dtype: int64

In [77]:
unique_region_names = addresses["region_name"].dropna().unique()
unique_regions_in_cities = cities["region"].dropna().unique()
region_names_to_cities = pd.DataFrame({
    "region_name": unique_region_names,
    "region_cities": [
        process.extractOne(region_name, unique_regions_in_cities)[0].upper()
        for region_name in unique_region_names
    ]
})
region_names_to_cities.head(3)

Unnamed: 0,region_name,region_cities
0,БАШКОРТОСТАН,БАШКОРТОСТАН
1,БУРЯТИЯ,БУРЯТИЯ
2,ДАГЕСТАН,ДАГЕСТАН


In [78]:
unique_regions = addresses["region"].dropna().unique()
unique_regions_in_settlements = settlements["region"].dropna().unique()
regions_to_settlements = pd.DataFrame({
    "region": unique_regions,
    "region_settlements": [
        process.extractOne(
            region_name,
            unique_regions_in_settlements,
            scorer=fuzz.token_set_ratio
        )[0].upper()
        for region_name in unique_regions
    ]
})
regions_to_settlements.head(3)

Unnamed: 0,region,region_settlements
0,РЕСПУБЛИКА БАШКОРТОСТАН,РЕСПУБЛИКА БАШКОРТОСТАН
1,РЕСПУБЛИКА БУРЯТИЯ,РЕСПУБЛИКА БУРЯТИЯ
2,РЕСПУБЛИКА ДАГЕСТАН,РЕСПУБЛИКА ДАГЕСТАН


In [79]:
addresses = addresses.merge(region_names_to_cities, how="left", on="region_name")
addresses = addresses.merge(regions_to_settlements, how="left", on="region")
addresses.iloc[:, 1:] = addresses.iloc[:, 1:].apply(preprocess_text_column)
assert len(addresses) == addr_count
addresses.head(2)

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,region,district,city,settlement,region_cities,region_settlements
0,0,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,РЕСПУБЛИКА БАШКОРТОСТАН,,ГОРОД УФА,,БАШКОРТОСТАН,РЕСПУБЛИКА БАШКОРТОСТАН
1,1,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,РЕСПУБЛИКА БАШКОРТОСТАН,,ГОРОД УФА,,БАШКОРТОСТАН,РЕСПУБЛИКА БАШКОРТОСТАН


In [80]:
std_c = cities.reset_index(names="id")[["id", "region", "area", "city", "settlement"]]
std_c.iloc[:, 1:] = std_c.iloc[:, 1:].apply(preprocess_text_column)
std_c.head(3)

Unnamed: 0,id,region,area,city,settlement
0,0,АДЫГЕЯ,,АДЫГЕЙСК,
1,1,АДЫГЕЯ,,МАЙКОП,
2,2,АЛТАЙ,,ГОРНО-АЛТАЙСК,


In [98]:
std_s = settlements.loc[:, ["id", "region", "municipality", "settlement", "type"]]
std_s["type"] = std_s["type"].str.upper()
std_s = std_s.merge(
    abbr_to_full,
    how="left",
    left_on="type",
    right_on="name"
)
std_s["type"] = std_s["name_full"]
std_s.drop(columns=abbr_to_full.columns, inplace=True)
std_s.iloc[:, 1:] = std_s.iloc[:, 1:].apply(preprocess_text_column)
std_s.head(3)

Unnamed: 0,id,region,municipality,settlement,type
0,0,ОРЛОВСКАЯ ОБЛАСТЬ,БОЛХОВСКИЙ,КОЛОНТАЕВА,ДЕРЕВНЯ
1,1,РЕСПУБЛИКА КРЫМ,АЛУШТА,ПУШКИНО,СЕЛО
2,2,ЛИПЕЦКАЯ ОБЛАСТЬ,ЛЕВ-ТОЛСТОВСКИЙ РАЙОН,БАРЯТИНО,СЕЛО


In [117]:
merge_options = [
    {
        "name": "Settlements by all parts with full district name",
        "addresses": ["region_settlements", "district", "settlement_name", "settlement_type"],
        "standard": ["region", "municipality", "settlement", "type"],
        "type": "settlements"
    },
    {
        "name": "Settlements by all parts with partial district name (no type)",
        "addresses": ["region_settlements", "district_name", "settlement_name", "settlement_type"],
        "standard": ["region", "municipality", "settlement", "type"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts with full city name",
        "addresses": ["region_settlements", "city", "settlement_name", "settlement_type"],
        "standard": ["region", "municipality", "settlement", "type"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts with partial city name (no type)",
        "addresses": ["region_settlements", "city_name", "settlement_name", "settlement_type"],
        "standard": ["region", "municipality", "settlement", "type"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts except for type with full district name",
        "addresses": ["region_settlements", "district", "settlement_name"],
        "standard": ["region", "municipality", "settlement"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts except for type with partial district name",
        "addresses": ["region_settlements", "district_name", "settlement_name"],
        "standard": ["region", "municipality", "settlement"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts except for type with full city name",
        "addresses": ["region_settlements", "city", "settlement_name"],
        "standard": ["region", "municipality", "settlement"],
        "type": "settlements",
    },
    {
        "name": "Settlements by all parts except for type with partial city name",
        "addresses": ["region_settlements", "city_name", "settlement_name"],
        "standard": ["region", "municipality", "settlement"],
        "type": "settlements",
    },
    {
        "name": "Settlements by region and settlement with type",
        "addresses": ["region_settlements", "settlement_name", "settlement_type"],
        "standard": ["region", "settlement", "type"],
        "type": "settlements",
    },
    {
        "name": "Settlements by region and settlement without type",
        "addresses": ["region_settlements", "settlement_name"],
        "standard": ["region", "settlement"],
        "type": "settlements",
    },
    {
        "name": "Cities by all parts",
        "addresses": ["region_cities", "district_name", "city_name", "settlement_name"],
        "standard": ["region", "area", "city", "settlement"],
        "type": "cities",
    },
    {
        "name": "Cities by all parts except for settlements",
        "addresses": ["region_cities", "district_name", "city_name"],
        "standard": ["region", "area", "city"],
        "type": "cities",
    },
    {
        "name": "Cities by region and city",
        "addresses": ["region_cities", "city_name"],
        "standard": ["region", "city"],
        "type": "cities",
    },
    {
        "name": "Cities by region and district-as-city",
        "addresses": ["region_cities", "city_name"],
        "standard": ["region", "area"],
        "type": "cities",
    },    
]

In [118]:
mappings = []
rest = addresses
orig_cols = addresses.columns
for option in merge_options:
    name = option["name"]
    left_cols = option["addresses"]
    right_cols = option["standard"]
    type_ = option["type"]

    to_merge = rest[orig_cols]
    standard = std_c.copy() if type_ == "cities" else std_s.copy()
    standard.drop_duplicates(subset=right_cols, keep=False, inplace=True)
    standard.rename(columns={"id": "geo_id"}, inplace=True)
    #print(to_merge.columns)
    #print(standard.columns)
    #print(standard.shape)
    
    size_before = len(to_merge)
    merged = to_merge.merge(
        standard,
        how="left",
        left_on=left_cols,
        right_on=right_cols,
        suffixes=("", "_x")
    )
    #print(len(merged))
    
    size_after = len(merged)
    assert size_before == size_after
    
    mapped = merged.loc[merged["geo_id"].notna(), ["id", "geo_id"]]
    mapped["type"] = type_[0]
    if len(mapped) > 0:
        mappings.append(mapped)
        
    rest = merged.loc[merged["geo_id"].isna()]
    
    print(f"Option {name}: found {len(mapped)} matches, {len(rest)} records left")

Option Settlements by all parts with full district name: found 7376 matches, 164719 records left
Option Settlements by all parts with partial district name (no type): found 3137 matches, 161582 records left
Option Settlements by all parts with full city name: found 139 matches, 161443 records left
Option Settlements by all parts with partial city name (no type): found 491 matches, 160952 records left
Option Settlements by all parts except for type with full district name: found 1497 matches, 159455 records left
Option Settlements by all parts except for type with partial district name: found 638 matches, 158817 records left
Option Settlements by all parts except for type with full city name: found 191 matches, 158626 records left
Option Settlements by all parts except for type with partial city name: found 198 matches, 158428 records left
Option Settlements by region and settlement with type: found 2566 matches, 155862 records left
Option Settlements by region and settlement without ty

In [119]:
rest

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,region,district,city,settlement,region_cities,region_settlements,geo_id,region_x,area,city_x,settlement_x
0,148,МОСКОВСКАЯ,ОБЛАСТЬ,,,ДМИТРОВ,ГОРОД,,,МОСКОВСКАЯ ОБЛАСТЬ,,ГОРОД ДМИТРОВ,,МОСКОВСКАЯ,МОСКОВСКАЯ ОБЛАСТЬ,,,,,
1,151,МОСКОВСКАЯ,ОБЛАСТЬ,,,ОДИНЦОВО,ГОРОД,,,МОСКОВСКАЯ ОБЛАСТЬ,,ГОРОД ОДИНЦОВО,,МОСКОВСКАЯ,МОСКОВСКАЯ ОБЛАСТЬ,,,,,
2,153,МОСКОВСКАЯ,ОБЛАСТЬ,,,ОРЕХОВО-ЗУЕВО,ГОРОД,,,МОСКОВСКАЯ ОБЛАСТЬ,,ГОРОД ОРЕХОВО-ЗУЕВО,,МОСКОВСКАЯ,МОСКОВСКАЯ ОБЛАСТЬ,,,,,
3,531,ЛУГАНСКАЯ НАРОДНАЯ,РЕСПУБЛИКА,,,ЛУТУГИНО,ГОРОД,,,РЕСПУБЛИКА ЛУГАНСКАЯ НАРОДНАЯ,,ГОРОД ЛУТУГИНО,,КУРГАНСКАЯ,РЕСПУБЛИКА КРЫМ,,,,,
4,532,ЛУГАНСКАЯ НАРОДНАЯ,РЕСПУБЛИКА,ЛУТУГИНСКИЙ,РАЙОН,ЛУТУГИНО,ГОРОД,,,РЕСПУБЛИКА ЛУГАНСКАЯ НАРОДНАЯ,ЛУТУГИНСКИЙ РАЙОН,ГОРОД ЛУТУГИНО,,КУРГАНСКАЯ,РЕСПУБЛИКА КРЫМ,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4111,171852,МОСКОВСКАЯ,ОБЛАСТЬ,ЩЕЛКОВСКИЙ,РАЙОН,ЩЕЛКОВО,ГОРОД,,,МОСКОВСКАЯ ОБЛАСТЬ,ЩЕЛКОВСКИЙ РАЙОН,ГОРОД ЩЕЛКОВО,,МОСКОВСКАЯ,МОСКОВСКАЯ ОБЛАСТЬ,,,,,
4112,171853,МОСКОВСКАЯ,ОБЛАСТЬ,,,ШАХОВСКАЯ,РАБОЧИЙ ПОСЕЛОК,,,МОСКОВСКАЯ ОБЛАСТЬ,,РАБОЧИЙ ПОСЕЛОК ШАХОВСКАЯ,,МОСКОВСКАЯ,МОСКОВСКАЯ ОБЛАСТЬ,,,,,
4113,171881,НОВОСИБИРСКАЯ,ОБЛАСТЬ,ИСКИТИМСКИЙ,РАЙОН,,,,,НОВОСИБИРСКАЯ ОБЛАСТЬ,ИСКИТИМСКИЙ РАЙОН,,,НОВОСИБИРСКАЯ,НОВОСИБИРСКАЯ ОБЛАСТЬ,,,,,
4114,171952,САРАТОВСКАЯ,ОБЛАСТЬ,ДЕРГАЧЕВСКИЙ,РАЙОН,,,ВОСТОЧНЫЙ,ПОСЕЛОК,САРАТОВСКАЯ ОБЛАСТЬ,ДЕРГАЧЕВСКИЙ РАЙОН,,ПОСЕЛОК ВОСТОЧНЫЙ,САРАТОВСКАЯ,САРАТОВСКАЯ ОБЛАСТЬ,,,,,


In [None]:
mappings = []
rest = addresses
orig_cols = addresses.columns
for option in merge_options:
    name = option["name"]
    left_cols = option["addresses"]
    right_cols = option["standard"]
    type_ = option["type"]

    to_merge = rest[orig_cols]
    standard = std_c.copy() if type_ == "cities" else std_s.copy()
    standard.drop_duplicates(subset=right_cols, keep=False, inplace=True)
    standard.rename(columns={"id": "geo_id"}, inplace=True)
    #print(to_merge.columns)
    #print(standard.columns)
    #print(standard.shape)
    
    size_before = len(to_merge)
    merged = to_merge.merge(
        standard,
        how="left",
        left_on=left_cols,
        right_on=right_cols,
        suffixes=("", "_x")
    )
    #print(len(merged))
    
    size_after = len(merged)
    assert size_before == size_after
    
    mapped = merged.loc[merged["geo_id"].notna(), ["id", "geo_id"]]
    mapped["type"] = type_[0]
    if len(mapped) > 0:
        mappings.append(mapped)
        
    rest = merged.loc[merged["geo_id"].isna()]
    
    print(f"Option {name}: found {len(mapped)} matches, {len(rest)} records left")

In [87]:
mappings

[            id  id_settlements
 20          20         70137.0
 21          21        142764.0
 41          41          9048.0
 63          63        109043.0
 64          64         65608.0
 ...        ...             ...
 172020  172020         30374.0
 172025  172025        131816.0
 172026  172026        131816.0
 172086  172086         99905.0
 172089  172089         99905.0
 
 [7376 rows x 2 columns],
             id  id_settlements
 8            8         39110.0
 441        441        145456.0
 445        445         38655.0
 454        454         19266.0
 496        496         29955.0
 ...        ...             ...
 171825  171825        118429.0
 171921  171921        137887.0
 171923  171923         68188.0
 172001  172001           366.0
 172031  172031         79542.0
 
 [3137 rows x 2 columns]]

In [287]:
join_cols = ["region_for_join", "district_for_join", "city_for_join", "settlement_for_join"]
selected_cols = ["region_name_cities", "district_name", "city_name", "settlement_name"]
rsmp[join_cols] = rsmp[selected_cols].apply(lambda x: x.str.replace("Ё", "Е").str.upper())
rsmp.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,


In [288]:
selected_cols = ["region", "area", "city", "settlement"]
cities[join_cols] = cities[selected_cols].apply(lambda x: x.str.upper().str.replace("Ё", "Е"))
cities["id"] = range(0, cities.shape[0])
cities_for_join = cities[["id"] + join_cols]
cities_for_join.head(2)

Unnamed: 0,id,region_for_join,district_for_join,city_for_join,settlement_for_join
0,0,АДЫГЕЯ,,АДЫГЕЙСК,
1,1,АДЫГЕЯ,,МАЙКОП,


In [289]:
cities_for_join.shape

(1129, 5)

In [290]:
merged_1 = rsmp.merge(
    cities_for_join,
    how="left",
    on=join_cols
)
merged_1.rename(columns={"id": "id_c1"}, inplace=True)
assert merged_1.shape[0] == rsmp_nrows

In [291]:
merged_1.shape

(172095, 36)

In [292]:
merged_1.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0


In [293]:
merged_1["id_c1"].notna().sum() / merged_1.shape[0]

0.852

In [294]:
cities_for_join = cities_for_join.loc[cities_for_join["settlement_for_join"].isna()]
assert cities_for_join.shape == cities_for_join.drop_duplicates().shape
cities_for_join.drop(columns="settlement_for_join", inplace=True)
cities_for_join.shape

(1127, 4)

In [295]:
merged_2 = merged_1.merge(
    cities_for_join,
    how="left",
    on=("region_for_join", "district_for_join", "city_for_join"),
)
merged_2.rename(columns={"id": "id_c2"}, inplace=True)
assert merged_2.shape[0] == rsmp_nrows
merged_2.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0


In [296]:
merged_2["id_c2"].notna().sum() / merged_2.shape[0]

0.879

In [297]:
merged_2["id_c2"].notna().sum() / merged_2.shape[0]

0.879

In [298]:
cities_for_join = cities.loc[(cities["area_type"] == "г") & cities["city"].isna(), ["id", "region_for_join", "district_for_join"]]
cities_for_join = cities_for_join.drop_duplicates(subset=("region_for_join", "district_for_join"))
cities_for_join.shape                                              

(10, 3)

In [299]:
merged_3 = merged_2.merge(
    cities_for_join,
    how="left",
    on=("region_for_join", "district_for_join"),
)
merged_3.rename(columns={"id": "id_c3"}, inplace=True)
assert merged_3.shape[0] == rsmp_nrows
merged_3.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,


In [300]:
merged_3["id_c3"].notna().sum() / merged_3.shape[0]

0.007

In [301]:
cities_for_join.rename(columns={"district_for_join": "city_for_join"}, inplace=True)

In [302]:
merged_4 = merged_3.merge(
    cities_for_join,
    how="left",
    on=("region_for_join", "city_for_join"),
)
merged_4.rename(columns={"id": "id_c4"}, inplace=True)
assert merged_4.shape[0] == rsmp_nrows
merged_4.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,,


In [303]:
merged_4["id_c4"].notna().sum() / merged_4.shape[0]

0.011

In [304]:
cities_for_join = cities.loc[(cities["area_type"] != "г") & cities["city"].notna() & cities["settlement"].isna(), ["id", "region_for_join", "city_for_join"]]
cities_for_join.drop_duplicates(["region_for_join", "city_for_join"], keep=False, inplace=True)
cities_for_join.head(2)

Unnamed: 0,id,region_for_join,city_for_join
0,0,АДЫГЕЯ,АДЫГЕЙСК
1,1,АДЫГЕЯ,МАЙКОП


In [305]:
merged_4_1 = merged_4.merge(
    cities_for_join,
    how="left",
    on=("region_for_join", "city_for_join"),
)
merged_4_1.rename(columns={"id": "id_c4_1"}, inplace=True)
assert merged_4_1.shape[0] == rsmp_nrows
merged_4_1.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0


In [306]:
merged_4 = merged_4_1

In [307]:
join_cols = ["region_for_join", "municipality_for_join", "settlement_for_join", "settlement_type_for_join"]
selected_cols = ["region_name_settlements", "district_name", "settlement_name", "settlement_type_norm"]
merged_4[join_cols] = merged_4[selected_cols].apply(lambda x: x.str.replace("Ё", "Е").str.upper())
assert merged_4.shape[0] == rsmp_nrows
merged_4.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,


In [308]:
socr_abbr_to_full = socr[["name", "name_full"]].drop_duplicates("name").rename(columns={"name": "type", "name_full": "settlement_type"})
socr_abbr_to_full.head(3)

Unnamed: 0,type,settlement_type
0,АО,Автономный округ
1,Аобл,Автономная область
2,г,Город


In [309]:
settlements_for_join = settlements[["id", "region", "municipality", "settlement", "type"]]
settlements_for_join_with_types = settlements_for_join.merge(socr_abbr_to_full, how="left", on="type")
assert settlements_for_join_with_types.shape[0] == settlements_for_join.shape[0]
settlements_for_join_with_types.head(2)

Unnamed: 0,id,region,municipality,settlement,type,settlement_type
0,0,Орловская область,Болховский,Колонтаева,д,Деревня
1,1,Республика Крым,Алушта,Пушкино,с,Село


In [310]:
selected_cols = ["region", "municipality", "settlement", "settlement_type"]
settlements_for_join_with_types[join_cols] = settlements_for_join_with_types[selected_cols].apply(lambda x: x.str.upper().str.replace("Ё", "Е"))
settlements_for_join_with_types.drop(columns=selected_cols + ["type"], inplace=True)
settlements_for_join_with_types.drop_duplicates(subset=join_cols, keep=False, inplace=True)
settlements_for_join_with_types.head(3)

Unnamed: 0,id,region_for_join,municipality_for_join,settlement_for_join,settlement_type_for_join
0,0,ОРЛОВСКАЯ ОБЛАСТЬ,БОЛХОВСКИЙ,КОЛОНТАЕВА,ДЕРЕВНЯ
1,1,РЕСПУБЛИКА КРЫМ,АЛУШТА,ПУШКИНО,СЕЛО
2,2,ЛИПЕЦКАЯ ОБЛАСТЬ,ЛЕВ-ТОЛСТОВСКИЙ РАЙОН,БАРЯТИНО,СЕЛО


In [311]:
settlements_for_join_with_types.shape

(150763, 5)

In [312]:
merged_5 = merged_4.merge(
    settlements_for_join_with_types,
    how="left",
    on=("region_for_join", "municipality_for_join", "settlement_for_join", "settlement_type_for_join"),
)
merged_5.rename(columns={"id": "id_s1"}, inplace=True)
assert merged_5.shape[0] == rsmp_nrows
merged_5.head(2) 

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join,id_s1
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,


In [313]:
settlements_for_join = settlements_for_join_with_types.drop(columns="municipality_for_join")
settlements_for_join.drop_duplicates(subset=("region_for_join", "settlement_for_join", "settlement_type_for_join"), keep=False, inplace=True)
settlements_for_join.shape

(116245, 4)

In [314]:
merged_6 = merged_5.merge(
    settlements_for_join,
    how="left",
    on=("region_for_join", "settlement_for_join", "settlement_type_for_join"),
)
merged_6.rename(columns={"id": "id_s2"}, inplace=True)
assert merged_6.shape[0] == rsmp_nrows
merged_6.head(2) 

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join,id_s1,id_s2
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,


In [315]:
settlements_for_join = settlements_for_join.drop(columns="settlement_type_for_join")
settlements_for_join.drop_duplicates(subset=("region_for_join", "settlement_for_join"), keep=False, inplace=True)
settlements_for_join.shape

(109739, 3)

In [316]:
merged_7 = merged_6.merge(
    settlements_for_join,
    how="left",
    on=("region_for_join", "settlement_for_join"),
)
merged_7.rename(columns={"id": "id_s3"}, inplace=True)
assert merged_7.shape[0] == rsmp_nrows
merged_7.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join,id_s1,id_s2,id_s3
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,


In [317]:
merged_7["municipality_for_join"] = merged_7["district_name"] + " " + merged_7["district_type_norm"].str.upper()
merged_7.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join,id_s1,id_s2,id_s3
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,


In [318]:
settlements_for_join = settlements_for_join_with_types.drop(columns=("settlement_type_for_join"))
settlements_for_join.drop_duplicates(subset=("region_for_join", "municipality_for_join", "settlement_for_join"), keep=False, inplace=True)
settlements_for_join.head(2)

Unnamed: 0,id,region_for_join,municipality_for_join,settlement_for_join
0,0,ОРЛОВСКАЯ ОБЛАСТЬ,БОЛХОВСКИЙ,КОЛОНТАЕВА
1,1,РЕСПУБЛИКА КРЫМ,АЛУШТА,ПУШКИНО


In [319]:
merged_8 = merged_7.merge(
    settlements_for_join,
    how="left",
    on=("region_for_join",  "municipality_for_join", "settlement_for_join"),
)
merged_8.rename(columns={"id": "id_s4"}, inplace=True)
assert merged_8.shape[0] == rsmp_nrows
merged_8.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,region_for_join,district_for_join,city_for_join,settlement_for_join,id_c1,id_c2,id_c3,id_c4,id_c4_1,municipality_for_join,settlement_type_for_join,id_s1,id_s2,id_s3,id_s4
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,РЕСПУБЛИКА БАШКОРТОСТАН,,УФА,,62.0,62.0,,,62.0,,,,,,


In [320]:
merged_8.notna().sum()

index                       172095
kind                        172095
category                    172095
ind_tin                      92966
ind_number                   16877
first_name                   92965
last_name                    92966
patronymic                   92511
org_name                     79129
org_short_name               78924
org_tin                      79129
org_number                    9939
region_code                 172095
region_name                 172075
region_type                 172075
district_name                28110
district_type                28110
city_name                   114816
city_type                   114816
settlement_name              20697
settlement_type              20697
activity_code_main          172095
start_date                  172095
end_date                     31906
region_type_norm            172075
district_type_norm           28110
city_type_norm              114816
settlement_type_norm         20692
region_name_cities  

In [323]:
merged = merged_8.drop(columns=[c for c in merged_8.columns if "_for_join" in c])
merged.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,id_c1,id_c2,id_c3,id_c4,id_c4_1,id_s1,id_s2,id_s3,id_s4
0,0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,62.0,62.0,,,62.0,,,,
1,1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,Республика,,Город,,Башкортостан,БАШКОРТОСТАН Республика,Республика Башкортостан,62.0,62.0,,,62.0,,,,


In [328]:
merged[[c for c in merged.columns if "id_" in c]].notna().sum(axis=1).value_counts()

3    117172
2     44157
1      6187
0      2958
4      1603
5        18
Name: count, dtype: int64

In [332]:
merged[[c for c in merged.columns if "id_" in c]].apply(lambda x: len(x.dropna().unique()), axis=1).value_counts()

1    165502
2      3618
0      2958
3        17
Name: count, dtype: int64

In [329]:
2958 / rsmp_nrows

0.017

In [344]:
def get_geo_id(row):
    for option in ("s1", "s2", "s3", "s4", "c1", "c2", "c4_1", "c3", "c4"):
        val = row[f"id_{option}"]
        if pd.isna(val):
            continue
        if option.startswith("c"):
            return pd.Series(dict(city_id=val, settlement_id=np.nan))
        else:
            return pd.Series(dict(city_id=np.nan, settlement_id=val))

In [345]:
merged.loc[:, ["city_id", "settlement_id"]] = merged.apply(get_geo_id, axis=1)

In [346]:
merged.notna().sum()

index                      172095
kind                       172095
category                   172095
ind_tin                     92966
ind_number                  16877
first_name                  92965
last_name                   92966
patronymic                  92511
org_name                    79129
org_short_name              78924
org_tin                     79129
org_number                   9939
region_code                172095
region_name                172075
region_type                172075
district_name               28110
district_type               28110
city_name                  114816
city_type                  114816
settlement_name             20697
settlement_type             20697
activity_code_main         172095
start_date                 172095
end_date                    31906
region_type_norm           172075
district_type_norm          28110
city_type_norm             114816
settlement_type_norm        20692
region_name_cities         172075
region_name_wi

In [356]:
merged.loc[merged["city_name"].notna() & merged["city_id"].isna()]

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_type_norm,district_type_norm,city_type_norm,settlement_type_norm,region_name_cities,region_name_with_type,region_name_settlements,id_c1,id_c2,id_c3,id_c4,id_c4_1,id_s1,id_s2,id_s3,id_s4,city_id,settlement_id
86,86,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АЛЬЯ...","ООО ""АЛЬЯНСПРОФКОНСАЛТ""",2.627027e+09,,26,СТАВРОПОЛЬСКИЙ,КРАЙ,,,ЖЕЛЕЗНОВОДСК,ГОРОД,ИНОЗЕМЦЕВО,ПОСЕЛОК,69.1,2020-02-10,,Край,,Город,Поселок,Ставропольский,СТАВРОПОЛЬСКИЙ Край,Ставропольский край,,892.0,,,892.0,,,133452.0,,,133452.0
149,149,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЮРИД...","ООО ""ЮФ ""АКЦЕПТ""",5.024199e+09,,50,МОСКОВСКАЯ,ОБЛАСТЬ,,,КРАСНОГОРСК,ГОРОД,ПУТИЛКОВО,ДЕРЕВНЯ,69.1,2019-10-10,,Область,,Город,Деревня,Московская,МОСКОВСКАЯ Область,Московская область,,539.0,,,539.0,,103159.0,103159.0,,,103159.0
375,375,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ТВОЁ ПРАВО""",9.105020e+09,,91,КРЫМ,РЕСП.,,,ЯЛТА,Г.,СИМЕИЗ,ПГТ.,69.1,2021-06-10,,Республика,,Город,Поселок городского типа,Крым,КРЫМ Республика,Республика Крым,,435.0,,,435.0,,99266.0,99266.0,,,99266.0
531,531,2,1,3.459193e+11,3.229401e+14,ОЛЬГА,МЕДВЕДЕВА,ВЛАДИМИРОВНА,,,,,94,ЛУГАНСКАЯ НАРОДНАЯ,РЕСП.,,,ЛУТУГИНО,Г.,,,69.1,2023-01-10,2023-02-10,Республика,,Город,,Курганская,ЛУГАНСКАЯ НАРОДНАЯ Республика,Республика Крым,,,,,,,,,,,
532,532,2,1,3.459193e+11,3.229401e+14,ОЛЬГА,МЕДВЕДЕВА,ВЛАДИМИРОВНА,,,,,94,ЛУГАНСКАЯ НАРОДНАЯ,РЕСП.,ЛУТУГИНСКИЙ,Р-Н,ЛУТУГИНО,Г.,,,69.1,2023-02-10,,Республика,Район,Город,,Курганская,ЛУГАНСКАЯ НАРОДНАЯ Республика,Республика Крым,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171834,171834,2,1,5.012150e+11,,АНДРЕЙ,ФЕДОРЕНКО,АЛЬБЕРТОВИЧ,,,,,50,МОСКОВСКАЯ,ОБЛАСТЬ,,,БАЛАШИХА,ГОРОД,ЖЕЛЕЗНОДОРОЖНЫЙ,МИКРОРАЙОН,69.1,2016-11-10,2018-06-10,Область,,Город,Микрорайон,Московская,МОСКОВСКАЯ Область,Московская область,,511.0,,,511.0,,,9237.0,,,9237.0
171853,171853,2,1,5.079012e+11,,ДЕНИС,КУНАЕВ,СЕРГЕЕВИЧ,,,,,50,МОСКОВСКАЯ,ОБЛАСТЬ,,,ШАХОВСКАЯ,РАБОЧИЙ ПОСЕЛОК,,,69.1,2017-05-10,2020-03-10,Область,,Рабочий поселок,,Московская,МОСКОВСКАЯ Область,Московская область,,,,,,,,,,,
171859,171859,2,1,5.225557e+11,,ЕКАТЕРИНА,БЕШЛЯГА,АЛЕКСАНДРОВНА,,,,,52,НИЖЕГОРОДСКАЯ,ОБЛАСТЬ,,,ДЗЕРЖИНСК,ГОРОД,ПЫРА,ПОСЕЛОК,69.1,2019-06-10,,Область,,Город,Поселок,Нижегородская,НИЖЕГОРОДСКАЯ Область,Нижегородская область,,611.0,,,611.0,,,93319.0,,,93319.0
172046,172046,2,1,7.727930e+11,,НАТАЛЬЯ,ЦЫБРЕНКОВА,ПЕТРОВНА,,,,,50,МОСКОВСКАЯ,ОБЛАСТЬ,НАРО-ФОМИНСКИЙ,РАЙОН,АПРЕЛЕВКА,ГОРОД,,,69.1,2016-08-10,,Область,Район,Город,,Московская,МОСКОВСКАЯ Область,Московская область,,,,,,,,,,,


In [361]:
merged.loc[~merged["region_name"].isin(("ДОНЕЦКАЯ НАРОДНАЯ", "ЛУГАНСКАЯ НАРОДНАЯ", "ХЕРСОНСКАЯ", "ЗАПОРОЖСКАЯ"))].notna().sum()

index                      171921
kind                       171921
category                   171921
ind_tin                     92806
ind_number                  16717
first_name                  92805
last_name                   92806
patronymic                  92351
org_name                    79115
org_short_name              78910
org_tin                     79115
org_number                   9925
region_code                171921
region_name                171901
region_type                171901
district_name               27985
district_type               27985
city_name                  114753
city_type                  114753
settlement_name             20687
settlement_type             20687
activity_code_main         171921
start_date                 171921
end_date                    31901
region_type_norm           171901
district_type_norm          27985
city_type_norm             114753
settlement_type_norm        20682
region_name_cities         171901
region_name_wi

In [362]:
(151610+17526)/rsmp_nrows

0.983

In [94]:
def remove_type_from_region_name(region_name):
    types = ["область", "Республика", "республика", "край", "автономный округ", "автономная область"]
    for t in types:
        region_name = region_name.replace(t, "")
        
    return region_name.strip()

remove_type_from_region_name("Республика Крым")

'Крым'

In [51]:
reg_to_reg_settl = pd.DataFrame.from_records(
    [
        (x, process.extractOne(x, settlements["region"].unique(), scorer=fuzz.token_set_ratio)[0].upper())
        for x in (merged_1["region_name"] + " " + merged_1["name_full"]).dropna().unique()
    ]
)

In [52]:
reg_to_reg_settl

Unnamed: 0,0,1
0,БАШКОРТОСТАН РЕСПУБЛИКА,РЕСПУБЛИКА БАШКОРТОСТАН
1,БУРЯТИЯ РЕСПУБЛИКА,РЕСПУБЛИКА БУРЯТИЯ
2,ДАГЕСТАН РЕСПУБЛИКА,РЕСПУБЛИКА ДАГЕСТАН
3,КАРЕЛИЯ РЕСПУБЛИКА,РЕСПУБЛИКА КАРЕЛИЯ
4,МАРИЙ ЭЛ РЕСПУБЛИКА,РЕСПУБЛИКА МАРИЙ ЭЛ
...,...,...
88,ХЕРСОНСКАЯ ОБЛАСТЬ,РОСТОВСКАЯ ОБЛАСТЬ
89,ЧУКОТСКИЙ АВТОНОМНЫЙ ОКРУГ,ЧУКОТСКИЙ АВТОНОМНЫЙ ОКРУГ
90,ХАНТЫ-МАНСИЙСКИЙ АВТОНОМНЫЙ ОКРУГ,ХАНТЫ-МАНСИЙСКИЙ АВТОНОМНЫЙ ОКРУГ - ЮГРА
91,ТАТАРСТАН (ТАТАРСТАН) РЕСПУБЛИКА,РЕСПУБЛИКА ТАТАРСТАН


In [58]:
reg_to_reg_settl.columns = ("region_orig", "region_for_join")
merged_1["region_orig"] = merged_1["region_name"] + " " + merged_1["name_full"]
merged_1 = merged_1.merge(reg_to_reg_settl, how="left", on="region_orig")

In [60]:
merged_1.head(3)

Unnamed: 0,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region_for_join_x,district_for_join,city_for_join,settlement_for_join,address,postal_code,country,federal_district,region_type_cities,region,area_type,area,city_type_cities,city,settlement_type_cities,settlement,kladr_id,fias_id,fias_level,capital_marker,okato,oktmo,tax_office,timezone,geo_lat,geo_lon,population,foundation_year,name,name_full,region_orig,region_for_join_y
0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,,БАШКОРТОСТАН,,УФА,,г Уфа,450000.0,Россия,Приволжский,Респ,Башкортостан,,,г,Уфа,,,200000100000.0,7339e834-2cb4-4734-a4c7-1fca2c66e562,4.0,2.0,80401000000.0,80701000000.0,200.0,UTC+5,54.734944,55.957847,1062300.0,1574.0,РЕСПУБЛИКА,РЕСПУБЛИКА,БАШКОРТОСТАН РЕСПУБЛИКА,РЕСПУБЛИКА БАШКОРТОСТАН
1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10,БАШКОРТОСТАН,,УФА,,г Уфа,450000.0,Россия,Приволжский,Респ,Башкортостан,,,г,Уфа,,,200000100000.0,7339e834-2cb4-4734-a4c7-1fca2c66e562,4.0,2.0,80401000000.0,80701000000.0,200.0,UTC+5,54.734944,55.957847,1062300.0,1574.0,РЕСПУБЛИКА,РЕСПУБЛИКА,БАШКОРТОСТАН РЕСПУБЛИКА,РЕСПУБЛИКА БАШКОРТОСТАН
2,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295.0,1080276000000.0,2,БАШКОРТОСТАН,РЕСП.,,,УФА,Г.,,,69.1,2022-03-10,,БАШКОРТОСТАН,,УФА,,г Уфа,450000.0,Россия,Приволжский,Респ,Башкортостан,,,г,Уфа,,,200000100000.0,7339e834-2cb4-4734-a4c7-1fca2c66e562,4.0,2.0,80401000000.0,80701000000.0,200.0,UTC+5,54.734944,55.957847,1062300.0,1574.0,РЕСП.,РЕСПУБЛИКА,БАШКОРТОСТАН РЕСПУБЛИКА,РЕСПУБЛИКА БАШКОРТОСТАН


In [61]:
merged_1["region_for_join"] = merged_1["region_for_join_y"]

In [55]:
#settlements["region_for_join"] = settlements["region"].apply(remove_type_from_region_name).str.upper()
settlements["region_for_join"] = settlements["region"].apply(lambda x: x.upper())

settlements["settlement_for_join"] = settlements["settlement"].str.upper()
settlements = settlements.loc[~settlements.duplicated(subset=["region_for_join", "settlement_for_join"], keep=False)]
settlements

Unnamed: 0,id,region,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat,region_for_join,settlement_for_join
0,0,Орловская область,Болховский,Колонтаева,д,0,0,53.22.07,035.54.36,53.368611,35.910000,5.460442e+10,0,1,ОРЛОВСКАЯ ОБЛАСТЬ,КОЛОНТАЕВА
3,3,Тверская область,Селижаровский район,Хилово,д,2,0,56.54.20,033.25.09,56.905556,33.419167,2.865043e+10,0,1,ТВЕРСКАЯ ОБЛАСТЬ,ХИЛОВО
4,4,Томская область,Парабельский район,Басмасово,д,6,0,58.38.12,082.02.40,58.636667,82.044444,6.964444e+10,0,1,ТОМСКАЯ ОБЛАСТЬ,БАСМАСОВО
5,5,Республика Алтай,Шебалинский район,Каспа,с,375,118,51.06.57,086.00.46,51.115833,86.012778,8.465046e+10,0,1,РЕСПУБЛИКА АЛТАЙ,КАСПА
6,6,Нижегородская область,Воскресенский район,Апариха,д,0,0,56.53.40,045.19.08,56.894444,45.318889,2.262240e+10,0,1,НИЖЕГОРОДСКАЯ ОБЛАСТЬ,АПАРИХА
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155915,155915,Тюменская область,Тобольский район,Подъемка,с,21,0,57.58.44,067.58.38,57.978889,67.977222,7.164244e+10,0,1,ТЮМЕНСКАЯ ОБЛАСТЬ,ПОДЪЕМКА
155916,155916,Ульяновская область,Вешкаймский район,Ребровка,д,0,0,53.59.12,046.48.35,53.986667,46.809722,7.360744e+10,0,1,УЛЬЯНОВСКАЯ ОБЛАСТЬ,РЕБРОВКА
155917,155917,Псковская область,Палкинский район,Шадрица,д,1,0,57.29.03,028.08.48,57.484167,28.146667,5.863742e+10,0,1,ПСКОВСКАЯ ОБЛАСТЬ,ШАДРИЦА
155919,155919,Тульская область,рабочий пос. Новогуровский,Новогуровский,рп,3398,305,54.28.03,037.20.19,54.467500,37.338611,7.070200e+10,1,0,ТУЛЬСКАЯ ОБЛАСТЬ,НОВОГУРОВСКИЙ


In [62]:
merged_2 = merged_1.merge(
    settlements,
    how="left",
    on=["region_for_join", "settlement_for_join"],
    suffixes=("", "_settlements")
)
merged_2.shape

(172095, 70)

In [63]:
merged_2.columns

Index(['kind', 'category', 'ind_tin', 'ind_number', 'first_name', 'last_name',
       'patronymic', 'org_name', 'org_short_name', 'org_tin', 'org_number',
       'region_code', 'region_name', 'region_type', 'district_name',
       'district_type', 'city_name', 'city_type', 'settlement_name',
       'settlement_type', 'activity_code_main', 'start_date', 'end_date',
       'region_for_join_x', 'district_for_join', 'city_for_join',
       'settlement_for_join', 'address', 'postal_code', 'country',
       'federal_district', 'region_type_cities', 'region', 'area_type', 'area',
       'city_type_cities', 'city', 'settlement_type_cities', 'settlement',
       'kladr_id', 'fias_id', 'fias_level', 'capital_marker', 'okato', 'oktmo',
       'tax_office', 'timezone', 'geo_lat', 'geo_lon', 'population',
       'foundation_year', 'name', 'name_full', 'region_orig',
       'region_for_join_y', 'region_for_join', 'id', 'region_settlements',
       'municipality', 'settlement_settlements', 'type',
  

In [64]:
cols_to_drop = [
    "region_for_join", "district_for_join", "city_for_join", "settlement_for_join",
    "address", "postal_code", "country", "federal_district", "kladr_id", "fias_id",
    "fias_level", "capital_marker", "okato", "tax_office", "timezone", "population",
    "foundation_year", "id", "population_settlements", "children", "dadata", "rosstat"
]
merged_2.drop(columns=cols_to_drop, inplace=True)

In [65]:
(merged_2[["oktmo", "oktmo_settlements"]].notna().sum() / merged_2.shape[0]).sum()

0.9321246985676516

In [66]:
matched = merged_2.loc[merged_2["oktmo"].notna() | merged_2["oktmo_settlements"].notna()]
rest = merged_2.loc[merged_2["oktmo"].isna() & merged_2["oktmo_settlements"].isna()]
matched.shape, rest.shape

((160397, 48), (11698, 48))

In [67]:
rest = rest.loc[:, "kind":"end_date"]
rest.head(2)

Unnamed: 0,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date
63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,Р-Н,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,
76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10


In [68]:
rest["region"] = rest["region_name"].apply(g._normalize_region).str.upper()
rest.shape

(11698, 24)

In [69]:
cities["id"] = range(0, cities.shape[0])
cities.head(2)

Unnamed: 0,address,postal_code,country,federal_district,region_type,region,area_type,area,city_type,city,settlement_type,settlement,kladr_id,fias_id,fias_level,capital_marker,okato,oktmo,tax_office,timezone,geo_lat,geo_lon,population,foundation_year,region_for_join,district_for_join,city_for_join,settlement_for_join,id
0,"Респ Адыгея, г Адыгейск",385200.0,Россия,Южный,Респ,Адыгея,,,г,Адыгейск,,,100000200000.0,ccdfd496-8108-4655-aadd-bd228747306d,4,0,79403000000.0,79703000000.0,107,UTC+3,44.878414,39.190289,12689,1969.0,АДЫГЕЯ,,АДЫГЕЙСК,,0
1,г Майкоп,385000.0,Россия,Южный,Респ,Адыгея,,,г,Майкоп,,,100000100000.0,8cfbe842-e803-49ca-9347-1ef90481dd98,4,2,79401000000.0,79701000000.0,105,UTC+3,44.609827,40.100661,144055,1857.0,АДЫГЕЯ,,МАЙКОП,,1


In [70]:
rj1 = rest.reset_index().merge(
    cities.loc[cities["settlement"].isna(), "region_for_join":"id"],
    how="left",
    left_on=("region", "district_name", "city_name"),
    right_on=("region_for_join", "district_for_join", "city_for_join"),
    suffixes=("", "_cj1")
)
rj1.drop(columns=[col for col in rj1.columns if "_join" in col], inplace=True)
rj1.rename(columns={"id": "id_cj1"}, inplace=True)
rj1.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,Р-Н,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0


In [71]:
rj1.shape

(11698, 26)

In [72]:
rj2 = rj1.merge(
    cities.loc[(cities["area_type"] == "г") & (cities["city"].isna()), "region_for_join":"id"],
    how="left",
    left_on=("region", "district_name"),
    right_on=("region_for_join", "district_for_join"),
    suffixes=("", "_cj2")
)
rj2.drop(columns=[col for col in rj2.columns if "_join" in col], inplace=True)
rj2.rename(columns={"id": "id_cj2"}, inplace=True)
rj2.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,Р-Н,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,,
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0,


In [73]:
rj3 = rj2.merge(
    cities.loc[(cities["area_type"] == "г") & (cities["city"].isna()), "region_for_join":"id"],
    how="left",
    left_on=("region", "city_name"),
    right_on=("region_for_join", "district_for_join"),
    suffixes=("", "_cj3")
)
rj3.drop(columns=[col for col in rj3.columns if "_join" in col], inplace=True)
rj3.rename(columns={"id": "id_cj3"}, inplace=True)
rj3.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,Р-Н,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,,,
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0,,


In [74]:
rj4 = rj3.merge(
    cities.loc[cities["settlement"].isna() & cities["city"].notna(), "region_for_join":"id"],
    how="left",
    left_on=("region", "city_name"),
    right_on=("region_for_join", "city_for_join"),
    suffixes=("", "_cj4")
)
rj4.drop(columns=[col for col in rj4.columns if "_join" in col], inplace=True)
rj4.rename(columns={"id": "id_cj4"}, inplace=True)
rj4.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3,id_cj4
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,Р-Н,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,,,,
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0,,,721.0


In [75]:
rj4 = rj4.merge(pd.concat(
        (socr[["name", "name_full"]].apply(lambda x: x.str.upper()).drop_duplicates("name"),
         pd.DataFrame({"name": socr["name_full"].drop_duplicates().str.upper(), "name_full": socr["name_full"].drop_duplicates().str.upper()})
        )
    ),
    how="left",
    left_on="district_type",
    right_on="name")
rj4["district_type"] = rj4["name_full"]

In [76]:
rj4["district_for_join"] = rj4["district_name"] + " " + rj4["district_type"]

In [77]:
settlements_for_rj4 = pd.read_csv("settlements_12032021/data.csv")
settlements_for_rj4["municipality_for_join"] = settlements_for_rj4["municipality"].str.upper()

In [78]:
settlements_for_rj4.drop_duplicates(subset=("municipality", "region", "settlement"), keep=False)

Unnamed: 0,id,region,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat,municipality_for_join
0,0,Орловская область,Болховский,Колонтаева,д,0,0,53.22.07,035.54.36,53.368611,35.910000,5.460442e+10,0,1,БОЛХОВСКИЙ
1,1,Республика Крым,Алушта,Пушкино,с,273,0,44.35.45,034.20.27,44.595833,34.340833,3.570300e+10,0,1,АЛУШТА
2,2,Липецкая область,Лев-Толстовский район,Барятино,с,7,1,53.15.46,039.30.14,53.262778,39.503889,4.263641e+10,0,1,ЛЕВ-ТОЛСТОВСКИЙ РАЙОН
3,3,Тверская область,Селижаровский район,Хилово,д,2,0,56.54.20,033.25.09,56.905556,33.419167,2.865043e+10,0,1,СЕЛИЖАРОВСКИЙ РАЙОН
4,4,Томская область,Парабельский район,Басмасово,д,6,0,58.38.12,082.02.40,58.636667,82.044444,6.964444e+10,0,1,ПАРАБЕЛЬСКИЙ РАЙОН
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155916,155916,Ульяновская область,Вешкаймский район,Ребровка,д,0,0,53.59.12,046.48.35,53.986667,46.809722,7.360744e+10,0,1,ВЕШКАЙМСКИЙ РАЙОН
155917,155917,Псковская область,Палкинский район,Шадрица,д,1,0,57.29.03,028.08.48,57.484167,28.146667,5.863742e+10,0,1,ПАЛКИНСКИЙ РАЙОН
155918,155918,Псковская область,Себежский район,Овчинниково,д,0,0,56.19.48,028.26.17,56.330000,28.438056,5.865410e+10,0,1,СЕБЕЖСКИЙ РАЙОН
155919,155919,Тульская область,рабочий пос. Новогуровский,Новогуровский,рп,3398,305,54.28.03,037.20.19,54.467500,37.338611,7.070200e+10,1,0,РАБОЧИЙ ПОС. НОВОГУРОВСКИЙ


In [79]:
settlements_for_rj4["region_for_join"] = settlements_for_rj4["region"].apply(remove_type_from_region_name).str.upper()
settlements_for_rj4["settlement_for_join"] = settlements_for_rj4["settlement"].str.upper()

In [80]:
rj5 = rj4.merge(
    settlements_for_rj4.drop_duplicates(subset=("municipality", "region", "settlement"), keep=False),
    how="left",
    left_on=("region", "district_for_join", "settlement_name"),
    right_on=("region_for_join", "municipality_for_join", "settlement_for_join"),
    suffixes=("", "_cj5")
)
rj5.drop(columns=[col for col in rj5.columns if "_join" in col], inplace=True)
rj5.rename(columns={"id": "id_cj5"}, inplace=True)
rj5.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3,id_cj4,name,name_full,id_cj5,region_cj5,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,РАЙОН,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,,,,,Р-Н,РАЙОН,109043.0,Краснодарский край,Крыловский район,Крыловская,ст-ца,12725.0,2376.0,46.19.15,039.57.18,46.320833,39.955,3624411000.0,0.0,1.0
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0,,,721.0,,,,,,,,,,,,,,,,


In [81]:
rj5["district_for_join"] = rj5["district_name"]
rj5["city_for_join"] = rj5["city_type"] + " " + rj5["city_name"]

In [82]:
rj6 = rj5.merge(
    settlements_for_rj4.drop_duplicates(subset=("municipality", "region", "settlement"), keep=False),
    how="left",
    left_on=("region", "district_for_join", "settlement_name"),
    right_on=("region_for_join", "municipality_for_join", "settlement_for_join"),
    suffixes=("", "_cj6")
)
rj6.drop(columns=[col for col in rj6.columns if "_join" in col], inplace=True)
rj6.rename(columns={"id": "id_cj6"}, inplace=True)
rj6.head(2)

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3,id_cj4,name,name_full,id_cj5,region_cj5,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat,id_cj6,region_cj6,municipality_cj6,settlement_cj6,type_cj6,population_cj6,children_cj6,latitude_dms_cj6,longitude_dms_cj6,latitude_dd_cj6,longitude_dd_cj6,oktmo_cj6,dadata_cj6,rosstat_cj6
0,63,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ВАЛЕ...","ООО ""ВАЛЕРИЯ""",2360014000.0,1212300000000.0,23,КРАСНОДАРСКИЙ,КРАЙ,КРЫЛОВСКИЙ,РАЙОН,,,КРЫЛОВСКАЯ,СТ-ЦА,69.1,2021-08-10,,КРАСНОДАРСКИЙ,,,,,Р-Н,РАЙОН,109043.0,Краснодарский край,Крыловский район,Крыловская,ст-ца,12725.0,2376.0,46.19.15,039.57.18,46.320833,39.955,3624411000.0,0.0,1.0,,,,,,,,,,,,,,
1,76,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ФИНА...","ООО ""ФИНАНСОВО-ЮРИДИЧЕСКОЕ БЮРО""",2508126000.0,,25,ПРИМОРСКИЙ,КРАЙ,,,НАХОДКА,ГОРОД,ВРАНГЕЛЬ,МИКРОРАЙОН,69.1,2016-08-10,2019-05-10,ПРИМОРСКИЙ,721.0,,,721.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [83]:
rj6.notna().sum()

index                 11698
kind                  11698
category              11698
ind_tin                8184
ind_number             1657
first_name             8184
last_name              8184
patronymic             8130
org_name               3514
org_short_name         3498
org_tin                3514
org_number              491
region_code           11698
region_name           11678
region_type           11678
district_name          7966
district_type          7966
city_name              6203
city_type              6203
settlement_name        6517
settlement_type        6517
activity_code_main    11698
start_date            11698
end_date               4720
region                11489
id_cj1                 2534
id_cj2                  218
id_cj3                 1711
id_cj4                 3824
name                   7966
name_full              7966
id_cj5                 1481
region_cj5             1481
municipality           1481
settlement             1481
type                

In [84]:
rj6[[c for c in rj6.columns if "id_cj" in c]].notna().sum(axis=1).value_counts()[0] / rsmp.shape[0]

0.018925593422237718

In [85]:
rj6.loc[rj6[[c for c in rj6.columns if "id_cj" in c]].notna().sum(axis=1) == 0]

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3,id_cj4,name,name_full,id_cj5,region_cj5,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat,id_cj6,region_cj6,municipality_cj6,settlement_cj6,type_cj6,population_cj6,children_cj6,latitude_dms_cj6,longitude_dms_cj6,latitude_dd_cj6,longitude_dd_cj6,oktmo_cj6,dadata_cj6,rosstat_cj6
2,104,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""КУАТ...","ООО ""КУАТТРО""",3.510009e+09,,35,ВОЛОГОДСКАЯ,ОБЛАСТЬ,КАДУЙСКИЙ,РАЙОН,,,КАДУЙ,РАБОЧИЙ ПОСЕЛОК,69.1,2016-08-10,,ВОЛОГОДСКАЯ,,,,,РАЙОН,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,312,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АНЕЛАН""","ООО ""АНЕЛАН""",7.733244e+09,,77,МОСКВА,ГОРОД,КРАСНОПАХОРСКОЕ,ПОСЕЛЕНИЕ,,,БЫЛОВО,СЕЛО,69.1,2016-11-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12,454,2,1,2.245023e+11,3.222202e+14,РУСЛАН,ГАДЖИЕВ,ЛАГМАН ОГЛЫ,,,,,22,АЛТАЙСКИЙ,КРАЙ,ЗОНАЛЬНЫЙ,РАЙОН,,,БУЛАНИХА,С.,69.1,2022-11-10,,АЛТАЙСКИЙ,,,,,Р-Н,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,475,2,1,2.437009e+11,,АНЖЕЛА,ФИЛИМОНОВА,КОНСТАНТИНОВНА,,,,,24,КРАСНОЯРСКИЙ,КРАЙ,НОВОСЕЛОВСКИЙ,РАЙОН,,,НОВОСЕЛОВО,СЕЛО,69.1,2017-04-10,,КРАСНОЯРСКИЙ,,,,,РАЙОН,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
15,531,2,1,3.459193e+11,3.229401e+14,ОЛЬГА,МЕДВЕДЕВА,ВЛАДИМИРОВНА,,,,,94,ЛУГАНСКАЯ НАРОДНАЯ,РЕСП.,,,ЛУТУГИНО,Г.,,,69.1,2023-01-10,2023-02-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11684,171952,2,1,6.410018e+11,3.226457e+14,ЖАНАТ,ЖАНТАЛИЕВ,КАНАТОВИЧ,,,,,64,САРАТОВСКАЯ,ОБЛ.,ДЕРГАЧЕВСКИЙ,РАЙОН,,,ВОСТОЧНЫЙ,П.,69.1,2022-06-10,,САРАТОВСКАЯ,,,,,Р-Н,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11691,172004,2,1,7.132103e+11,,ВЛАДИМИР,ИСАЕВ,НИКОЛАЕВИЧ,,,,,71,ТУЛЬСКАЯ,ОБЛАСТЬ,ЧЕРНСКИЙ,РАЙОН,,,ЧЕРНЬ,РАБОЧИЙ ПОСЕЛОК,69.1,2016-08-10,,ТУЛЬСКАЯ,,,,,РАЙОН,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11692,172018,2,1,7.452040e+11,,МАКСИМ,БИККУЛОВ,РАМИЛОВИЧ,,,,,74,ЧЕЛЯБИНСКАЯ,ОБЛАСТЬ,СОСНОВСКИЙ,РАЙОН,,,СМОЛИНО Ж-Д. СТ.,ПОСЕЛОК,69.1,2016-08-10,,ЧЕЛЯБИНСКАЯ,,,,,РАЙОН,РАЙОН,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11694,172052,2,1,7.729914e+11,3.227746e+14,АЛЕКСЕЙ,БОЙКО,ВАЛЕРЬЕВИЧ,,,,,77,МОСКВА,Г.,ВНУКОВСКОЕ,ПОСЕЛОК,,,ИЗВАРИНО,Д.,69.1,2022-06-10,,МОСКВА,,,,,П.,ПОСЕЛОК,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [86]:
pd.DataFrame(rj6.loc[rj6[[c for c in rj6.columns if "id_cj" in c]].notna().sum(axis=1) == 0]["city_name"].value_counts()).head(20)

Unnamed: 0_level_0,count
city_name,Unnamed: 1_level_1
МОСКОВСКИЙ,167
МАРКОВА,37
ШАХОВСКАЯ,26
МАЛАХОВКА,22
ПРОМЫШЛЕННАЯ,22
КЛИМОВСК,20
КОЛЬЦОВО,19
ТОМИЛИНО,17
КРАСКОВО,15
МАКЕЕВКА,14


In [87]:
rj6.loc[rj6["city_name"] == "МОСКОВСКИЙ"]

Unnamed: 0,index,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date,region,id_cj1,id_cj2,id_cj3,id_cj4,name,name_full,id_cj5,region_cj5,municipality,settlement,type,population,children,latitude_dms,longitude_dms,latitude_dd,longitude_dd,oktmo,dadata,rosstat,id_cj6,region_cj6,municipality_cj6,settlement_cj6,type_cj6,population_cj6,children_cj6,latitude_dms_cj6,longitude_dms_cj6,latitude_dd_cj6,longitude_dd_cj6,oktmo_cj6,dadata_cj6,rosstat_cj6
71,1298,2,1,2.129142e+11,,ЮЛИЯ,ВДОВИНА,ЛЕОНИДОВНА,,,,,77,МОСКВА,Г.,МОСКОВСКИЙ,ПОСЕЛОК,МОСКОВСКИЙ,Г.,,,69.1,2021-03-10,,МОСКВА,,,,,П.,ПОСЕЛОК,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107,1651,2,1,7.729720e+11,,ЮЛИЯ,СЕРГЕЕВА,СЕРГЕЕВНА,,,,,77,МОСКВА,ГОРОД,МОСКОВСКИЙ,ПОСЕЛЕНИЕ,МОСКОВСКИЙ,ГОРОД,,,69.1,2021-06-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
242,3828,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""СОНАР""","ООО ""СОНАР""",7.751108e+09,,77,МОСКВА,ГОРОД,,,МОСКОВСКИЙ,ГОРОД,,,69.1,2017-12-10,,МОСКВА,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
420,6439,2,1,5.190065e+10,,НАДИР,КАЗИЕВ,ЗАМИРОВИЧ,,,,,77,МОСКВА,ГОРОД,МОСКОВСКИЙ,ПОСЕЛЕНИЕ,МОСКОВСКИЙ,ГОРОД,,,69.1,2021-04-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
425,6549,2,1,3.435172e+11,3.227746e+14,ЕКАТЕРИНА,БАРАНОВА,ВИКТОРОВНА,,,,,77,МОСКВА,Г.,МОСКОВСКИЙ,ПОСЕЛОК,МОСКОВСКИЙ,Г.,,,69.1,2022-02-10,,МОСКВА,,,,,П.,ПОСЕЛОК,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11323,166411,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ПРОФ...","ООО ""ПРОФИТ-ПЛЮС""",7.751068e+09,,77,МОСКВА,ГОРОД,,,МОСКОВСКИЙ,ГОРОД,,,69.1,2017-09-10,,МОСКВА,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11325,166413,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ПРАВ...","ООО ""ПРАВОВАЯ КОМПАНИЯ ""АСПЕКТ""",7.751178e+09,,77,МОСКВА,ГОРОД,МОСКОВСКИЙ,ПОСЕЛЕНИЕ,МОСКОВСКИЙ,ГОРОД,,,69.1,2020-04-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11472,168508,2,1,5.603058e+11,,АЛЬБЕРТ,ГИМАТДИНОВ,АЛЬБЕРТОВИЧ,,,,,77,МОСКВА,ГОРОД,МОСКОВСКИЙ,ПОСЕЛЕНИЕ,МОСКОВСКИЙ,ГОРОД,,,69.1,2019-05-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11486,168631,2,1,7.720731e+11,,ВЛАДИСЛАВ,ЕФИМОВИЧ,ИГОРЕВИЧ,,,,,77,МОСКВА,ГОРОД,МОСКОВСКИЙ,ПОСЕЛЕНИЕ,МОСКОВСКИЙ,ГОРОД,,,69.1,2019-06-10,,МОСКВА,,,,,ПОСЕЛЕНИЕ,ПОСЕЛЕНИЕ,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [139]:
cities.loc[cities["city_for_join"] == "ЩЁЛКОВО"]["city_for_join"].str.replace("Ё", "Е")

579    ЩЕЛКОВО
Name: city_for_join, dtype: object

In [195]:
(6205 - 829 - 1980) / rsmp.shape[0]

0.019733286847380807

In [33]:
regions_standard = cities["region"].dropna().unique()
def get_region(name):
    if pd.isna(name):
        return np.nan
    
    if name in regions_standard:
        return name
    
    match, score = process.extractOne(name, regions_standard)
    if score > 80:
        return match
    
    return np.nan

unique_regions = rest["region_name"].dropna().unique()
regions_mapping = {
    "region_name": unique_regions,
    "region_for_join": [get_region(x) for x in unique_regions],
}
regions_mapping_table = pd.DataFrame(regions_mapping)
regions_mapping_table.isna().sum()

region_name        0
region_for_join    5
dtype: int64

In [34]:
class Geonormalizer:
    THRES = 80
    
    def __init__(self):
        self._settlements = pd.read_csv("settlements_12032021/data.csv")
        self._cities = pd.read_csv("city.csv")
        self._regions = None
        self._cities_index = None
        self._settlements_standard = None
        self._addresses_cache = {}
        self._regions_cache = {}
        
        self._get_regions()
        self._prepare_cities()
        self._prepare_settlements()        
    
    def _get_regions(self):
        self._regions = self._cities["region"].unique()
        
    def _prepare_cities(self):
        cols_to_uppercase = ["region", "area", "city", "settlement"]
        new_cols = [f"{x}_search" for x in cols_to_uppercase]
        self._cities[new_cols] = self._cities[cols_to_uppercase].apply(lambda x: x.str.upper())
    
    def _prepare_settlements(self):
        self._settlements["region_name"] = self._settlements["region"].apply(self._normalize_region)
        cols_to_uppercase = ["region_name", "municipality", "settlement"]
        new_cols = [f"{x}_search" for x in cols_to_uppercase]
        self._settlements[new_cols] = self._settlements[cols_to_uppercase].apply(lambda x: x.str.upper())
        
    def _normalize_region(self, region_name: str) -> str:
        if region_name == "" or pd.isna(region_name):
            return None
        
        cached_region = self._regions_cache.get(region_name)
        if cached_region:
            return cached_region
        
        types = ["область", "Республика", "республика", "край", "автономный округ", "автономная область"]
        for t in types:
            search = region_name.replace(t, "")

        search = region_name.strip()
        
        match, score = process.extractOne(search, self._regions)
        if score > self.THRES:
            self._regions_cache[region_name] = match
            return match
        
        return pd.NA
    
    def _search_by_area_and_city(self, region_name, row):
        cond = (self._cities["region"] == region_name)
        if row["district_name"] != "":
            cond = cond & (self._cities["area_search"] == row["district_name"])
        if row["city_name"] != "":
            cond = cond & (self._cities["city_search"] == row["city_name"])

        candidates = self._cities.loc[cond]
        if candidates.shape[0] == 1:
            result_row = candidates.iloc[0]
            return dict(
                region=result_row["region"],
                area=result_row["area"],
                city=result_row["city"],
                oktmo=result_row["oktmo"],
                method="a+c"
            )
        
        return None
    
    def _search_by_city(self, region_name, row):
        cond = (self._cities["region"] == region_name)
        if row["city_name"] != "":
            cond = cond & (self._cities["city_search"] == row["city_name"])
        else:
            return None

        candidates = self._cities.loc[cond]
        if candidates.shape[0] == 1:
            result_row = candidates.iloc[0]
            return dict(
                region=result_row["region"],
                area=result_row["area"],
                city=result_row["city"],
                oktmo=result_row["oktmo"],
                method="c"
            )
        
        return None
    
    def _search_by_area(self, region_name, row):
        cond = (self._cities["region"] == region_name)
        if row["district_name"] != "":
            cond = cond & (self._cities["area_search"] == row["district_name"])
        else:
            return None

        candidates = self._cities.loc[cond]
        if candidates.shape[0] == 1:
            result_row = candidates.iloc[0]
            return dict(
                region=result_row["region"],
                area=result_row["area"],
                city=result_row["city"],
                oktmo=result_row["oktmo"],
                method="a"
            )
        
        return None
    
    def _search_by_mun_and_settlement(self, region_name, row):
        cond = (self._settlements["region"] == region_name)
        if row["district_name"] != "":
            cond = cond & (self._settlements["municipality_search"] == row["district_name"])
        if row["settlement_name"] != "":
            cond = cond & (self._settlements["settlement_search"] == row["settlement_name"])

        candidates = self._settlements.loc[cond]
        if candidates.shape[0] == 1:
            result_row = candidates.iloc[0]
            return dict(
                region=result_row["region"],
                area=result_row["municipality"],
                city=result_row["settlement"],
                oktmo=result_row["oktmo"],
                method="m+s"
            )
        
        return None
        
    def _fuzzy_search_in_cities(self, region_name, row):
        candidates = self._cities.loc[self._cities["region"] == region_name.upper(), "area":"settlement_type"]
        if candidates.shape[0] == 0:
            return
        
        search = " ".join(row.to_list())
        candidates["score"] = candidates.apply(
            lambda x: fuzz.token_sort_ratio(" ".join(x.fillna("").to_list()), search),
            axis=1,
        )
        candidates = candidates.sort_values("score", ascending=False)
        return candidates
        
        if candidates.iloc[0]["score"] > self.THRES:
            result_row = candidates.iloc[0]
            return dict(
                region=result_row["region"],
                area=result_row["area"],
                city=result_row["city"],
                oktmo=result_row["oktmo"],
                method="f_c"
            )
        
    def _fuzzy_search_in_settlements(self, region_name, row):
        settlement_name = row["settlement_name"].upper()
        cond = (
            (self._settlements["region_name_search"] == str(region_name).upper())
            & (self._settlements["settlement_search"] == settlement_name)
        )
        candidates = self._settlements.loc[cond, "municipality":"settlement"]
        if candidates.shape[0] == 0:
            return
        
        search = f"{row['district_name']} {row['city_name']} {row['settlement_name']}"
        candidates["score"] = candidates.apply(
            lambda x: fuzz.token_sort_ratio(" ".join(x.fillna("").to_list()), search),
            axis=1,
        )
        candidates = candidates.sort_values("score", ascending=False)
        print(candidates)
        
        if candidates.iloc[0]["score"] > self.THRES:
            result_row = candidates.iloc[0]
            return dict(
                #region=result_row["region"],
                area=result_row["municipality"],
                settlement=result_row["settlement"],
                #oktmo=result_row["oktmo"],
                method="f_s"
            )
    
    def process_address(self, row: pd.Series) -> dict:
        row = row["region_name":"settlement_type"].fillna("")

        cache_index = "_".join(row.to_list())
        cached_result = self._addresses_cache.get(cache_index)
        if cached_result:
            return cached_result
        
        region_norm = self._normalize_region(row["region_name"])
        
        # Apply different search methods until the result is found
        methods = (
            #self._search_by_area_and_city,
           # self._search_by_city,
            #self._search_by_area,
            #self._search_by_mun_and_settlement,
           # self._fuzzy_search_in_cities,
            self._fuzzy_search_in_settlements,
        )
        
        for method in methods:
            result = method(region_norm, row)
            if result is not None:
                return result
            
        
    
        

In [35]:
g = Geonormalizer()