In [1]:
from fuzzywuzzy import fuzz, process
import numpy as np
import pandas as pd
import tqdm

In [2]:
# Do not truncate wide DataFrames when they are rendered: show every column.
pd.set_option("display.max_columns", None)
# IPython magic: print floats with three decimal places (echoes '%.3f').
%precision 3

'%.3f'

In [3]:
# Register extract: every column is read as text (dtype=str).
rsmp = pd.read_csv("rsmp/csv/data.csv", dtype=str)

# Reference tables, read with pandas' default type inference.
settlements, cities_base, cities_additional, socr = map(
    pd.read_csv,
    (
        "assets/settlements.csv",
        "assets/cities.csv",
        "assets/cities_additional.csv",
        "assets/socrbase.csv",
    ),
)

In [4]:
# Stack the base and the additional city tables. ignore_index=True gives the
# result a fresh 0..n-1 index. The previous code called reset_index() without
# keeping its return value, so the combined frame still carried the repeated
# labels of its two inputs, which breaks any later index-aligned assignment.
cities = pd.concat((cities_base, cities_additional), ignore_index=True)
# Positional surrogate key for the combined city table.
cities["id"] = range(len(cities))

# Positional key of every register record; later merges join on it.
rsmp["id"] = range(len(rsmp))

In [5]:
# Identity lookup: each full type name maps to itself, so a value that is
# already spelled out can be resolved through the same table as abbreviations.
socr_full_to_full = pd.DataFrame(
    {"name": socr["name_full"], "name_full": socr["name_full"]}
)
socr_full_to_full.head(3)

Unnamed: 0,name,name_full
0,Автономный округ,Автономный округ
1,Автономная область,Автономная область
2,Город,Город


In [6]:
# Abbreviations that are stored without a trailing dot get a dotted variant,
# so both spellings ("г" and "г.") resolve to the same full name.
has_no_dot = ~socr["name"].str.endswith(".")
socr_without_dot = socr.loc[has_no_dot, ["name", "name_full"]].assign(
    name=lambda frame: frame["name"] + "."
)
socr_without_dot.head(3)

Unnamed: 0,name,name_full
0,АО.,Автономный округ
1,Аобл.,Автономная область
2,г.,Город


In [7]:
# Single lookup table "spelling -> full type name": original abbreviations,
# full names mapped to themselves, and dotted variants. Keys are upper-cased
# and only the first occurrence of each key is kept.
abbr_to_full = (
    pd.concat([socr[["name", "name_full"]], socr_full_to_full, socr_without_dot])
    .assign(name=lambda frame: frame["name"].str.upper())
    .drop_duplicates("name")
)
abbr_to_full.head(10)

Unnamed: 0,name,name_full
0,АО,Автономный округ
1,АОБЛ,Автономная область
2,Г,Город
3,КРАЙ,Край
4,ОБЛ,Область
5,РЕСП,Республика
6,ОКРУГ,Округ
7,ЧУВАШИЯ,Чувашия
8,А.ОБЛ.,Автономная область
9,А.ОКР.,Автономный округ


In [8]:
# Address parts of a register record: a (name, type) pair for each level.
address_cols = [
    "region_name", "region_type",
    "district_name", "district_type",
    "city_name", "city_type",
    "settlement_name", "settlement_type",
]
# Working table: the record id plus its address parts.
addresses = rsmp[["id", *address_cols]]
addresses.head(2)

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type
0,0,ТВЕРСКАЯ,ОБЛАСТЬ,,,КИМРЫ,ГОРОД,,
1,1,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,


In [9]:
# Baseline row count; later cells assert that merges leave it unchanged.
addr_count = addresses.shape[0]
addr_count

163900

In [10]:
# Number of missing values per address column before any normalisation.
addresses.isna().sum()

id                      0
region_name            19
region_type            19
district_name      137368
district_type      137369
city_name           57806
city_type           57806
settlement_name    144288
settlement_type    144288
dtype: int64

In [11]:
def join_name_and_type(name, type_):
    """Combine a place name and its type into one label.

    Types listed in ``leading_types`` are placed before the name
    ("Город X"); every other type is placed after it ("X Область").

    Returns ``np.nan`` when either ``name`` or ``type_`` is missing.
    """
    if pd.isna(name) or pd.isna(type_):
        return np.nan

    # Types that are written in front of the name.
    leading_types = (
        "Город",
        "Республика",
        "Поселок",
        "Поселок городского типа",
        "Рабочий поселок",
    )
    return f"{type_} {name}" if type_ in leading_types else f"{name} {type_}"

In [12]:
def preprocess_text_column(c):
    """Normalise a text Series: upper-case it, then replace "Ё" with "Е"."""
    uppercased = c.str.upper()
    return uppercased.str.replace("Ё", "Е")

In [13]:
# For every address level: replace the raw type with its full name from
# abbr_to_full, then build a combined "<level>" column from name and type.
for option in ("region", "district", "city", "settlement"):
    name_col = f"{option}_name"
    target_col = f"{option}_type"

    # Left join keeps every address; types without a match become NaN.
    addresses = addresses.merge(
        abbr_to_full, how="left", left_on=target_col, right_on="name"
    )
    addresses[target_col] = addresses["name_full"]
    # Remove the helper columns brought in by the lookup table.
    addresses = addresses.drop(columns=abbr_to_full.columns)
    assert len(addresses) == addr_count, (
        f"Number of addresses must not change, but for {target_col} "
        f"the size has changed: {addr_count} -> {len(addresses)}"
    )

    addresses[option] = addresses[[name_col, target_col]].apply(
        lambda row: join_name_and_type(row[name_col], row[target_col]),
        axis=1,
    )
addresses.isna().sum()

id                      0
region_name            19
region_type            20
district_name      137368
district_type      137369
city_name           57806
city_type           57806
settlement_name    144288
settlement_type    144292
region                 20
district           137369
city                57806
settlement         144292
dtype: int64

In [14]:
# Fuzzy-match every distinct register region name to the closest region
# spelling used in the cities table (default fuzzywuzzy scorer).
unique_region_names = addresses["region_name"].dropna().unique()
unique_regions_in_cities = cities["region"].dropna().unique()

closest_city_regions = [
    process.extractOne(candidate, unique_regions_in_cities)[0]
    for candidate in unique_region_names
]
region_names_to_cities = pd.DataFrame(
    {"region_name": unique_region_names, "region_cities": closest_city_regions}
)
region_names_to_cities.head(3)

Unnamed: 0,region_name,region_cities
0,ТВЕРСКАЯ,Тверская
1,БАШКОРТОСТАН,Башкортостан
2,БУРЯТИЯ,Бурятия


In [15]:
# Fuzzy-match every distinct "name + type" region label to the closest region
# spelling used in the settlements table, scoring with token_set_ratio.
unique_regions = addresses["region"].dropna().unique()
unique_regions_in_settlements = settlements["region"].dropna().unique()

closest_settlement_regions = [
    process.extractOne(
        candidate, unique_regions_in_settlements, scorer=fuzz.token_set_ratio
    )[0]
    for candidate in unique_regions
]
regions_to_settlements = pd.DataFrame(
    {"region": unique_regions, "region_settlements": closest_settlement_regions}
)
regions_to_settlements.head(3)

Unnamed: 0,region,region_settlements
0,ТВЕРСКАЯ Область,Тверская область
1,Республика БАШКОРТОСТАН,Республика Башкортостан
2,Республика БУРЯТИЯ,Республика Бурятия


In [16]:
# Attach both region spellings (cities-style and settlements-style).
addresses = (
    addresses
    .merge(region_names_to_cities, how="left", on="region_name")
    .merge(regions_to_settlements, how="left", on="region")
)
# Normalise every column except the leading id (upper-case, Ё -> Е).
text_part = addresses.iloc[:, 1:]
addresses.iloc[:, 1:] = text_part.apply(preprocess_text_column)
assert len(addresses) == addr_count
addresses.head(2)

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,region,district,city,settlement,region_cities,region_settlements
0,0,ТВЕРСКАЯ,ОБЛАСТЬ,,,КИМРЫ,ГОРОД,,,ТВЕРСКАЯ ОБЛАСТЬ,,ГОРОД КИМРЫ,,ТВЕРСКАЯ,ТВЕРСКАЯ ОБЛАСТЬ
1,1,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,РЕСПУБЛИКА БАШКОРТОСТАН,,ГОРОД УФА,,БАШКОРТОСТАН,РЕСПУБЛИКА БАШКОРТОСТАН


In [17]:
# Matching view of the cities table.
std_c_cols = ["id", "region", "area", "city", "settlement"]

# Rows whose "area" is itself a city (area_type "г") and that have no city
# value are added a second time with the area moved into the city slot.
is_city_stored_as_area = (cities["area_type"] == "г") & cities["city"].isna()
cities_from_areas = (
    cities.loc[is_city_stored_as_area]
    .assign(city=lambda frame: frame["area"], area=np.nan)
    [std_c_cols]
)
std_c = pd.concat([cities[std_c_cols], cities_from_areas])

# Normalise every column except the leading id.
std_c.iloc[:, 1:] = std_c.iloc[:, 1:].apply(preprocess_text_column)
std_c.head(3)

Unnamed: 0,id,region,area,city,settlement
0,0,АДЫГЕЯ,,АДЫГЕЙСК,
1,1,АДЫГЕЯ,,МАЙКОП,
2,2,АЛТАЙ,,ГОРНО-АЛТАЙСК,


In [18]:
# Matching view of the settlements table: the abbreviated type is upper-cased,
# replaced by its full name via abbr_to_full, then all text is normalised.
std_s = (
    settlements[["id", "region", "municipality", "settlement", "type"]]
    .assign(type=lambda frame: frame["type"].str.upper())
    .merge(abbr_to_full, how="left", left_on="type", right_on="name")
    .assign(type=lambda frame: frame["name_full"])
    .drop(columns=abbr_to_full.columns)
)
std_s.iloc[:, 1:] = std_s.iloc[:, 1:].apply(preprocess_text_column)
std_s.head(3)

Unnamed: 0,id,region,municipality,settlement,type
0,0,ОРЛОВСКАЯ ОБЛАСТЬ,БОЛХОВСКИЙ,КОЛОНТАЕВА,ДЕРЕВНЯ
1,1,РЕСПУБЛИКА КРЫМ,АЛУШТА,ПУШКИНО,СЕЛО
2,2,ЛИПЕЦКАЯ ОБЛАСТЬ,ЛЕВ-ТОЛСТОВСКИЙ РАЙОН,БАРЯТИНО,СЕЛО


In [19]:
# Matching strategies, in the order they are attempted. Each row is
# (label, address columns, reference columns, reference table kind); the two
# column tuples are paired position by position when merging.
_merge_option_rows = (
    (
        "Settlements by all parts with full district name",
        ("region_settlements", "district", "settlement_name", "settlement_type"),
        ("region", "municipality", "settlement", "type"),
        "settlements",
    ),
    (
        "Settlements by all parts with partial district name (no type)",
        ("region_settlements", "district_name", "settlement_name", "settlement_type"),
        ("region", "municipality", "settlement", "type"),
        "settlements",
    ),
    (
        "Settlements by all parts with full city name",
        ("region_settlements", "city", "settlement_name", "settlement_type"),
        ("region", "municipality", "settlement", "type"),
        "settlements",
    ),
    (
        "Settlements by all parts with partial city name (no type)",
        ("region_settlements", "city_name", "settlement_name", "settlement_type"),
        ("region", "municipality", "settlement", "type"),
        "settlements",
    ),
    (
        "Settlements by all parts except for type with full district name",
        ("region_settlements", "district", "settlement_name"),
        ("region", "municipality", "settlement"),
        "settlements",
    ),
    (
        "Settlements by all parts except for type with partial district name",
        ("region_settlements", "district_name", "settlement_name"),
        ("region", "municipality", "settlement"),
        "settlements",
    ),
    (
        "Settlements by all parts except for type with full city name",
        ("region_settlements", "city", "settlement_name"),
        ("region", "municipality", "settlement"),
        "settlements",
    ),
    (
        "Settlements by all parts except for type with partial city name",
        ("region_settlements", "city_name", "settlement_name"),
        ("region", "municipality", "settlement"),
        "settlements",
    ),
    (
        "Settlements by region and settlement with type",
        ("region_settlements", "settlement_name", "settlement_type"),
        ("region", "settlement", "type"),
        "settlements",
    ),
    (
        "Settlements by region and settlement without type",
        ("region_settlements", "settlement_name"),
        ("region", "settlement"),
        "settlements",
    ),
    (
        "Cities by all parts",
        ("region_cities", "district_name", "city_name", "settlement_name"),
        ("region", "area", "city", "settlement"),
        "cities",
    ),
    (
        "Cities by all parts except for settlements",
        ("region_cities", "district_name", "city_name"),
        ("region", "area", "city"),
        "cities",
    ),
    (
        "Cities by region and city",
        ("region_cities", "city_name"),
        ("region", "city"),
        "cities",
    ),
    (
        "Cities by region and district-as-city",
        ("region_cities", "city_name"),
        ("region", "area"),
        "cities",
    ),
)
merge_options = [
    {
        "name": label,
        "addresses": list(address_keys),
        "standard": list(standard_keys),
        "type": kind,
    }
    for label, address_keys, standard_keys, kind in _merge_option_rows
]

In [20]:
# Cascade through merge_options: each strategy is tried only on the addresses
# that no earlier strategy matched.
mappings = []
rest = addresses
orig_cols = addresses.columns
for option in merge_options:
    name, type_ = option["name"], option["type"]
    left_cols, right_cols = option["addresses"], option["standard"]

    # Reference rows whose key occurs more than once are ambiguous, so all of
    # them are discarded (keep=False) rather than picking one arbitrarily.
    reference = std_c if type_ == "cities" else std_s
    standard = reference.drop_duplicates(subset=right_cols, keep=False)
    if len(right_cols) == 2:
        # Two-column keys additionally require both parts to be present.
        standard = standard.dropna(subset=right_cols)
    standard = standard.rename(columns={"id": "geo_id"})

    to_merge = rest[orig_cols]
    merged = to_merge.merge(
        standard,
        how="left",
        left_on=left_cols,
        right_on=right_cols,
        suffixes=("", "_x"),
    )
    # The left join must neither drop nor multiply address rows.
    assert len(to_merge) == len(merged)

    is_matched = merged["geo_id"].notna()
    # "type" stores the first letter of the reference table kind ("s" / "c").
    mapped = merged.loc[is_matched, ["id", "geo_id"]].assign(type=type_[0])
    if len(mapped) > 0:
        mappings.append(mapped)

    rest = merged.loc[~is_matched]

    print(f"Option {name}: found {len(mapped)} matches, {len(rest)} records left")

Option Settlements by all parts with full district name: found 7137 matches, 156763 records left
Option Settlements by all parts with partial district name (no type): found 3033 matches, 153730 records left
Option Settlements by all parts with full city name: found 129 matches, 153601 records left
Option Settlements by all parts with partial city name (no type): found 469 matches, 153132 records left
Option Settlements by all parts except for type with full district name: found 1443 matches, 151689 records left
Option Settlements by all parts except for type with partial district name: found 613 matches, 151076 records left
Option Settlements by all parts except for type with full city name: found 180 matches, 150896 records left
Option Settlements by all parts except for type with partial city name: found 188 matches, 150708 records left
Option Settlements by region and settlement with type: found 2305 matches, 148403 records left
Option Settlements by region and settlement without ty

In [21]:
# All matches found by the cascade, stacked into one id -> geo_id table.
addr_to_geo = pd.concat(mappings, axis=0)
addr_to_geo.head(3)

Unnamed: 0,id,geo_id,type
21,21,70137.0,s
22,22,142764.0,s
42,42,9048.0,s


In [22]:
# Attach the matched geo_id / type to every register record.
# The key is stated explicitly instead of being inferred from whichever column
# names the two frames happen to share, and validate= makes pandas raise if an
# id ever appears more than once on either side (which would multiply rows).
rsmp_with_geo_ids = rsmp.merge(
    addr_to_geo,
    how="left",
    on="id",
    validate="one_to_one",
)
# Records that no matching strategy could resolve.
rsmp_with_geo_ids[["geo_id"]].isna().sum()

geo_id    2620
dtype: int64

In [23]:
# Output view of the settlements table: columns renamed to the common geodata
# schema, with "type" marking the source table ("s").
geodata_s = (
    settlements[
        [
            "id", "region", "municipality", "settlement", "type",
            "oktmo", "longitude_dd", "latitude_dd",
        ]
    ]
    .rename(
        columns={
            "id": "geo_id",
            "municipality": "area",
            "type": "settlement_type",
            "longitude_dd": "lon",
            "latitude_dd": "lat",
        }
    )
    .assign(type="s")
)
geodata_s.head()

Unnamed: 0,geo_id,region,area,settlement,settlement_type,oktmo,lon,lat,type
0,0,Орловская область,Болховский,Колонтаева,д,54604420000.0,35.91,53.368611,s
1,1,Республика Крым,Алушта,Пушкино,с,35703000000.0,34.340833,44.595833,s
2,2,Липецкая область,Лев-Толстовский район,Барятино,с,42636410000.0,39.503889,53.262778,s
3,3,Тверская область,Селижаровский район,Хилово,д,28650430000.0,33.419167,56.905556,s
4,4,Томская область,Парабельский район,Басмасово,д,69644440000.0,82.044444,58.636667,s


In [24]:
def join_area_and_type(a, t):
    """Build a readable area label from an area name and its abbreviated type.

    "г" gives "Город <a>", "р-н" gives "<a> район", "у" gives "<a> улус";
    any other type returns the name unchanged. Returns ``np.nan`` when either
    argument is missing.
    """
    if pd.isna(a) or pd.isna(t):
        return np.nan

    # Label template per known type abbreviation.
    templates = {
        "г": "Город {}",
        "р-н": "{} район",
        "у": "{} улус",
    }
    template = templates.get(t)
    if template is None:
        return a
    return template.format(a)
    
    
# Output view of the cities table in the common geodata schema.
# reset_index(drop=True) gives every row its own label, so the column
# assignments below stay row-aligned even if `cities` carries repeated labels.
geodata_c = cities[
    [
        "id", "region", "region_type", "area", "area_type", "city", "city_type",
        "settlement", "settlement_type", "oktmo", "geo_lat", "geo_lon",
    ]
].reset_index(drop=True)

# Most specific place name available: settlement, else city, else area.
# It is computed from geodata_c's own columns. The previous version built the
# series from `cities`, renumbered it 0..n-1 and assigned it by label, which
# pairs values with the wrong rows whenever the frame's labels repeat.
geodata_c["settlement"] = (
    geodata_c["settlement"]
    .combine_first(geodata_c["city"])
    .combine_first(geodata_c["area"])
)

# An "area" of type "г" is the city itself and was already used as the
# settlement name above, so it is blanked rather than repeated.
geodata_c.loc[geodata_c["area_type"] == "г", "area"] = np.nan
# Label-based access instead of row[0] / row[1]: positional lookups on a
# label-indexed Series are deprecated in pandas.
geodata_c["area"] = geodata_c[["area", "area_type"]].apply(
    lambda row: join_area_and_type(row["area"], row["area_type"]),
    axis=1,
)

# Map each region spelling of the cities table to the closest spelling used by
# the settlements table. Both unique lists are computed once, outside the loop.
settlement_region_names = geodata_s["region"].unique()
city_region_names = geodata_c["region"].unique()
regions_c_s = pd.DataFrame({
    "region": city_region_names,
    "region_norm": [
        process.extractOne(r, settlement_region_names, scorer=fuzz.token_set_ratio)[0]
        for r in city_region_names
    ],
})

geodata_c = geodata_c.merge(regions_c_s, how="left", on="region")
geodata_c["region"] = geodata_c["region_norm"]
# Source marker ("c") and a fixed settlement type: every row here is a city.
geodata_c["type"] = "c"
geodata_c["settlement_type"] = "г"
geodata_c = geodata_c.rename(
    columns={"id": "geo_id", "geo_lat": "lat", "geo_lon": "lon"}
).drop(columns=["region_type", "area_type", "city", "city_type", "region_norm"])
geodata_c.head(3)

Unnamed: 0,geo_id,region,area,settlement,settlement_type,oktmo,lat,lon,type
0,0,Республика Адыгея,,Адыгейск,г,79703000000.0,44.878414,39.190289,c
1,1,Республика Адыгея,,Майкоп,г,79701000000.0,44.609827,40.100661,c
2,2,Республика Алтай,,Горно-Алтайск,г,84701000000.0,51.958103,85.960324,c


In [25]:
# One geodata table for both sources; columns are aligned by name.
geodata = pd.concat([geodata_c, geodata_s])
geodata.shape

(157050, 9)

In [26]:
# Attach coordinates and normalised place names. geo_id is only unique within
# one source table, so the source marker "type" is part of the key.
rsmp = pd.merge(
    rsmp_with_geo_ids,
    geodata,
    how="left",
    on=["geo_id", "type"],
)
assert len(rsmp) == addr_count
rsmp.head(3)

Unnamed: 0,kind,category,tin,reg_number,first_name,last_name,patronymic,org_name,org_short_name,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type_x,activity_code_main,start_date,end_date,id,geo_id,type,region,area,settlement,settlement_type_y,oktmo,lat,lon
0,0,0,6910020514,1116910001669,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ЦЮУ ""ПРИОРИТЕТ""",69,ТВЕРСКАЯ,ОБЛАСТЬ,,,КИМРЫ,ГОРОД,,,69.1,2020-11-10,2020-11-10,0,949.0,c,Тверская область,,Кимры,г,28726000000.0,56.873321,37.35566
1,1,1,273080245,1100280033897,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,2023-07-10,1,62.0,c,Республика Башкортостан,,Уфа,г,80701000000.0,54.734944,55.957847
2,1,1,276115295,1080276003026,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2023-07-10,2,62.0,c,Республика Башкортостан,,Уфа,г,80701000000.0,54.734944,55.957847


In [27]:
# Missing values per column after geodata was attached.
rsmp.isna().sum()

kind                       0
category                   0
tin                        0
reg_number             64764
first_name             80741
last_name              80740
patronymic             81155
org_name               83161
org_short_name         83357
region_code                1
region_name               19
region_type               19
district_name         137368
district_type         137369
city_name              57806
city_type              57806
settlement_name       144288
settlement_type_x     144288
activity_code_main         1
start_date                 0
end_date                   0
id                         0
geo_id                  2620
type                    2620
region                  2620
area                  140084
settlement             43918
settlement_type_y       2620
oktmo                   2966
lat                     2620
lon                     2620
dtype: int64

In [28]:
# Human-readable original address: the eight address parts joined with " / ",
# with missing parts rendered as empty strings.
raw_address_parts = addresses.loc[:, "region_name":"settlement_type"].fillna("")
addresses["address_raw"] = raw_address_parts.apply(" / ".join, axis=1)

In [29]:
# addresses["region"] was normalised with preprocess_text_column (upper-case
# and Ё -> Е), so the lookup key must go through the same function; a plain
# .str.upper() would leave any key containing "Ё" unmatched.
regions_to_settlements["region"] = preprocess_text_column(
    regions_to_settlements["region"]
)
# Normalising can make two keys identical; keep one row per key so the left
# join below cannot multiply address rows.
regions_to_settlements = regions_to_settlements.rename(
    columns={"region_settlements": "region_norm"}
).drop_duplicates("region")
# Explicit key and column selection: only region_norm is added to addresses.
addresses = addresses.merge(
    regions_to_settlements[["region", "region_norm"]],
    how="left",
    on="region",
)
assert len(addresses) == addr_count
addresses.head(2)

Unnamed: 0,id,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,region,district,city,settlement,region_cities,region_settlements,address_raw,region_norm
0,0,ТВЕРСКАЯ,ОБЛАСТЬ,,,КИМРЫ,ГОРОД,,,ТВЕРСКАЯ ОБЛАСТЬ,,ГОРОД КИМРЫ,,ТВЕРСКАЯ,ТВЕРСКАЯ ОБЛАСТЬ,ТВЕРСКАЯ / ОБЛАСТЬ / / / КИМРЫ / ГОРОД / /,Тверская область
1,1,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,РЕСПУБЛИКА БАШКОРТОСТАН,,ГОРОД УФА,,БАШКОРТОСТАН,РЕСПУБЛИКА БАШКОРТОСТАН,БАШКОРТОСТАН / РЕСПУБЛИКА / / / УФА / ГОРОД ...,Республика Башкортостан


In [30]:
# Bring the raw address string and the fallback region name onto the records.
address_extras = addresses[["id", "address_raw", "region_norm"]]
rsmp = rsmp.merge(address_extras, how="left", on="id")
assert len(rsmp) == addr_count

# Original address parts and matching helpers are removed from the frame.
obsolete_cols = [
    "region_name", "region_type", "district_name", "district_type",
    "city_name", "city_type", "settlement_name", "geo_id",
    "type", "settlement_type_x",
]
rsmp = rsmp.drop(columns=obsolete_cols)
rsmp.head(2)

Unnamed: 0,kind,category,tin,reg_number,first_name,last_name,patronymic,org_name,org_short_name,region_code,activity_code_main,start_date,end_date,id,region,area,settlement,settlement_type_y,oktmo,lat,lon,address_raw,region_norm
0,0,0,6910020514,1116910001669,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ЦЮУ ""ПРИОРИТЕТ""",69,69.1,2020-11-10,2020-11-10,0,Тверская область,,Кимры,г,28726000000.0,56.873321,37.35566,ТВЕРСКАЯ / ОБЛАСТЬ / / / КИМРЫ / ГОРОД / /,Тверская область
1,1,1,273080245,1100280033897,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",2,69.1,2017-04-10,2023-07-10,1,Республика Башкортостан,,Уфа,г,80701000000.0,54.734944,55.957847,БАШКОРТОСТАН / РЕСПУБЛИКА / / / УФА / ГОРОД ...,Республика Башкортостан


In [31]:
# Records without a geodata match still get a region from the fuzzy lookup.
rsmp["region"] = rsmp["region"].fillna(rsmp["region_norm"])

In [32]:
# Drop the merge suffix from the geodata settlement type column.
rsmp = rsmp.rename(columns={"settlement_type_y": "settlement_type"})

In [33]:
# Columns that identify "the same entity at the same place". Records equal on
# all of them differ only in id, raw address and validity dates.
cols_to_check_for_duplicates = [
    "kind",
    "category",
    "tin",
    "reg_number",
    "first_name",
    "last_name",
    "patronymic",
    "org_name",
    "org_short_name",
    "activity_code_main",
    "region",
    "area",
    "settlement",
    "settlement_type",
    "oktmo",
    "lat",
    "lon",
]
# keep=False flags every member of a duplicate group, not only the repeats.
duplicates_indices = rsmp.duplicated(
    subset=cols_to_check_for_duplicates,
    keep=False
)
# Collapse each group into one record covering the whole period.
# - The period is taken as min(start_date) .. max(end_date). The previous
#   "last" end_date (after sorting by start_date) returned the end of the
#   latest-starting record, which truncates the period when an earlier record
#   ends later.
# - NOTE(review): min/max compare the dates as text; the sample output shows
#   YYYY-MM-DD values, for which text order equals date order. Confirm the
#   whole column uses that format.
# - Sorting by (start_date, id) makes the retained id / address_raw
#   deterministic when several records share a start date.
# - dropna=False keeps groups whose key columns contain missing values.
duplicates_cleaned = (
    rsmp.loc[duplicates_indices]
    .sort_values(["start_date", "id"])
    .groupby(cols_to_check_for_duplicates, dropna=False)
    .agg({"id": "first", "address_raw": "first", "start_date": "min", "end_date": "max"})
    .reset_index()
)
duplicates_cleaned.head(3)

Unnamed: 0,kind,category,tin,reg_number,first_name,last_name,patronymic,org_name,org_short_name,activity_code_main,region,area,settlement,settlement_type,oktmo,lat,lon,id,address_raw,start_date,end_date
0,1,1,323403913,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""КРЕД...","ООО ""КРЕДИТ И ЗАКОН""",69.1,Республика Бурятия,,Улан-Удэ,г,81701000000.0,51.833585,107.584222,81441,БУРЯТИЯ / РЕСПУБЛИКА / / / УЛАН-УДЭ / ГОРОД ...,2018-04-10,2021-06-10
1,1,1,572010870,1150572001348.0,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЮРИД...","ООО ""ЮКК ФИНАНСЫ И ПРАВО М.М.""",69.1,Республика Дагестан,,Махачкала,г,82701000000.0,42.984916,47.504718,126738,ДАГЕСТАН / РЕСПУБЛИКА / / / МАХАЧКАЛА / ГОРО...,2020-08-10,2023-07-10
2,1,1,916011804,1190917003430.0,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ПСК""","ООО ""ПСК""",69.1,Карачаево-Черкесская республика,Усть-Джегутинский район,Усть-Джегута,г,91635100000.0,44.083844,41.971105,75125,КАРАЧАЕВО-ЧЕРКЕССКАЯ / РЕСПУБЛИКА / УСТЬ-ДЖЕГУ...,2020-01-10,2022-06-10


In [34]:
# Records that were never duplicated, plus one collapsed record per group.
records_without_duplicates = rsmp.loc[~duplicates_indices]
rsmp = pd.concat([records_without_duplicates, duplicates_cleaned])
rsmp.shape

(159124, 23)

In [35]:
# Columns of the published data product, in output order.
product_cols = [
    "id", "tin", "reg_number", "kind", "category",
    "first_name", "last_name", "patronymic",
    "org_name", "org_short_name", "activity_code_main",
    "region", "area", "settlement", "settlement_type",
    "oktmo", "lat", "lon",
    "address_raw", "start_date", "end_date",
]
rsmp_product = rsmp.loc[:, product_cols]
rsmp_product.isna().sum()

id                         0
tin                        0
reg_number             63433
kind                       0
category                   0
first_name             78824
last_name              78823
patronymic             79229
org_name               80302
org_short_name         80490
activity_code_main         1
region                    20
area                  136244
settlement             43668
settlement_type         2469
oktmo                   2814
lat                     2469
lon                     2469
address_raw                0
start_date                 0
end_date                   0
dtype: int64

In [36]:
# Sanity check: no two product rows share the same identifying columns.
assert not rsmp_product.duplicated(subset=cols_to_check_for_duplicates).any()

In [37]:
# (rows, columns) of the final product table.
rsmp_product.shape

(159124, 21)

In [38]:
# Write the full product table; the DataFrame index is not written out.
rsmp_product.to_csv("rsmp/csv/data_product.csv", index=False)

In [39]:
# Write the first 1000 rows as a small demo file.
rsmp_product.head(1000).to_csv("rsmp/csv/data_product_demo.csv", index=False)