In [1]:
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
# Register tqdm's progress-aware methods on pandas objects; `progress_map`
# is used on the text column further down.
tqdm.pandas()

# Model identifier handed to `transformers.pipeline` as `model=`
# (presumably a Hugging Face Hub repository id — confirm).
REP_ID = "Sandrro/cc-model"

def category_predictor(text, classifier):
    """Return the three best labels and their scores for ``text``.

    Args:
        text: Input forwarded unchanged to ``classifier``.
        classifier: Callable invoked as ``classifier(text, top_k=3)``. Its
            result must be accepted by ``pd.DataFrame`` and produce ``label``
            and ``score`` columns.

    Returns:
        A two-element list ``[labels, scores]``. Each element is a single
        ``', '``-joined string, in the order the classifier returned the
        predictions; scores are rounded to three decimals.
    """
    top3 = pd.DataFrame(classifier(text, top_k=3))
    # Reset the counter attribute on the classifier after every call.
    # NOTE(review): presumably this keeps the pipeline's "sequential calls"
    # warning from firing — confirm against the transformers version in use.
    classifier.call_count = 0
    labels = top3['label'].tolist()
    scores = top3['score'].round(3).astype(str).tolist()
    return [', '.join(labels), ', '.join(scores)]

def classify_text(path_to_excel, text_column='Текст', device=0, tokenizer=None):
    """Classify every text in an Excel sheet and return the annotated frame.

    Args:
        path_to_excel: Path (or buffer) accepted by ``pd.read_excel``.
        text_column: Name of the column holding the texts to classify.
        device: Device argument forwarded to ``transformers.pipeline``.
            The default ``0`` keeps the original behaviour; pass another
            value to run on a different device.
        tokenizer: Tokenizer identifier forwarded to ``transformers.pipeline``.
            ``None`` (default) uses ``REP_ID``, as the original code did.

    Returns:
        The loaded DataFrame with four extra columns:
        ``cats`` / ``probs`` (``', '``-joined top-3 labels / scores from
        ``category_predictor``) and ``category`` / ``probability`` (the first
        label and its score as ``float``).
    """
    classifier = pipeline(
        "text-classification",
        model=REP_ID,
        tokenizer=REP_ID if tokenizer is None else tokenizer,
        max_length=2048,
        truncation=True,
        device=device,
    )
    df_predict = pd.read_excel(path_to_excel)

    predictions = df_predict[text_column].progress_map(
        lambda text: category_predictor(text, classifier)
    )
    # Column assignment from a DataFrame aligns on the index, so build the
    # prediction frame on the source index instead of a fresh RangeIndex.
    # Naming the columns also keeps an empty sheet from raising on assignment.
    df_predict[['cats', 'probs']] = pd.DataFrame(
        predictions.to_list(), index=df_predict.index, columns=['cats', 'probs']
    )

    # The first entry of each joined string is the top-ranked prediction.
    df_predict['category'] = df_predict['cats'].map(lambda joined: joined.split(', ')[0])
    df_predict['probability'] = df_predict['probs'].map(lambda joined: float(joined.split(', ')[0]))

    return df_predict

In [2]:
import geopandas as gpd
import numpy as np

In [3]:
# Load the input file with geopandas.
# NOTE(review): `path_to_file` is not defined in any cell visible in this part
# of the notebook, so this raises NameError on a fresh kernel — define it
# (e.g. in a config cell) before this cell.
df_predict = gpd.read_file(path_to_file)

In [4]:
# Label the data as EPSG:32636 (replacing any CRS it already carries), then
# reproject to EPSG:4326.
# NOTE(review): not idempotent — re-running this cell relabels the already
# reprojected coordinates as EPSG:32636 and transforms them a second time.
df_predict = df_predict.set_crs(32636, allow_override=True).to_crs(4326)

In [5]:
# Use the column name the classification code expects.
df_predict = df_predict.rename(columns={'Текст комментария': 'Текст'})

In [6]:
# Build the text-classification pipeline: model weights from REP_ID, tokenizer
# from 'cointegrated/rubert-tiny2'.
classifier = pipeline(
    "text-classification",
    model=REP_ID,
    tokenizer='cointegrated/rubert-tiny2',
    max_length=2048,
    truncation=True,
    device=0,
)

# One [labels, scores] pair per row, with a progress bar.
top3_per_row = df_predict['Текст'].progress_map(lambda text: category_predictor(text, classifier))
df_predict[['cats', 'probs']] = pd.DataFrame(top3_per_row.to_list())

# The first entry of each ', '-joined string is the top-ranked prediction.
df_predict['category'] = df_predict['cats'].map(lambda joined: joined.split(', ')[0])
df_predict['probability'] = df_predict['probs'].map(lambda joined: float(joined.split(', ')[0]))

100%|██████████| 2500/2500 [00:31<00:00, 80.01it/s] 


In [7]:
import requests
import osm2geojson
# --- Fetch the city's administrative boundaries from Overpass ---------------
OVERPASS_URL = "https://overpass-api.de/api/interpreter"
city_name = 'Санкт-Петербург'
city_admin_level = 5
# Buffer distance applied while the geometries are in EPSG:32636; 0 disables it.
buffer_size = 5
overpass_query = f"""
                            [out:json];
                                    area['name'='{city_name}']->.searchArea;
                                    (
                                    relation["admin_level"="{city_admin_level}"](area.searchArea);
                                    );
                            out geom;
                            """
response = requests.get(OVERPASS_URL, params={"data": overpass_query}, timeout=600)
# Fail with the HTTP error itself rather than a JSON decoding error on the
# error page.
response.raise_for_status()
result = response.json()
resp = osm2geojson.json2geojson(result)
if not resp["features"]:
    # Without this guard `entity_geometry` would be undefined (or stale from a
    # previous run) in the statements below.
    raise ValueError(
        f"Overpass returned no features for {city_name!r} "
        f"with admin_level={city_admin_level}"
    )

entity_geometry = gpd.GeoDataFrame.from_features(resp["features"]).set_crs(4326).to_crs(32636)
entity_geometry = entity_geometry[["id", "geometry"]]
# Keep polygonal and linear features only; linear ones are turned into
# polygons by the buffer below (when buffer_size is non-zero).
entity_geometry = entity_geometry.loc[entity_geometry["geometry"].geom_type.isin([
    "Polygon", "MultiPolygon", "LineString", "MultiLineString"])].copy()  # copy: the column is reassigned below
# Buffer geometry in case of line-kind objects like waterways, roads or railways
if buffer_size:
    entity_geometry["geometry"] = entity_geometry["geometry"].buffer(buffer_size)

# Merge all retained features into a single-row GeoDataFrame.
city_geometry = entity_geometry.dissolve()

In [8]:
import osmnx as ox
# Download the drivable street network inside the (single) city polygon.
road_graph = ox.graph_from_polygon(city_geometry.to_crs(4326).geometry.item(), network_type="drive")
# Use the top-level `ox.graph_to_gdfs`: the `utils_graph` submodule is not
# available in every OSMnx release.
roads_geometry = ox.graph_to_gdfs(road_graph, nodes=False)
# Move the first two index levels into columns, discard what is left of the
# index, and reproject to EPSG:32636.
roads_geometry = roads_geometry.reset_index(level=[0, 1]).reset_index(drop=True).to_crs(32636)
roads_geometry = roads_geometry[["geometry"]]
# Buffered copy of every edge (distance 5 in EPSG:32636), stored in EPSG:4326.
roads_geometry_buffer = roads_geometry.copy()
roads_geometry_buffer["geometry"] = roads_geometry_buffer["geometry"].buffer(5).to_crs(4326)

In [9]:
# Pair every buffered road with the records that spatially match it.
street_geom = roads_geometry_buffer.sjoin(df_predict)
# Attach the unbuffered road line (EPSG:4326) as `geometry_line`.
street_geom = street_geom.join(
    roads_geometry['geometry'].to_crs(4326),
    rsuffix='_line',
)
# Re-key the result by the index of the matched `df_predict` row.
street_geom.index = street_geom.index_right

In [10]:
# Continue with a plain DataFrame and attach the matched road line, joined on
# the row index.
df_predict = pd.DataFrame(df_predict).join(street_geom['geometry_line'])

In [11]:
import json
import shapely

In [12]:
# Renumber the rows 0..n-1, then expose that numbering as an `index` column.
df_predict = df_predict.reset_index(drop=True)
df_predict = df_predict.reset_index()

In [13]:
# Serialise every geometry to a GeoJSON string.
df_predict['geometry'] = df_predict['geometry'].map(
    lambda geom: json.dumps(shapely.geometry.mapping(geom))
)

In [14]:
# Serialise the road lines to GeoJSON strings; rows without a matched road
# keep a missing value after the join.
has_line = df_predict['geometry_line'].notna()
line_geojson = (
    gpd.GeoDataFrame(df_predict[has_line], geometry='geometry_line')['geometry_line']
    .map(lambda line: json.dumps(shapely.geometry.mapping(line)))
)
df_predict = df_predict.drop(columns='geometry_line').join(line_geojson)

In [15]:
# Where a road line was matched, it replaces the record's own geometry.
has_line = df_predict['geometry_line'].notna()
df_predict.loc[has_line, 'geometry'] = df_predict['geometry_line']

In [16]:
# Records without an address get no geometry.
no_address = df_predict['address'].isna()
df_predict.loc[no_address, 'geometry'] = None

In [17]:
# Turn the ', '-joined label string into a list of labels.
df_predict = df_predict.assign(cats=df_predict['cats'].str.split(pat=', '))

In [18]:
# Turn the ', '-joined score string into a list of score strings.
df_predict = df_predict.assign(probs=df_predict['probs'].str.split(pat=', '))

In [19]:
# Keep only the columns that go into the exported records.
output_columns = ['index', 'Дата и время', 'Текст', 'address', 'cats', 'probs', 'geometry']
df_predict = df_predict[output_columns]

In [20]:
# Switch to the field names used in the exported JSON.
df_predict = df_predict.rename(columns={
    'index': 'id',
    'Дата и время': 'created_at',
    'Текст': 'text',
    'address': 'street_name',
    'cats': 'name',
    'probs': 'probability',
})

In [21]:
# Work on an independent copy (deep copy is the default).
df_predict = df_predict.copy()

In [22]:
# One location entry per distinct (id, created_at, text) record, each with a
# fixed probability of 0.7.
rec_locs = (
    df_predict
    .drop_duplicates(subset=['id', 'created_at', 'text'])
    [['street_name', 'geometry']]
    .assign(probability=0.7)
)

In [23]:
# One row per (record, predicted label): expand the two list columns together.
df_predict = df_predict.explode(['name', 'probability'])

In [25]:
# Scores are still strings after the split/explode; make them numeric.
df_predict = df_predict.astype({'probability': float})

In [26]:
# Collapse the exploded rows back to one row per record; the predictions
# become a list of {'name', 'probability'} dicts in `recognition_blocks`.
# dropna=False keeps records whose `created_at` or `text` is missing: by
# default groupby drops such groups, which would leave this frame with fewer
# rows than `rec_locs`.
records = df_predict.groupby(['id', 'created_at', 'text'], dropna=False)
df_predict = (
    records
    .apply(lambda rows: rows[['name', 'probability']].to_dict(orient='records'))
    .reset_index()
    .rename(columns={0: 'recognition_blocks'})
)

In [27]:
# Every record gets its own, initially empty, list of themes.
df_predict['recognition_themes'] = [list() for _ in range(len(df_predict))]

In [28]:
# Wrap each location record in a one-element list.
# NOTE(review): the assignment is positional — it assumes `rec_locs` has the
# same number of rows, in the same order, as `df_predict`; confirm.
location_records = rec_locs.to_dict(orient='records')
df_predict['recognition_locations'] = [[location] for location in location_records]

In [29]:
# Constant values for the three "approved" fields of every record.
df_predict = df_predict.assign(
    approved_block='block',
    approved_theme='theme',
    approved_location=None,
)

In [31]:
# Keep the first row of every (text, created_at) pair.
df_predict = df_predict.drop_duplicates(subset=['text', 'created_at'])

In [32]:
import json

In [34]:
# Write the records to disk as a JSON array of objects.
df_predict.to_json(path_or_buf='recognition_example.json', orient='records')

In [35]:
# Round-trip through JSON to display the records exactly as they are exported.
records_json = df_predict.to_json(orient='records')
json.loads(records_json)

[{'id': 0,
  'created_at': '2022.12.13 14:34',
  'text': '[club143265175|Центральный район Санкт-Петербурга], невский 173-175',
  'recognition_blocks': [{'name': 'Не ЦУР', 'probability': 0.357},
   {'name': 'Благоустройство', 'probability': 0.234},
   {'name': 'БОС', 'probability': 0.215}],
  'recognition_themes': [],
  'recognition_locations': [{'street_name': 'невский 173-175',
    'geometry': '{"type": "Point", "coordinates": [30.382917999999997, 59.92417199999997]}',
    'probability': 0.7}],
  'approved_block': 'block',
  'approved_theme': 'theme',
  'approved_location': None},
 {'id': 1,
  'created_at': '2019.09.27 10:27',
  'text': 'Невский 184 отопления НЕТ Аварий никаких нет 😕',
  'recognition_blocks': [{'name': 'ЖКХ', 'probability': 0.703},
   {'name': 'Энергетика', 'probability': 0.269},
   {'name': 'Дороги', 'probability': 0.008}],
  'recognition_themes': [],
  'recognition_locations': [{'street_name': 'Невский 184',
    'geometry': '{"type": "Point", "coordinates": [30.384