In [14]:
import pandas as pd
import geopandas as gpd
import networkx as nx
from pathlib import Path
from sloyka import Geocoder,TextClassifiers, City_services, EmotionRecognizer
from typing_extensions import  Union, Optional

In [9]:
# Emotion labels that count_emotions() adds to the positive / negative totals;
# any label outside these two lists contributes to neither total.
POSITIVE = ["happiness", "enthusiasm"]
NEGATIVE = ["sadness", "anger", "fear", "disgust"]

In [2]:
# NOTE(review): absolute, machine-specific Windows path - this cell only runs on
# the author's machine; consider resolving it against a configurable data directory.
df = pd.read_feather(r"C:\Projects\ITMO\sloyka\sloyka\sample_data\regional_activity.feather")

In [3]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

Unnamed: 0,toponym,service,Part_users,Part_messages,positive,negative,neutral,interpretation
0,Волгоградская улица,Озеро,1.0,0.006,0.0,0.0,1.0,neutral
1,Волгоградская улица,Автовокзал,1.0,0.006,0.0,0.0,1.0,neutral
2,улица Дзержинского,Кинотеатр,1.0,0.004,0.0,0.0,1.0,neutral
3,улица Победы,Магазин,1.0,0.003,0.0,0.0,1.0,neutral
4,Волжская улица,Памятник,1.0,0.003,0.0,0.0,1.0,neutral


In [11]:
# NOTE(review): absolute, machine-specific Windows path - this cell only runs on
# the author's machine; consider resolving it against a configurable data directory.
# NOTE(review): the get_risks(data) output shows the geometry column as raw bytes,
# presumably WKB because the file is read with pandas rather than geopandas - confirm.
data = pd.read_parquet(r"C:\Projects\ITMO\sloyka\sloyka\src\processed_geodata.parquet")

In [12]:
# Number of rows in the processed dataset.
data.shape[0]

63001

In [10]:
def get_chain_ids(
        name: str,
        data: Union[pd.DataFrame, gpd.GeoDataFrame],
        id_column: str,
        name_column: str,
    ) -> tuple:
    """Collect the identifiers of the post-comment-reply chains built around a value.

    Three groups of rows are selected:

    * posts - rows whose ``name_column`` equals ``name``;
    * comments - rows whose ``post_id`` is one of the post identifiers;
    * replies - rows whose ``parents_stack`` is one of the comment identifiers.

    Comments and replies are kept only when their own ``name_column`` value
    passes ``isin([name, None])``, i.e. it is ``name`` itself or ``None``.

    Args:
        name (str): value in ``name_column`` that selects the chains.
        data (Union[pd.DataFrame, gpd.GeoDataFrame]): data with posts, comments and
            replies; must contain ``id_column``, ``name_column``, ``post_id`` and
            ``parents_stack``.
        id_column (str): column with unique identifiers.
        name_column (str): column the selection is based on.

    Returns:
        tuple: sorted, de-duplicated identifiers of every selected row.
    """

    ids = data[id_column]
    # The same mask filters both comments and replies, so build it once.
    same_or_missing_name = data[name_column].isin([name, None])

    posts_ids = ids.loc[data[name_column] == name].to_list()
    comments_ids = ids.loc[
        data["post_id"].isin(posts_ids) & same_or_missing_name
    ].to_list()
    replies_ids = ids.loc[
        data["parents_stack"].isin(comments_ids) & same_or_missing_name
    ].to_list()

    return tuple(sorted({*posts_ids, *comments_ids, *replies_ids}))

def get_service_counts(
        data: pd.DataFrame, services: list, service_column: str = "City_services"
    ) -> pd.DataFrame:
    """Count in how many rows each service is mentioned.

    A row counts for a service when ``service in row_value`` is true for the
    value stored in ``service_column``; each row is counted at most once per
    service.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        services (list): The services to count.
        service_column (str, optional): The name of the column containing the
            services. Defaults to 'City_services'.

    Returns:
        pd.DataFrame: A DataFrame with the columns 'service' and 'counts', one row
        per entry of ``services`` in the given order.
    """

    # Fetch the column once instead of re-indexing it for every (service, row) pair.
    rows = list(data[service_column])

    res = [
        [service, sum(1 for row_services in rows if service in row_services)]
        for service in services
    ]

    return pd.DataFrame(res, columns=["service", "counts"])

def get_service_ids(
        data: pd.DataFrame,
        service: str,
        service_column: str = "City_services",
        id_column: str = "id",
    ) -> list:
    """Return the identifiers of the rows that mention a service.

    A row matches when ``service in row_value`` is true for the value stored in
    ``service_column``. Identifiers are returned in row order.

    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        service (str): The service to search for.
        service_column (str, optional): The name of the column containing the
            services. Defaults to 'City_services'.
        id_column (str, optional): The name of the column containing the IDs.
            Defaults to 'id'.

    Returns:
        list: IDs of the rows whose services contain ``service``.
    """

    identifiers = data[id_column]

    return [
        identifiers.iloc[position]
        for position, row_services in enumerate(data[service_column])
        if service in row_services
    ]

In [31]:
def get_risks(
        processed_data: Optional[gpd.GeoDataFrame] = None,
        top_n: int = 5,
    ) -> pd.DataFrame:
    """Build a table of user attitude towards services around the most mentioned toponyms.

    For each of the ``top_n`` most frequent values of ``only_full_street_name`` the
    post-comment-reply chain around that toponym is collected with
    ``get_chain_ids``. Within the chain the ``top_n`` most mentioned services are
    selected, and for every service each user is classified by comparing the
    positive and negative totals returned by ``count_emotions``.

    Args:
        processed_data (Optional[gpd.GeoDataFrame]): processed messages. The columns
            read here are ``id``, ``from_id``, ``only_full_street_name``,
            ``City_services``, ``emotion_average``, ``group_name`` and ``geometry``
            (``get_chain_ids`` additionally reads ``post_id`` and ``parents_stack``).
            Required despite the ``None`` default.
        top_n (int, optional): The number of most mentioned toponyms to process and
            also the maximum number of services reported per toponym. Defaults to 5.

    Returns:
        pd.DataFrame: one row per (toponym, service) pair with the columns

        * ``toponym``, ``service``;
        * ``Part_users`` / ``Part_messages`` - share of distinct ``from_id`` / ``id``
          values of the whole dataset that belong to the toponym chain;
        * ``positive`` / ``negative`` / ``neutral`` - share of the service's users
          with more positive, more negative, or equally many messages;
        * ``interpretation`` - label produced by ``interpretate_coef``;
        * ``geometry`` - first non-null geometry of the chain, or ``None`` when the
          chain has no geometry.

    Raises:
        ValueError: if ``processed_data`` is not provided.
    """

    if processed_data is None:
        raise ValueError("processed_data must be provided")

    top_n_toponyms = (
        processed_data["only_full_street_name"].value_counts().index[:top_n]
    )

    # The denominators do not depend on the toponym, so compute them once.
    total_users = len(processed_data["from_id"].unique())
    total_messages = len(processed_data["id"].unique())

    columns = [
        "toponym",
        "service",
        "Part_users",
        "Part_messages",
        "positive",
        "negative",
        "neutral",
        "interpretation",
        "geometry",
    ]
    risks = []

    for toponym in top_n_toponyms:
        all_ids = get_chain_ids(
            name=toponym,
            data=processed_data,
            id_column="id",
            name_column="only_full_street_name",
        )
        toponym_data = processed_data.loc[processed_data["id"].isin(all_ids)]

        # A chain without any geometry yields None instead of raising IndexError.
        geometries = toponym_data["geometry"].dropna()
        geom = geometries.iloc[0] if len(geometries) else None

        part_users = len(toponym_data["from_id"].unique()) / total_users
        part_messages = len(toponym_data["id"].unique()) / total_messages

        # Sort the distinct services so that ties in the rating below are broken
        # the same way on every run (set iteration order depends on the hash seed).
        services = sorted(
            {
                service
                for row_services in toponym_data["City_services"]
                for service in row_services
            },
            key=str,
        )
        if not services:
            continue

        services_rating = (
            get_service_counts(data=toponym_data, services=services)
            # mergesort is stable, so equally frequent services keep sorted order.
            .sort_values(by="counts", ascending=False, kind="mergesort")[:top_n]["service"]
            .to_list()
        )

        for service in services_rating:
            service_ids = get_service_ids(data=toponym_data, service=service)
            service_data = toponym_data.loc[toponym_data["id"].isin(service_ids)]
            users_id = service_data["from_id"].unique()

            users_pos = 0
            users_neg = 0
            users_neu = 0

            for user in users_id:
                user_data = service_data.loc[service_data["from_id"] == user]
                # Number of rows with a non-null group_name per emotion label.
                grouped = user_data.groupby("emotion_average")["group_name"].count()

                neg, pos = count_emotions(grouped)

                if pos > neg:
                    users_pos += 1
                elif pos < neg:
                    users_neg += 1
                else:
                    users_neu += 1

            positive_coef = users_pos / len(users_id)
            negative_coef = users_neg / len(users_id)
            neutral_coef = users_neu / len(users_id)

            interpretation = interpretate_coef(
                negative_coef, positive_coef, neutral_coef
            )

            risks.append(
                [
                    toponym,
                    service,
                    part_users,
                    part_messages,
                    positive_coef,
                    negative_coef,
                    neutral_coef,
                    interpretation,
                    geom,
                ]
            )

    return pd.DataFrame(risks, columns=columns)
    
def interpretate_coef(negative_coef: float, positive_coef: float, neutral_coef: float) -> str:
    """Turn the three user shares into a one-word recommendation.

    Args:
        negative_coef (float): share of users with a negative attitude.
        positive_coef (float): share of users with a positive attitude.
        neutral_coef (float): share of users with a neutral attitude.

    Returns:
        str: ``"reorganize"`` when the negative share is larger, ``"keep"`` when the
        positive share is larger, ``"controversial"`` when both shares are equal and
        non-zero while the neutral share is also non-zero, ``"neutral"`` otherwise.
    """
    if negative_coef > positive_coef:
        return "reorganize"
    if negative_coef < positive_coef:
        return "keep"

    balanced = negative_coef == positive_coef
    if balanced and negative_coef != 0 and neutral_coef != 0:
        return "controversial"

    return "neutral"

def count_emotions(grouped):
    """Sum the per-emotion counts into negative and positive totals.

    Args:
        grouped: mapping-like object with an ``index`` of emotion labels and a
            count per label (``get_risks`` passes the result of a pandas
            ``groupby(...).count()``).

    Returns:
        tuple: ``(neg, pos)`` - note the order: the total for labels listed in
        ``NEGATIVE`` first, then the total for labels listed in ``POSITIVE``.
        Labels in neither list are ignored.
    """

    pos = 0
    neg = 0

    # Every label comes from grouped.index, so no extra membership check is needed.
    for emotion in grouped.index:
        if emotion in POSITIVE:
            pos += grouped[emotion]
        elif emotion in NEGATIVE:
            neg += grouped[emotion]

    return neg, pos

# Social-risk table for the five most mentioned toponyms of the loaded dataset.
get_risks(processed_data=data, top_n=5)

Unnamed: 0,toponym,service,Part_users,Part_messages,positive,negative,neutral,interpretation,geometry
0,улица Ленина,Магазин,0.014246,0.004647,0.2,0.0,0.8,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
1,улица Ленина,Фонтан,0.014246,0.004647,0.0,0.0,1.0,neutral,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
2,улица Ленина,Остановка,0.014246,0.004647,0.333333,0.0,0.666667,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
3,улица Ленина,Памятник,0.014246,0.004647,0.25,0.0,0.75,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
4,улица Ленина,Университет,0.014246,0.004647,0.0,0.0,1.0,neutral,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
5,проспект Ленина,Магазин,0.014246,0.004647,0.2,0.0,0.8,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
6,проспект Ленина,Фонтан,0.014246,0.004647,0.0,0.0,1.0,neutral,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
7,проспект Ленина,Остановка,0.014246,0.004647,0.333333,0.0,0.666667,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
8,проспект Ленина,Памятник,0.014246,0.004647,0.25,0.0,0.75,keep,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
9,проспект Ленина,Университет,0.014246,0.004647,0.0,0.0,1.0,neutral,b'\x01\x01\x00\x00\x00\xa2\xfd\xfe\x83\xfe<F@/...
