# Detect chat languages

A notebook to detect the language of a chat. For each chat we use the text of the first 100 messages in which the `text` or the `caption` field is not empty.

The language is only detected for chats whose download has been started, i.e. chats with the status `finished`, `running` or `error`.


In [1]:
from dotenv import load_dotenv

import psycopg2

from telegram_data_models import Message, Chat, MessageTextContent, Queue
from telegram_quality_control.db import get_conn_string
from telegram_quality_control.cleaning import clean_text, batch_clean_text

from sqlalchemy import DDL, select, text, func, create_engine
from sqlalchemy.sql import over

from pathlib import Path

import dask.dataframe as dd
from dask.distributed import LocalCluster
import pandas as pd

import numpy as np
import glob
import re
import os
import itertools

from tqdm import tqdm

from fast_langdetect import detect

from dotenv import load_dotenv

# Load OUTPUT_FOLDER / SCRATCH_FOLDER (and any other settings) from the .env file.
load_dotenv()

output_folder = Path(os.environ["OUTPUT_FOLDER"])
scratch_folder = Path(os.environ["SCRATCH_FOLDER"]) / "languages"
# parents=True: scratch_folder is a subfolder, so its parent may not exist yet.
output_folder.mkdir(parents=True, exist_ok=True)
scratch_folder.mkdir(parents=True, exist_ok=True)

# Final result file: one row per chat with the detected language.
destination_file = output_folder / "language_downloaded_chats.parquet"

# Maximum number of messages per chat that go into the text sample.
num_messages = 100
# Parts of a chat that make up the text sample (see generate_text_sample).
columns_to_use = ["messages"]

In [2]:
# Database connection string (presumably built from the `.env` file passed here);
# it is later handed to pandas (`read_sql_query`) and SQLAlchemy (`create_engine`).
db_url = get_conn_string(".env")

# Dask can't work with ORM models, so the queries below are written against
# the underlying table objects instead.
message_table = Message.__table__
content_table = MessageTextContent.__table__
chat_table = Chat.__table__
queue_table = Queue.__table__

In [7]:
# Load the download status of every queued chat. The query result is cached
# in the scratch folder, so the database is only hit when the cache is missing.
chat_status_file = scratch_folder / "chat_status.parquet"
if not chat_status_file.exists():
    sql = select(queue_table.c.chat_id, queue_table.c.status)
    (
        pd.read_sql_query(sql, db_url, index_col="chat_id")
        .sort_index()
        .reset_index(drop=False)
        .to_parquet(chat_status_file)
    )

# Keep only the chats whose download has been started.
chat_status = (
    pd.read_parquet(chat_status_file)
    .set_index("chat_id")
    .loc[lambda df: df["status"].isin(["finished", "running", "error"])]
)
display(chat_status)

# Sorted list of the chat ids that still have to be processed.
chat_ids = sorted(chat_status.index)

chat_ids

Unnamed: 0_level_0,status
chat_id,Unnamed: 1_level_1
1,finished
2,finished
3,finished
4,finished
5,finished
...,...
48813064,error
48813093,finished
48813141,error
48813198,finished


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 162,
 163,
 164,
 165,
 170,
 171,
 172,
 173,
 174,
 175,
 177,
 180,
 181,
 185,
 187,
 188,
 189,
 191,
 193,
 194,
 195,
 196,
 198,
 200,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 212,
 213,
 215,
 216,
 217,
 218,
 222,
 226,
 229,
 230,
 231,
 233,
 235,
 236,
 237,
 238,
 240,
 241,
 243,
 244,
 245,
 246,
 247,
 248,
 250,
 252,
 253,
 255,
 257,
 258,
 259,
 260,
 261,
 262,
 263,
 264,
 266,
 267,
 270,
 272,
 275,
 276

In [8]:
def detect_lang(text):
    """Detect the language of a text sample.

    Args:
        text (str | None): the text to classify. ``None``, NaN and the empty
            string are treated as "no text".

    Returns:
        dict: a mapping with the keys ``lang`` and ``score``. ``lang`` is
            ``"no_str"`` when there is no text and ``"error"`` when the
            detector raised an exception; ``score`` is ``1.0`` in both of
            these cases. Otherwise the first result of ``detect`` is
            returned unchanged.
    """
    # Every branch returns a plain dict, so callers always get the same type.
    if text is None or pd.isna(text) or len(text) == 0:
        return dict(lang="no_str", score=1.0)

    try:
        return detect(text=text, model='full')[0]
    except Exception:
        # Deliberately broad: one undetectable sample must not abort a whole
        # batch, so the failure is recorded as the pseudo-language "error".
        return dict(lang="error", score=1.0)


def download_messages(chat_ids, message_limit):
    """Fetch the first messages with text content for each of the given chats.

    Only messages whose ``text`` or ``caption`` is not NULL are considered.
    Within each chat they are numbered in order of their message id, and only
    the first ``message_limit`` of them are returned.

    Args:
        chat_ids (list): ids of the chats whose messages are fetched
        message_limit (int): maximum number of messages returned per chat

    Returns:
        message_df: a dataframe indexed by message ``id`` with the columns
            ``chat_id`` and ``content`` (``text`` followed by ``caption``,
            missing values replaced by empty strings)
    """
    has_text = content_table.c.text.isnot(None)
    has_caption = content_table.c.caption.isnot(None)

    # Running counter per chat; message ids of one chat are not consecutive,
    # so the position has to be computed with a window function.
    position_in_chat = (
        func.row_number()
        .over(partition_by=message_table.c.chat_id, order_by=message_table.c.id)
        .label("row_number")
    )

    numbered_messages = (
        select(
            message_table.c.id,
            message_table.c.chat_id,
            content_table.c.text,
            content_table.c.caption,
            position_in_chat,
        )
        .join(content_table, message_table.c.id == content_table.c.message_id)
        .where(message_table.c.chat_id.in_(chat_ids))
        .where(has_text | has_caption)
        .subquery()
    )

    # Keep only the first `message_limit` numbered messages of every chat.
    sql = select(numbered_messages).where(
        numbered_messages.c.row_number <= message_limit
    )

    # The message id is the index; chat_id stays a regular column.
    message_df = pd.read_sql_query(sql, db_url, index_col="id")

    text_part = message_df["text"].fillna("")
    caption_part = message_df["caption"].fillna("")
    message_df["content"] = text_part + caption_part

    return message_df.drop(columns=["text", "caption", "row_number"])


def concat(series):
    """Join all strings of a pandas series into one string, separated by ". "."""
    separator = ". "
    joined = series.str.cat(sep=separator)
    return joined


def aggregate_chat_messages(message_partition):
    """Collapse the messages of each chat into a single long string.

    Returns a series indexed by ``chat_id`` whose values are the ``content``
    entries of that chat joined by ``concat``; missing results become "".
    """
    content_by_chat = message_partition.groupby("chat_id")["content"]
    merged = content_by_chat.apply(concat)
    return merged.fillna("")


def generate_text_sample(chat_df, columns=("title", "description", "messages")):
    """Return a copy of ``chat_df`` with a "text_sample" column that will be labeled.

    Args:
        chat_df: dataframe indexed by chat id. It must contain the columns
            "title" and/or "description" if those are listed in ``columns``.
        columns: which parts go into the sample, in this fixed order:
            "title", "description", then "messages" (the first
            ``num_messages`` messages of the chat, fetched from the database).

    Returns:
        A new dataframe with the cleaned "text_sample" column added. The
        input dataframe is left untouched.
    """
    # Work on a copy: the function is also used as a dask partition function,
    # and its input must not be modified in place.
    chat_df = chat_df.copy()

    chat_df["text_sample"] = ""
    if "title" in columns:
        chat_df["text_sample"] += chat_df["title"].fillna("") + ". "
    if "description" in columns:
        chat_df["text_sample"] += chat_df["description"].fillna("") + ". "

    if "messages" in columns:
        chat_ids = list(chat_df.index)
        message_df = download_messages(chat_ids, num_messages)
        chat_texts = aggregate_chat_messages(message_df)
        chat_texts.name = "messages"
        # Left join on the chat id: chats without any message keep their row
        # and get an empty message part.
        chat_df = pd.merge(chat_df, chat_texts, how='left', left_index=True, right_index=True)
        chat_df['text_sample'] += chat_df["messages"].fillna('')
        chat_df = chat_df.drop('messages', axis=1)

    # clean the sample
    chat_df["text_sample"] = batch_clean_text(chat_df["text_sample"])

    return chat_df

In [9]:
# Small hand-made sample of chats, indexed by chat id, to try out the text sampling.
chat_df = pd.DataFrame(
    {
        "title": ["first", "second", "sixth", "130", "three", "queued"],
        "description": [np.nan, "uiae", "uiaen", "nrtddrtn", np.nan, np.nan],
    },
    index=pd.Index([1, 2, 6, 130, 3, 758], name="id"),
)

chat_df = generate_text_sample(chat_df, columns=["messages"])
chat_df

: 

: 

In [None]:
# Quick check of the language detector on an English, a Russian and a Chinese sentence.
for sample in (
    "Here is some text in a foreign language",
    "А вот текст на другом языке.",
    "以下是一些中文文本",
):
    print(detect(sample))

[{'lang': 'en', 'score': 0.946763813495636}]
[{'lang': 'ru', 'score': 0.8967373967170715}]
[{'lang': 'zh', 'score': 1.0}]


## Parallelized part


There are two levels of subdivision: chunks and crumbs. A chunk is composed of crumbs, and each crumb corresponds to one dask partition. Both are defined in terms of chat_ids.


In [None]:
# Number of chat ids per chunk; every chunk is computed in one go and written
# to its own parquet file in the scratch folder.
chunk_size = 2000
# Number of chat ids per crumb, i.e. the number of rows per dask partition.
crumb_size = 20
# Number of workers of the local dask cluster.
num_workers = 4

In [None]:
# Start a local dask cluster with `num_workers` single-threaded workers.
cluster = LocalCluster(
    n_workers=num_workers,
    threads_per_worker=1,
    memory_limit="4GB",
    death_timeout=100,
)
client = cluster.get_client()
# Last expression of the cell: shows the link to the dask dashboard.
client.dashboard_link
# cluster.scale(32)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45937 instead


'http://127.0.0.1:45937/status'

In [None]:
# Empty result frame with the expected columns; used if nothing was saved yet.
chat_lang = pd.DataFrame(
    {
        "chat_id": pd.Series(dtype='int64'),
        "lang": pd.Series(dtype=str),
        "score": pd.Series(dtype='float64'),
        "status": pd.Series(dtype=str),
        "columns_used": pd.Series(dtype=str),
    },
)

if destination_file.exists():
    chat_lang = pd.read_parquet(destination_file)

# Merge the chunks that a previous run left in the scratch folder into the
# destination file. Chunk files are named chunk_{first_chat_id}-{last_chat_id}.parquet.
files = sorted(glob.glob(str(scratch_folder / "chunk_*-*.parquet")))
if files:
    # Concatenate once instead of growing the frame file by file.
    chat_lang = pd.concat([chat_lang, *(pd.read_parquet(file) for file in files)])

chat_lang.to_parquet(destination_file)

# Delete the chunk files only after the merged result is on disk, so that a
# failed write cannot lose already computed chunks.
for file in files:
    os.remove(file)

# Skip the chats that already have a result.
finished_ids = set(chat_lang['chat_id'])
chat_ids = [chat_id for chat_id in chat_ids if chat_id not in finished_ids]

# Ceiling division: no extra chunk is counted when the number of remaining
# chats is an exact multiple of chunk_size (or zero).
num_chunks = (len(chat_ids) + chunk_size - 1) // chunk_size

print(f"{len(chat_lang)} chats processed, {len(chat_ids)} chats remaining")
display(chat_lang)
print(chat_ids[:20])

1624250 chats processed, 0 chats remaining


Unnamed: 0,chat_id,lang,score,status,columns_used
0,1,en,0.875085,finished,messages
1,2,ru,0.987296,finished,messages
2,3,en,0.880803,finished,messages
3,4,ru,0.995698,finished,messages
4,5,ru,0.996825,finished,messages
...,...,...,...,...,...
1995,11311486,no_str,1.000000,error,messages
1996,11311689,no_str,1.000000,error,messages
1997,11311705,no_str,1.000000,error,messages
1998,11311761,no_str,1.000000,error,messages


[]


In [None]:
# Iterate over the remaining chat ids chunk by chunk. Every chunk is written
# to its own parquet file in the scratch folder, so an interrupted run can be
# resumed without recomputing the chunks that are already finished.

# The context manager closes the progress bar even if a chunk fails.
with tqdm(total=num_chunks) as pbar:
    for chat_id_chunk in itertools.batched(chat_ids, chunk_size):
        # batched() yields tuples; .loc below needs a list of labels.
        chat_id_chunk = list(chat_id_chunk)
        first_chat_id = chat_id_chunk[0]
        last_chat_id = chat_id_chunk[-1]

        chunk_file = scratch_folder / f"chunk_{first_chat_id}-{last_chat_id}.parquet"

        # One dask partition ("crumb") per crumb_size chats.
        chat_df = dd.from_pandas(chat_status.loc[chat_id_chunk], chunksize=crumb_size)

        # Build the text sample of every chat, partition by partition.
        chat_df = chat_df.map_partitions(
            lambda df: generate_text_sample(df, columns=columns_to_use)
        )

        # Detect the language per chat; the result is expanded into the
        # columns "lang" and "score".
        chat_lang = chat_df.apply(
            lambda row: detect_lang(row["text_sample"]),
            axis=1,
            result_type='expand',
            meta={'lang': 'object', 'score': 'float64'},
        )

        chat_lang["status"] = chat_df["status"]
        chat_lang["columns_used"] = ", ".join(columns_to_use)
        chat_lang = chat_lang.compute()

        # Store the chat id as a regular column instead of the index.
        chat_lang["chat_id"] = chat_lang.index
        chat_lang = chat_lang.reset_index(drop=True)

        chat_lang.to_parquet(chunk_file)

        pbar.update(1)

 22%|█████████████████▌                                                               | 176/812 [1:51:45<4:39:14, 26.34s/it]

### Write the languages into the database


In [3]:
# Reload the final results; the download status is not written to the database.
chat_lang = pd.read_parquet(destination_file).drop(columns=["status"])
chat_lang

Unnamed: 0,chat_id,lang,score,columns_used
0,1,en,0.875085,messages
1,2,ru,0.987296,messages
2,3,en,0.880803,messages
3,4,ru,0.995698,messages
4,5,ru,0.996825,messages
...,...,...,...,...
1995,11311486,no_str,1.000000,messages
1996,11311689,no_str,1.000000,messages
1997,11311705,no_str,1.000000,messages
1998,11311761,no_str,1.000000,messages


In [None]:
from telegram_quality_control.chat_language import ChatLanguage


# SQLAlchemy engine used below to create the chat_language table and to write the results.
engine = create_engine(db_url)
print("created engine")

created engine


In [None]:
# Create the chat_language table. checkfirst=True makes the cell re-runnable:
# the table is only created if it does not exist yet.
ChatLanguage.__table__.create(engine, checkfirst=True)

# engine.begin() commits the transaction when the block exits without error.
# With a plain engine.connect() the GRANTs are rolled back under SQLAlchemy 2.0,
# which no longer autocommits.
with engine.begin() as conn:
    conn.execute(text("GRANT SELECT ON chat_language TO data_analyst"))
    conn.execute(
        text(
            "GRANT SELECT, TRIGGER, REFERENCES, TRUNCATE, DELETE, UPDATE, SELECT, INSERT ON chat_language TO nagios"
        )
    )
    conn.execute(
        text(
            "GRANT SELECT, TRIGGER, REFERENCES, TRUNCATE, DELETE, UPDATE, SELECT, INSERT ON chat_language TO crawler"
        )
    )

In [6]:
# Append all rows of chat_lang to the chat_language table; the dataframe index is not written.
# NOTE(review): because of if_exists='append', re-running this cell inserts the same rows
# again unless the table has a constraint that rejects them — confirm before re-running.
chat_lang.to_sql('chat_language', engine, if_exists='append', index=False)

250