In [12]:
import pandas as pd
from textblob import TextBlob

# Language Detection

In [13]:
# !pip install langdetect
# !pip install langid

In [25]:
ptbxl_103 = pd.read_csv('../data_ptb-xl/train.csv')
ptbxl_101 = pd.read_csv('../data_ptb-xl_1.0.1/en_df_round4.csv')

In [26]:
len(ptbxl_103) , len(ptbxl_101)

(13095, 16272)

In [29]:
ptbxl_101.head()

Unnamed: 0,ecg_id,report
0,1,sinusrhythm peripheral low voltage
1,2,sinus bradycardia otherwise normal ekg
2,3,sinus rhythm normal ekg
3,4,sinus rhythm normal ekg
4,5,sinus rhythm normal ekg


## TextBlob
https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430

In [15]:
lang = TextBlob(ptbxl_103['preprocessed_report'][0])
language = lang.detect_language()

AttributeError: 'TextBlob' object has no attribute 'detect_language'

## langdetect

In [38]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language_langdetect(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [35]:
ptbxl_103['langdetect'] = ptbxl_103['preprocessed_report'].apply(detect_language_langdetect)
ptbxl_101['langdetect'] = ptbxl_101['report'].astype(str).apply(detect_language_langdetect)

In [39]:
ptbxl_101_counts = ptbxl_101['langdetect'].value_counts().reset_index()
ptbxl_103_counts = ptbxl_103['langdetect'].value_counts().reset_index()

ptbxl_101_counts.columns = ['langdetect', 'ptbxl_101_count']
ptbxl_103_counts.columns = ['langdetect', 'ptbxl_103_count']

merged_counts = pd.merge(ptbxl_101_counts, ptbxl_103_counts, on='langdetect', how='outer')

merged_counts.fillna(0, inplace=True)

merged_counts['ptbxl_101_count'] = merged_counts['ptbxl_101_count'].astype(int)
merged_counts['ptbxl_103_count'] = merged_counts['ptbxl_103_count'].astype(int)

print(merged_counts)

   langdetect  ptbxl_101_count  ptbxl_103_count
0          en            14004            11665
1          cy             1415              970
2          et              188                7
3          sq              155                2
4          no              146               18
5          sv               87               33
6          de               85               15
7          da               77               19
8          ro               34               55
9          ca               23               20
10         lt               18                4
11         fi               10               15
12         it               10               23
13         af                9                0
14         fr                6               17
15         tl                2                0
16         hr                2                0
17         nl                1               18
18         es                0              209
19         cs                0          

## langid

In [42]:
import langid

def detect_language_langid(text):
    try:
        language, _ = langid.classify(text)
        return language
    except LangDetectException:
        return "unknown"

In [43]:
ptbxl_103['langid'] = ptbxl_103['preprocessed_report'].apply(detect_language_langid)
ptbxl_101['langid'] = ptbxl_101['report'].astype(str).apply(detect_language_langid)

In [45]:
ptbxl_101_counts = ptbxl_101['langid'].value_counts().reset_index()
ptbxl_103_counts = ptbxl_103['langid'].value_counts().reset_index()

ptbxl_101_counts.columns = ['langid', 'ptbxl_101_count']
ptbxl_103_counts.columns = ['langid', 'ptbxl_103_count']

merged_counts = pd.merge(ptbxl_101_counts, ptbxl_103_counts, on='langid', how='outer')

merged_counts.fillna(0, inplace=True)

merged_counts['ptbxl_101_count'] = merged_counts['ptbxl_101_count'].astype(int)
merged_counts['ptbxl_103_count'] = merged_counts['ptbxl_103_count'].astype(int)

print(merged_counts)

   langid  ptbxl_101_count  ptbxl_103_count
0      en            12381            10337
1      lt             2670             1581
2      es              342               64
3      de              280               37
4      fr              235              515
5      sv               76               13
6      it               62               34
7      ca               51               18
8      no               44               94
9      cy               35               22
10     pt               14                0
11     sk               11               11
12     af               10                5
13     nn               10                2
14     da                9              158
15     la                8               22
16     et                7               42
17     sl                7               28
18     hr                4                0
19     nl                2               21
20     oc                2               19
21     mt                2      

## Compare Language Detection

In [46]:
ptbxl_103['comparison'] = ptbxl_103.apply(lambda row: 'same' if row['langid'] == row['langdetect'] else f"{row['langid']}-{row['langdetect']}", axis=1)
ptbxl_103['comparison'].value_counts()

comparison
same     10081
lt-cy      909
lt-en      671
fr-en      431
en-es      209
         ...  
oc-ro        1
ro-it        1
fr-sk        1
es-it        1
la-sv        1
Name: count, Length: 107, dtype: int64

In [47]:
ptbxl_101['comparison'] = ptbxl_101.apply(lambda row: 'same' if row['langid'] == row['langdetect'] else f"{row['langid']}-{row['langdetect']}", axis=1)
ptbxl_101['comparison'].value_counts()

comparison
same     12277
lt-cy     1274
lt-en     1084
es-en      236
fr-en      208
         ...  
de-fi        1
es-fr        1
hr-et        1
fo-sv        1
cy-fi        1
Name: count, Length: 95, dtype: int64

todo:
https://aws.amazon.com/blogs/industries/how-to-process-medical-text-in-multiple-languages-using-amazon-translate-and-amazon-comprehend-medical/
https://arxiv.org/pdf/2311.16588.pdf