In [3]:
import pandas as pd
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Language Detection

In [13]:
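# One-time setup, intentionally left commented out. Uncomment if the packages are missing;
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install langdetect
# %pip install langid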

In [4]:
# Load the two report tables that the rest of the notebook compares.
# Paths are relative to the notebook's working directory.
ptbxl_103 = pd.read_csv(filepath_or_buffer='../data_ptb-xl/train.csv')
ptbxl_101 = pd.read_csv(filepath_or_buffer='../data_ptb-xl_1.0.1/en_df_round4.csv')

In [5]:
# Row counts, displayed as (ptbxl_103, ptbxl_101).
(len(ptbxl_103), len(ptbxl_101))

(13095, 16272)

In [6]:
# Preview the first five rows of the ptbxl_101 table.
ptbxl_101.head(n=5)

Unnamed: 0,ecg_id,report
0,1,sinusrhythm peripheral low voltage
1,2,sinus bradycardia otherwise normal ekg
2,3,sinus rhythm normal ekg
3,4,sinus rhythm normal ekg
4,5,sinus rhythm normal ekg


## langdetect

In [7]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language_langdetect(text):
    """Return the language code langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        Report text to classify.

    Returns
    -------
    str
        The code returned by ``langdetect.detect``, or ``"unknown"`` when
        ``text`` is not a string, is blank, or langdetect raises
        ``LangDetectException``.
    """
    # Non-string cells (e.g. NaN for a missing report) and blank strings have
    # nothing to classify; answer with the sentinel instead of letting a
    # non-LangDetectException error escape from inside detect().
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # langdetect samples randomly unless a seed is set, so short texts can get
    # a different label on every run. Pin a default seed for reproducibility,
    # but leave any seed the caller has already configured untouched.
    from langdetect import DetectorFactory
    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [8]:
# Tag every report with the language code chosen by langdetect.
ptbxl_103['langdetect'] = (
    ptbxl_103['preprocessed_report']
    .apply(detect_language_langdetect)
)
# The ptbxl_101 reports are cast to str before detection.
ptbxl_101['langdetect'] = (
    ptbxl_101['report']
    .astype(str)
    .apply(detect_language_langdetect)
)

In [22]:
# Collapse the detected codes into a binary label: 'en' stays, anything else becomes 'non-en'.
ptbxl_101['langdetect_modified'] = ptbxl_101['langdetect'].where(ptbxl_101['langdetect'] == 'en', 'non-en')
ptbxl_103['langdetect_modified'] = ptbxl_103['langdetect'].where(ptbxl_103['langdetect'] == 'en', 'non-en')

# Per-dataset label counts, with columns renamed so the two tables can be joined on the label.
ptbxl_101_counts = (
    ptbxl_101['langdetect_modified']
    .value_counts()
    .reset_index()
    .set_axis(['langdetect', 'ptbxl_101_count'], axis=1)
)
ptbxl_103_counts = (
    ptbxl_103['langdetect_modified']
    .value_counts()
    .reset_index()
    .set_axis(['langdetect', 'ptbxl_103_count'], axis=1)
)

# Outer join keeps a label that appears in only one dataset; its missing count becomes 0.
merged_counts = (
    ptbxl_101_counts
    .merge(ptbxl_103_counts, on='langdetect', how='outer')
    .fillna(0)
    .astype({'ptbxl_101_count': int, 'ptbxl_103_count': int})
)

print(merged_counts)

  langdetect  ptbxl_101_count  ptbxl_103_count
0         en            14003            11654
1     non-en             2269             1441


## langid

In [10]:
import langid

def detect_language_langid(text):
    """Return the language code langid assigns to ``text``.

    Parameters
    ----------
    text : str or bytes
        Report text to classify.

    Returns
    -------
    str
        The language element of the pair returned by ``langid.classify``,
        or ``"unknown"`` when ``text`` is not text or is blank.
    """
    # Non-text cells (e.g. NaN for a missing report) and blank strings carry
    # no signal. Report them with the same "unknown" sentinel that
    # detect_language_langdetect uses, so both detector columns are comparable.
    if not isinstance(text, (str, bytes)) or not text.strip():
        return "unknown"

    # The previous `except LangDetectException` was dropped: that class is
    # imported from langdetect, so the handler could not act as a fallback for
    # the langid call and made this cell depend on the langdetect cell.
    language, _ = langid.classify(text)  # second element of the pair is unused
    return language

In [11]:
# Tag every report with the language code chosen by langid.
ptbxl_103['langid'] = (
    ptbxl_103['preprocessed_report']
    .apply(detect_language_langid)
)
# The ptbxl_101 reports are cast to str before detection.
ptbxl_101['langid'] = (
    ptbxl_101['report']
    .astype(str)
    .apply(detect_language_langid)
)

In [23]:
# Collapse the detected codes into a binary label: 'en' stays, anything else becomes 'non-en'.
ptbxl_101['langid_modified'] = ptbxl_101['langid'].where(ptbxl_101['langid'] == 'en', 'non-en')
ptbxl_103['langid_modified'] = ptbxl_103['langid'].where(ptbxl_103['langid'] == 'en', 'non-en')

# Per-dataset label counts, with columns renamed so the two tables can be joined on the label.
ptbxl_101_counts = (
    ptbxl_101['langid_modified']
    .value_counts()
    .reset_index()
    .set_axis(['langid', 'ptbxl_101_count'], axis=1)
)
ptbxl_103_counts = (
    ptbxl_103['langid_modified']
    .value_counts()
    .reset_index()
    .set_axis(['langid', 'ptbxl_103_count'], axis=1)
)

# Outer join keeps a label that appears in only one dataset; its missing count becomes 0.
merged_counts = (
    ptbxl_101_counts
    .merge(ptbxl_103_counts, on='langid', how='outer')
    .fillna(0)
    .astype({'ptbxl_101_count': int, 'ptbxl_103_count': int})
)

print(merged_counts)

   langid  ptbxl_101_count  ptbxl_103_count
0      en            12381            10337
1  non-en             3891             2758


## Compare Language Detection

In [24]:
# Label each row 'same' when both detectors agree; otherwise record the
# disagreeing pair as '<langid>-<langdetect>'.
# Built from whole-column operations instead of a row-wise apply(axis=1):
# astype(str) reproduces the f-string formatting of each value, and where()
# keeps the pair only on rows where the two codes differ.
ptbxl_103['comparison'] = (
    ptbxl_103['langid'].astype(str) + '-' + ptbxl_103['langdetect'].astype(str)
).where(ptbxl_103['langid'] != ptbxl_103['langdetect'], 'same')
ptbxl_103['comparison'].value_counts()

comparison
same     10074
lt-cy      920
lt-en      660
fr-en      433
en-es      211
         ...  
ro-it        1
da-ca        1
af-en        1
fr-sv        1
la-sv        1
Name: count, Length: 101, dtype: int64

In [25]:
# Label each row 'same' when both detectors agree; otherwise record the
# disagreeing pair as '<langid>-<langdetect>'.
# Built from whole-column operations instead of a row-wise apply(axis=1):
# astype(str) reproduces the f-string formatting of each value, and where()
# keeps the pair only on rows where the two codes differ.
ptbxl_101['comparison'] = (
    ptbxl_101['langid'].astype(str) + '-' + ptbxl_101['langdetect'].astype(str)
).where(ptbxl_101['langid'] != ptbxl_101['langdetect'], 'same')
ptbxl_101['comparison'].value_counts()

comparison
same     12258
lt-cy     1281
lt-en     1091
es-en      232
fr-en      208
         ...  
de-fi        1
fo-sv        1
nn-ca        1
cs-en        1
sv-it        1
Name: count, Length: 95, dtype: int64

## Inspect reports by detected language (English vs. non-English)

In [35]:
# ptbxl_101 reports that langdetect labelled as English, most frequent first.
df = ptbxl_101[ptbxl_101['langdetect'].eq('en')]
df['report'].value_counts()

report
sinusrhythm position type normal normal ecg                                                                                                                                                               1195
sinusrhythm location type normal normal ekg 4.46 unconfirmed report                                                                                                                                       1036
sinus rhythm. normal ecg.                                                                                                                                                                                  628
sinus rhythm normal ekg                                                                                                                                                                                    253
sinusrhythm left type otherwise normal ekg 4.46 unconfirmed report                                                                                                   

In [36]:
# ptbxl_103 reports that langdetect labelled as anything other than English, most frequent first.
df = ptbxl_103[ptbxl_103['langdetect'].ne('en')]
df['preprocessed_report'].value_counts()

preprocessed_report
sinus rhythm normal ecg                                                                                                                                                                     910
trace requested                                                                                                                                                                             207
sinus rhythm definite pathology                                                                                                                                                              52
pacemaker ecg                                                                                                                                                                                19
sinusrhythmus ueberdrehter linkstyp                                                                                                                                                          14
                    

In [37]:
# ptbxl_101 reports that langid labelled as anything other than English, most frequent first.
df = ptbxl_101[ptbxl_101['langid'].ne('en')]
df['report'].value_counts()

report
sinus rhythm. normal ecg.                                                                                                                           1732
sinus rhythm normal ekg                                                                                                                              646
sinus rhythm. no definite pathology.                                                                                                                 217
sinus rhythm normal ecg                                                                                                                               59
sinus arrhythmia. normal ecg.                                                                                                                         52
                                                                                                                                                    ... 
sinusrytm inget secures pathologisct                                       

In [39]:
# ptbxl_103 reports that langid labelled as English, most frequent first.
df = ptbxl_103[ptbxl_103['langid'].eq('en')]
df['preprocessed_report'].value_counts()

preprocessed_report
sinus rhythm position type normal normal ecg                                                                                                                                                           773
sinus rhythm position type normal normal ecg 4 46 unconfirmed report                                                                                                                                   637
trace requested                                                                                                                                                                                        207
sinus rhythm left type otherwise normal ecg 4 46 unconfirmed report                                                                                                                                    149
sinus rhythm left type otherwise normal ecg                                                                                                                             

**TODO** — references to review for follow-up work:
- https://aws.amazon.com/blogs/industries/how-to-process-medical-text-in-multiple-languages-using-amazon-translate-and-amazon-comprehend-medical/
- https://arxiv.org/pdf/2311.16588.pdf