In [None]:
########################################################################################################################################
# example texts for machine translation into English:
# https://docs.google.com/spreadsheets/d/linktoGsheet
# special placeholder language codes (used when the language is unknown):
# "ZZ": unknown language, but *very* likely *not* English
# "??": unknown language, but likely to be English

#############################################
# Resources:
# Machine translation (with Hugging Face pipelines)
# https://huggingface.co/docs/transformers/tasks/translation
#
# Language identification (with fastText)
# https://huggingface.co/facebook/fasttext-language-identification

# Load packages

In [None]:
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install pandas --quiet
!pip install langcodes  --quiet
!pip install langdetect --quiet
!pip install nltk --quiet
!pip install fasttext-langdetect --quiet

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
import time
from nltk.tokenize import sent_tokenize

#####################################
# connecting with Google Drive
# The google.colab imports below assume this notebook runs in a Colab runtime.
import gspread
from google.colab import auth
# Authenticate the current user before requesting default credentials.
auth.authenticate_user()
from google.auth import default

# default() yields a pair; only the credentials (first item) are used here.
creds, _ = default()
# gspread client built from those credentials; it is used later to open the
# Google Sheet that holds the example texts.
gcolab = gspread.authorize(creds)

from google.colab import drive
# Mount Google Drive under /content/drive.
# NOTE(review): nothing in the visible notebook reads from the mounted drive -
# confirm whether this mount is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [None]:
#####################################
# load and clean data
# The first worksheet is read as a list of rows: row 0 holds the column names,
# the remaining rows hold the documents. Rows are indexed by "doc_id".
gsheets = gcolab.open_by_url('https://docs.google.com/spreadsheets/d/linktoGsheet')
sheets = gsheets.sheet1.get_all_values()
header, records = sheets[0], sheets[1:]
keep_columns = ["doc_type", "language_code", "title", "abstract", "url", "doc_date"]
df = pd.DataFrame(records, columns=header).set_index("doc_id")
df = df[keep_columns]

#####################################
# concatenate title and abstract
# "title | abstract" when the abstract is non-blank, otherwise the title alone
df["text"] = [
  title + " | " + abstract if abstract.strip() != "" else title
  for title, abstract in zip(df["title"], df["abstract"])
]

# Implement language detection

In [None]:
##########################################################
# load models
import langdetect
import ftlangdetect
from langcodes import Language, standardize_tag

# Function to detect language using FastText or Langdetect
def detect_language(text, model_name = 'fasttext'):
  """Detect the language of `text` and return it as an upper-case code.

  Parameters
  ----------
  text : str
      Text whose language should be detected.
  model_name : str
      Detection backend: 'fasttext' (ftlangdetect) or 'langdetect'.

  Returns
  -------
  str
      The upper-cased primary subtag of the standardized language tag
      (e.g. "DE"), or "??" when detection fails or, for 'langdetect', when the
      text is shorter than two characters.

  Raises
  ------
  ValueError
      If `model_name` is neither 'fasttext' nor 'langdetect'.
  """
  # Validate before entering the try block: raised inside it, this error would
  # be swallowed by the detection-failure handler and reported as "??".
  if model_name not in ("fasttext", "langdetect"):
    raise ValueError("Invalid model. Use 'fasttext' or 'langdetect'.")

  try:
    if model_name == "fasttext":
      # fastText predicts one line at a time and rejects input that contains
      # newlines, so multi-line texts are flattened first.
      predictions = ftlangdetect.detect(text.replace("\n", " "))
      detected_language = predictions["lang"]
    elif len(text) < 2:  # threshold for texts too short to classify
      return "??"
    else:
      detected_language = langdetect.detect(text)
    # Normalize the detected code to a standard tag (macrolanguage form)
    detected_language = standardize_tag(detected_language, macro=True)
  except Exception:
    # Any detection failure maps to the "unknown" code. Catching Exception
    # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
    return "??"

  # Keep only the primary language subtag, upper-cased ("zh-Hans" -> "ZH")
  return detected_language.upper().split("-")[0]

In [None]:
############################################
# test some minimal examples
# Each backend is run on the same three greetings (English, German, French);
# one detected code is printed per greeting.
greetings = ("Hello world", "Hallo Welt", "Bonjour le monde")
for model in ("fasttext", "langdetect"):
  for greeting in greetings:
    print(detect_language(greeting, model_name = model))

EN
DE
FR
NL
DE
FR


# Benchmark language detection, fasttext vs. langdetect

In [None]:
N = df.shape[0]
print("Performance for language detection, in examples per second")
print("# of texts", N)
print()


def _benchmark_language_detection(model_name, label, blank_line_after = False):
    """Run one detection backend over df["title"] and df["text"] and print its throughput.

    Results are stored in df["language_code_<model_name>_title"] (title only)
    and df["language_code_<model_name>"] (title + abstract). Throughput is
    printed as examples per second, rounded to one decimal.

    Parameters
    ----------
    model_name : str
        Backend name passed to detect_language ('fasttext' or 'langdetect').
    label : str
        Human-readable backend name used in the printed report.
    blank_line_after : bool
        Print an empty line after a successful report.
    """
    column_prefix = "language_code_" + model_name
    runs = (
        ("title", column_prefix + "_title", label + " (title only):"),
        ("text", column_prefix, label + ":"),
    )
    rates = []
    try:
        for source_column, target_column, caption in runs:
            # perf_counter is a monotonic high-resolution clock intended for
            # measuring short intervals
            tic = time.perf_counter()
            df[target_column] = df[source_column].apply(lambda x: detect_language(x, model_name))
            elapsed = time.perf_counter() - tic
            rates.append((caption, N / elapsed if elapsed > 0 else float("inf")))
    except KeyError as missing_column:
        # Report the problem instead of silently skipping the benchmark, so a
        # missing input column does not only surface in a later cell.
        print("Skipping", label, "benchmark - missing column:", missing_column)
        return

    for caption, rate in rates:
        print(caption, round(rate, 1))
    if blank_line_after:
        print()


# Measure runtime for language detection using FastText
_benchmark_language_detection("fasttext", "FastText", blank_line_after = True)

# Measure runtime for language detection using Langdetect
_benchmark_language_detection("langdetect", "Langdetect")

Performance for language detection, in examples per second
# of texts 1724

FastText (title only): 30780.3
FastText: 9366.9

Langdetect (title only): 106.5
Langdetect: 108.2


In [None]:
################################################
# extract test dataset and inspect results
# Rows whose reference label is one of the placeholder codes ("??", "ZZ") are
# excluded; the remaining rows keep the text, the reference label and the four
# detected-language columns.
# NOTE(review): in the sample displayed below, several Chinese titles carry
# reference codes such as RO / LT / SW - the reference labels may themselves be
# unreliable; confirm before trusting the accuracy figures.
placeholder_codes = ["??", "ZZ"]
comparison_columns = [
  "text",
  "language_code",
  "language_code_langdetect_title",
  "language_code_langdetect",
  "language_code_fasttext_title",
  "language_code_fasttext",
]
has_reference_label = ~df["language_code"].isin(placeholder_codes)
Y = df.loc[has_reference_label, comparison_columns]
Y.sample(n=10, random_state=1234)

Unnamed: 0_level_0,text,language_code,language_code_langdetect_title,language_code_langdetect,language_code_fasttext_title,language_code_fasttext
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1240830421,一株乳酸乳球菌(Lactococcus lactis)SPL018及获取方法与应用,CA,CA,CA,JA,JA
1170872771,LIXIVIACIÓN EN PILAS,TL,CA,FIL,ES,ES
1233896914,Cubitera,IT,ES,ES,IT,IT
1180122718,CONJUNTO DE ALMACENAMIENTO Y GESTION DE ENERGI...,SO,CY,CY,PT,PT
1170728124,一种堆垛机立柱和Miniload高速堆垛机,RO,RO,RO,ZH,ZH
1255186685,一种含flupyroxystrobin的杀虫组合物,LT,LT,LT,ZH,ZH
1246518930,一种基于z-wave的KTV灯光控制方法,SW,SW,SW,ZH,ZH
1174766354,一种基于Gitlab CI落地DevOps的方法和系统,ET,ET,ET,ZH,ZH
1243317414,Balai d'essuie-glace notamment pour un véhicul...,FR,FR,FR,FR,FR
1223904860,METODO Y SISTEMA PARA LA DETECCION Y LUCHA DIS...,CY,CY,CY,ES,ES


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each detector against the reference column "language_code",
# once on title + abstract and once on the title alone, rounded to 3 decimals.
print("Accuracy language detection")
for detector in ("fasttext", "langdetect"):
  print()
  report_rows = (
    (detector + ":", "language_code_" + detector),
    (detector + " (title only):", "language_code_" + detector + "_title"),
  )
  for caption, predicted_column in report_rows:
    print(caption, round(accuracy_score(Y["language_code"], Y[predicted_column]), 3))

Accuracy language detection

fasttext: 0.385
fasttext (title only): 0.327

langdetect: 0.863
langdetect (title only): 0.75


# Implement language translation to English

In [None]:
from transformers import pipeline

model_name = "facebook/m2m100_418M"
#model_name = "alirezamsh/small100" # doesn't work
######################################################
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs (more slowly) on a runtime without CUDA instead of failing
# while the pipeline is being created.
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
# Translation pipeline with English as the fixed target language; the source
# language is supplied per call through `src_lang`.
translator = pipeline(task='translation', model=model_name, device = device, tgt_lang="en", max_length = 400)

In [None]:
# Create a function that translates a text given the language code
# if the language code is missing or not accepted by the translator, the
# language is detected from the text instead
def _translator_accepts(language_code):
  """Return True if the `translator` pipeline accepts `language_code` as source language.

  The check translates an empty string with the given `src_lang`; any ordinary
  exception is taken to mean the code is not usable. KeyboardInterrupt and
  SystemExit are deliberately not caught, so a long run can still be aborted.
  """
  try:
    translator("", src_lang=language_code)
  except Exception:
    return False
  return True


def translate_text(text, language_code = None, langdetect_model = "langdetect"):
  """Translate `text` into English.

  Parameters
  ----------
  text : str or None
      Text to translate. None is treated like an empty string.
  language_code : str or None
      Source language code (case-insensitive). None is treated as "??".
  langdetect_model : str
      Backend handed to detect_language when `language_code` is not accepted
      by the translator.

  Returns
  -------
  dict
      {"translation": ..., "language_code": ...} where "language_code" is the
      upper-cased code that was finally used and "translation" is
      - "" for blank text,
      - None when no usable source language could be determined ("??"),
      - the unchanged text when the language is English,
      - otherwise the sentence-wise translation joined with single spaces.
  """
  if text is None:
    text = ""
  if language_code is None:
    language_code = "??"

  language_code = language_code.lower()

  if language_code != "en" and not _translator_accepts(language_code):
    print("Unknown language code", language_code, "- will use language detection")
    language_code = detect_language(text, langdetect_model).lower()
    print("Detected language_code:", language_code)

    if not _translator_accepts(language_code):
      print("No translator for language code", language_code, "available")
      language_code = "??"

  if text.strip() == "":
    translated_text = ""
  # don't return translation if language is unknown
  elif language_code == "??":
    translated_text = None
  # return original text if already in English
  elif language_code == "en":
    translated_text = text
  else:
    # split text into sentences and translate them as one batch
    sentences = sent_tokenize(text)
    outputs = translator(sentences, src_lang=language_code)
    translated_text = " ".join(x["translation_text"] for x in outputs)

  return {"translation": translated_text, "language_code": language_code.upper()}

In [None]:
#############################
# test translator function
# on minimal examples
# (text, language code) pairs; None means no code is supplied, which is the
# function's default.
examples = (
  ("Hallo Welt", None),
  ("Bonjour le monde", "FR"),
  ("Hello world", "EN"),
)
for example_text, example_code in examples:
  print(translate_text(example_text, language_code = example_code))

Unknown language code ?? - will use language detection
Detected language_code: de
{'translation': 'Hello world', 'language_code': 'DE'}
{'translation': 'Hello to the world', 'language_code': 'FR'}
{'translation': 'Hello world', 'language_code': 'EN'}


# Benchmark runtime for machine translation

In [None]:
#############################################
# prep smaller sample to reduce total runtime
# Keep only documents that langdetect did not classify as English, restricted
# to the columns needed for translation, then draw a fixed-seed sample of 100.
translation_columns = ["doc_type", "title", "abstract", "language_code", "language_code_langdetect"]
not_english = df["language_code_langdetect"] != "EN"
X = df.loc[not_english, translation_columns].sample(n = 100, random_state = 3141)

In [None]:
# Set to True to translate titles only (abstracts are then left empty).
title_only = False

#############################
# run translation
title_en = []
abstract_en = []
language_code_final = []

from tqdm import tqdm
import timeit
tic = timeit.default_timer()
# X.iterrows() is a generator without a length, so the number of rows is passed
# explicitly; this lets tqdm show percentage and remaining time.
for index, row in tqdm(X.iterrows(), total = X.shape[0]):
  language_code = row["language_code"]
  # placeholder codes carry no usable language: fall back to the code that
  # langdetect produced for title + abstract
  if language_code in ["ZZ", "??"]:
    language_code = row["language_code_langdetect"]

  language_code_final.append(language_code)
  title_en.append(translate_text(row["title"], language_code)["translation"])
  if not title_only:
    abstract_en.append(translate_text(row["abstract"], language_code)["translation"])
  else:
    abstract_en.append("")

X["language_code_final"] = language_code_final
X["title_en"] = title_en
X["abstract_en"] = abstract_en

toc = timeit.default_timer()
runtime = round(toc - tic, 3)
print()
print('Runtime translation: ', runtime, ' secs.')
print('Performance: ', round(X.shape[0] / runtime, 2), ' ex/secs.')

100it [05:18,  3.18s/it]


Runtime translation:  318.139  secs.
Performance:  0.31  ex/secs.





# Inspect translation results

In [None]:
# First translated SCIENCE documents: language code used, original title, English title
is_science = X["doc_type"] == "SCIENCE"
X1 = X.loc[is_science]
X1.loc[:, ["language_code_final", "title", "title_en"]].head()

Unnamed: 0_level_0,language_code_final,title,title_en
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
943310346,IT,Approccio terapeutico nel feocromocitoma/pagan...,Therapeutic approach in metastatic feochromocy...
369308494,AF,Book Reviews,Book Reviews
369425039,ES,Placentitis bacteriana como causa de aborto en...,Bacterial placentitis as the cause of abortion...
529926697,PT,Explorando limites epistemológicos e políticos...,Exploring the epistemological and political bo...


In [None]:
# First translated PATENT documents: language code, original title, English title
# NOTE(review): this cell shows "language_code" while the SCIENCE and NEWS cells
# show "language_code_final" - confirm whether the difference is intended.
is_patent = X["doc_type"] == "PATENT"
X1 = X.loc[is_patent]
X1.loc[:, ["language_code", "title", "title_en"]].head()

Unnamed: 0_level_0,language_code,title,title_en
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1250520938,SW,40KW储能机组,40 kW of energy storage
1170854109,IT,Manguito de agarre para cateter urinario,This is the most important thing that you can ...
1245371956,KO,一种多孔钙钛矿型催化剂LaCo0.7Fe0.3O3及其制备方法和应用,A variety of calcium and mineral catalysts LaC...
1214896198,DA,"EMBALLAGE, DER ER KONFIGURERET TIL AT INDEHOLD...",Packaging that is configured to contain produc...
1244520714,ET,基于Logistic函数填充相似性矩阵的miRNA-疾病关联预测方法,基于Logistic function填充相似性矩阵的miRNA- disease-rela...


In [None]:
# First translated NEWS documents: language code used, original title, English title
is_news = X["doc_type"] == "NEWS"
X1 = X.loc[is_news]
X1.loc[:, ["language_code_final", "title", "title_en"]].head()

Unnamed: 0_level_0,language_code_final,title,title_en
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1242277828,AR,مؤسسة نهر الأردن تجدد اتفاقية تعاونها مع مجموع...,Jordan River Foundation renews cooperation agr...
1159770271,PT,Por que Reino Unido quer banir TikTok de celul...,Why the UK wants to ban TikTok from ministers’...
1256311518,DE,Planer/in (weiblich/männlich/divers) zur Beset...,Planner/in (women/man/divers) for the occupati...
1156023986,IT,"Strage di Crotone, la direttiva mai abrogata d...","It’s a good thing to do, but it’s a good thing..."
1036440645,RU,Банк России готов разрешить майнерам продавать...,The Bank of Russia is ready to allow miners to...
