In [None]:
!pip install torch==1.9.0 fastapi uvicorn langid fasttext langdetect requests tqdm

In [25]:
from pydantic import BaseModel, Field
from typing import Union, List, Dict

# Request options for one language-detection call.
# NOTE: documented with comments rather than a class docstring so the schema
# pydantic generates for this model is left untouched.
class LanguageSearchOption(BaseModel):
    # A single text or a list of texts; declared without a default value.
    texts: Union[str, List[str]] = Field(example="text")
    # Backend name handed to LanguageDetector(model_name=...) in the demo cell.
    model_name: str = Field(default="fasttext")
    # Passed as `max_value` to LanguageDetector.language_detection in the demo
    # cell, i.e. how many ranked languages are requested per text.
    max_value: int = Field(default=3)

# Response envelope: a result body plus a status code and a message.
# NOTE(review): not used by any visible cell — presumably the response schema
# of an API endpoint defined elsewhere; confirm against the caller.
class LanguageResponseModel(BaseModel):
    # One dict per entry, mapping a string key to a list of {str: float} dicts.
    # This matches what LanguageDetector.language_detection returns for a
    # *list* of texts ({text: [{language: score}, ...]}); the single-string
    # result is a flat list of {language: score} and would not fit — TODO confirm.
    body: List[Dict[str, List[Dict[str, float]]]]
    status: int = Field(example=200, description="Status of the request")
    message: str = Field(
        example="message", description="Attached message for the request"
    )

In [26]:
import os 
import requests
import tqdm

def http_get(url, path):
    """Stream ``url`` to the local file ``path``, showing a tqdm progress bar.

    The body is first written to ``path + "_part"`` and renamed to ``path``
    only after the whole download succeeded, so ``path`` never holds a
    truncated file.

    Args:
        url: Address to download.
        path: Destination file; missing parent directories are created.

    Returns:
        None. When the response status is not 200 a message is printed to
        stderr and nothing is written.

    Raises:
        requests.HTTPError: from ``raise_for_status()`` for error statuses.
        Any exception raised while streaming or writing; the partial
        ``_part`` file is removed before the exception propagates.
    """
    # Local import: the error branch below writes to sys.stderr, but `sys`
    # was never imported, so a failed download used to die with NameError.
    import sys

    target_dir = os.path.dirname(path)
    if target_dir != '':
        os.makedirs(target_dir, exist_ok=True)

    req = requests.get(url, stream=True)
    try:
        if req.status_code != 200:
            print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
            req.raise_for_status()
            # Reached for non-200 statuses that raise_for_status() accepts.
            return

        download_filepath = path + "_part"
        content_length = req.headers.get('Content-Length')
        # Without a Content-Length header tqdm runs with an unknown total.
        total = int(content_length) if content_length is not None else None
        progress = tqdm.tqdm(unit="B", total=total, unit_scale=True)
        try:
            with open(download_filepath, "wb") as file_binary:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:  # skip empty chunks
                        progress.update(len(chunk))
                        file_binary.write(chunk)
        except BaseException:
            # Do not leave a truncated download behind (also on Ctrl-C).
            if os.path.exists(download_filepath):
                os.remove(download_filepath)
            raise
        finally:
            progress.close()

        os.rename(download_filepath, path)
    finally:
        # With stream=True the connection is held until the response is closed.
        req.close()

In [27]:
import os
from typing import Dict, List, Tuple, Union

import torch

class LanguageDetector:
    """Rank the most likely languages of a text with a selectable backend.

    Three backends are supported: ``fasttext``, ``langdetect`` and ``langid``.
    Each backend package is imported lazily inside its own method, so only
    the selected one has to be installed. Every backend returns the same
    shape: a list of single-entry dicts ``{language_code: score}``.
    """

    def __init__(self, model_name: str = "FastText", cache_folder: str = None):
        """Select the backend and resolve the model cache directory.

        Args:
            model_name: ``"fasttext"``, ``"langdetect"`` or ``"langid"``,
                matched case-insensitively.
            cache_folder: Directory containing ``lid.176.ftz`` (only read by
                the fastText backend). Defaults to the ``LD_CACHE``
                environment variable when set, else ``<torch home>/ld_v2``.

        Raises:
            ValueError: if ``model_name`` is not a known backend.
        """
        # Both models are created lazily on the first prediction.
        self._fasttext_lang_id = None
        self._langid_identifier = None

        backends = {
            "fasttext": self.language_detection_fasttext,
            "langdetect": self.language_detection_langdetect,
            "langid": self.language_detection_langid,
        }
        # Case is normalised for every backend; previously only "fasttext"
        # was matched case-insensitively.
        backend = backends.get(model_name.lower())
        if backend is None:
            raise ValueError("unknown option")
        self._lang_detectors = backend

        if cache_folder is None:
            if 'LD_CACHE' in os.environ:
                cache_folder = os.environ['LD_CACHE']
            else:
                cache_folder = os.path.join(torch.hub._get_torch_home(), 'ld_v2')
        self._cache_folder = cache_folder

    @staticmethod
    def _single_line(text: str) -> str:
        """Lower-case ``text``, turn line breaks into spaces and strip the ends."""
        return text.lower().replace("\r\n", " ").replace("\n", " ").strip()

    def language_detection_fasttext(self, text: str, number: int = 3) -> List[Dict[str, float]]:
        """Rank languages with the fastText ``lid.176.ftz`` model.

        Args:
            text: Text to classify.
            number: Forwarded to fastText as ``k``.

        Returns:
            One ``{language_code: score}`` dict per prediction fastText
            returned, in the order fastText returned them. The code is the
            part of the label after the last ``__``.
        """
        if self._fasttext_lang_id is None:
            import fasttext
            # Replace fastText's eprint hook with a no-op (presumably to
            # silence the message it emits from load_model).
            fasttext.FastText.eprint = lambda x: None
            model_path = os.path.join(self._cache_folder, 'lid.176.ftz')
            self._fasttext_lang_id = fasttext.load_model(model_path)

        labels, scores = self._fasttext_lang_id.predict(self._single_line(text), k=number)
        # Pair labels with scores instead of indexing range(number): the model
        # may return fewer than `number` predictions, which used to raise
        # IndexError.
        return [{label.split('__')[-1]: float(score)} for label, score in zip(labels, scores)]

    def language_detection_langdetect(self, text: str, number: int = 3) -> List[Dict[str, float]]:
        """Rank languages with ``langdetect``.

        Args:
            text: Text to classify (lower-cased before detection).
            number: Maximum number of candidates to return.

        Returns:
            Up to ``number`` ``{language_code: probability}`` dicts in the
            order ``detect_langs`` returned them.
        """
        import langdetect
        candidates = list(langdetect.detect_langs(text.lower()))
        # Read the numeric attributes rather than parsing str(candidate): the
        # old "lang:prob" split returned the probability as a *string*, unlike
        # the other backends.
        return [{candidate.lang: candidate.prob} for candidate in candidates[:max(number, 0)]]

    def language_detection_langid(self, text: str, number: int = 3) -> List[Dict[str, float]]:
        """Rank languages with ``langid`` using normalised probabilities.

        Args:
            text: Text to classify.
            number: Upper bound of the slice taken from the ranking.

        Returns:
            ``{language_code: probability}`` dicts for the first ``number``
            entries of the identifier's ranking.
        """
        import langid
        # getattr keeps this working for instances created without __init__.
        if getattr(self, "_langid_identifier", None) is None:
            self._langid_identifier = langid.langid.LanguageIdentifier.from_modelstring(langid.langid.model, norm_probs=True)
        ranked = self._langid_identifier.rank(self._single_line(text))
        return [{pred[0]: float(pred[1])} for pred in ranked[:number]]

    def language_detection(self, texts: Union[str, List[str]], max_value: int = 3) -> List[dict]:
        """Detect the language of one text or of every text in a list.

        Args:
            texts: A single string, or a list of strings.
            max_value: How many ranked languages to request per text.

        Returns:
            For a string: a list of ``{language_code: score}`` dicts.
            For a list: one ``{text: [ {language_code: score}, ... ]}`` dict
            per input text, in input order.

        Raises:
            Exception: if the selected backend fails; the original error is
                attached as ``__cause__``.
        """
        if isinstance(texts, list):
            return [{doc: self.language_detection(texts=doc, max_value=max_value)} for doc in texts]
        try:
            return self._lang_detectors(text=texts, number=max_value)
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through; `from err` keeps the real failure visible.
            raise Exception("This method might not be installed, please check with the dev") from err


In [28]:
# Cache directory: the LD_CACHE environment variable wins when it is set,
# otherwise fall back to an 'ld_v2' folder inside the torch home directory.
CACHE_FOLDER = (
    os.environ['LD_CACHE']
    if 'LD_CACHE' in os.environ
    else os.path.join(torch.hub._get_torch_home(), 'ld_v2')
)
model_path = os.path.join(CACHE_FOLDER, 'lid.176.ftz')

# Download the fastText language-identification model only when it is not
# already present in the cache.
if not os.path.exists(model_path):
    http_get(
        'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz',
        model_path,
    )

In [29]:
# Describe the request: one English sentence, fastText backend, top 3 languages.
options: LanguageSearchOption = LanguageSearchOption(texts="This is a test", model_name="fasttext", max_value=3)

# Build the detector for the requested backend and rank the languages of the text.
language_detector = LanguageDetector(model_name=options.model_name)
res = language_detector.language_detection(
    texts=options.texts,
    max_value=options.max_value,
)

In [30]:
# Show the ranked predictions computed in the previous cell: a list of
# single-entry {language_code: score} dicts.
print(res)

[{'en': 0.9818459153175354}, {'bn': 0.002121715107932687}, {'hi': 0.0012636820320039988}]
