## CLD3

In [1]:
import cld3
# The module used is pycld3: https://github.com/bsolomon1124/pycld3 (see that page for installation instructions).
# Uncomment the lines below to install it; the brew commands assume Homebrew is available.
#!pip install cpython==3.5
#!brew update
#!brew upgrade protobuf || brew install -v protobuf
#!python -m pip install -U pycld3

cld3 exposes two functions, `get_language` and `get_frequent_languages`. Let's see how they perform.

In [2]:
# Mixed input: a short French phrase followed by an English-looking URL
cld3.get_language("Je veux que: https://site.english.com/this/is/a/url/path/component#fragment")

LanguagePrediction(language='en', probability=0.5319557189941406, is_reliable=False, proportion=1.0)

In [5]:
# Same mixed input, asking for up to 3 languages; unused slots come back as 'und'
cld3.get_frequent_languages("Je veux que: https://site.english.com/this/is/a/url/path/component#fragment", num_langs=3)

[LanguagePrediction(language='en', probability=0.5319557189941406, is_reliable=False, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

### Transliterated text

Now let's try to predict the language of some transliterated text using cld3. The library supports the following languages:

    "eo", "co", "eu", "ta", "de", "mt", "ps", "te", "su", "uz", "zh-Latn", "ne",
    "nl", "sw", "sq", "hmn", "ja", "no", "mn", "so", "ko", "th", "kk", "sl",
    "ig", "mr", "zu", "ml", "hr", "bs", "lo", "sd", "cy", "hy", "uk", "pt",
    "yi", "lv", "iw", "cs", "vi", "jv", "be", "km", "mk", "tr", "am", "zh",
    "da", "sv", "fi", "ht", "af", "la", "id", "fil", "sm", "ca", "el", "ka",
    "sr", "it", "sk", "ru", "ru-Latn", "bg", "ny", "fa", "fy", "haw", "gl",
    "et", "ms", "gd", "bg-Latn", "ha", "is", "ur", "mi", "hi", "bn", "hi-Latn",
    "fr", "hu", "xh", "my", "tg", "ro", "ar", "lb", "el-Latn", "st", "ceb",
    "kn", "az", "si", "ky", "mg", "en", "gu", "es", "pl", "ja-Latn", "ga", "lt",
    "sn", "yo", "pa", "ku",


In [8]:
# Transliterated Russian is detected correctly because 'ru-Latn' is one of the
# languages supported by this library.
# get_frequent_languages returns a list with num_langs entries (unused slots are
# 'und'), which is the shape of the output recorded for this cell.
cld3.get_frequent_languages("Privet, kak tebya zovut?", num_langs=4)

[LanguagePrediction(language='ru-Latn', probability=0.8417865633964539, is_reliable=True, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [12]:
# Russian in native (Cyrillic) script for "I am going to the market today"
cld3.get_frequent_languages("Я иду на рынок сегодня", num_langs=4)

[LanguagePrediction(language='ru', probability=0.995514452457428, is_reliable=True, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [7]:
# Hindi transliterated to Roman characters. 'hi-Latn' is in the supported list above,
# so this should work, but the library doesn't do a good job.
cld3.get_frequent_languages("Main Madhuri Dixit banna chahti hoon", num_langs=4)
# Predicts Finnish ('fi')

[LanguagePrediction(language='fi', probability=0.47270092368125916, is_reliable=False, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [9]:
# This is transliterated Hindi for "How are you? I'm good"
cld3.get_frequent_languages("Kya haal hai? Main achhi hoon", num_langs=4)
# Predicts Scottish Gaelic ('gd')

[LanguagePrediction(language='gd', probability=0.4288159906864166, is_reliable=False, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [11]:
# Swahili sentence (Swahili is normally written in Latin script, so this is not transliterated).
# Intended meaning: going to the market today -- NOTE(review): verify the translation.
cld3.get_frequent_languages("Naenda kwa alama leo", num_langs=5)

[LanguagePrediction(language='sw', probability=0.9990677237510681, is_reliable=True, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [14]:
# Let's try code-switching: a short Spanglish phrase meaning "but like"
cld3.get_frequent_languages("Pero like", num_langs=2)
# Predicts Maori ('mi'), an indigenous language of New Zealand

[LanguagePrediction(language='mi', probability=0.8353496193885803, is_reliable=True, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [16]:
# Another Spanglish phrase mixing Spanish and English words
cld3.get_frequent_languages("Cojelo con take it easy", num_langs=2)
# Predicts English, but flags the prediction as unreliable

[LanguagePrediction(language='en', probability=0.41589194536209106, is_reliable=False, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

In [20]:
# Franglais: a French sentence with English borrowings
cld3.get_frequent_languages("Ce week-end va être super cool.", num_langs=2)
# Predicts Catalan ('ca'), a Western Romance language derived from Latin

[LanguagePrediction(language='ca', probability=0.5457612872123718, is_reliable=False, proportion=1.0),
 LanguagePrediction(language='und', probability=0.0, is_reliable=False, proportion=0.0)]

## LangID

In [None]:
# https://github.com/saffsd/langid.py
# This library does NOT support any transliterated languages. Uncomment the line below to install it.
#!pip install langid
#!mv langid.py/ langid/
# the above line is optional

In [13]:
# NOTE: `!cd` runs in a temporary subshell, so it does NOT change the notebook's
# working directory; the langid imports below are therefore resolved from the
# directory the notebook was started in. Use `%cd` only if a persistent change is intended.
!cd langid/

In [14]:
from langid import langid

In [15]:
# Without probability normalization the second value is a raw score, not a probability
langid.classify("Hello! How are you this fine day?")

('en', -45.242400884628296)

In [19]:
from langid.langid.langid import LanguageIdentifier, model

In [20]:
# Probability normalization: build an identifier with norm_probs=True so the
# second value of the result is reported as a probability
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
identifier.classify("Hello! How are you this fine day?")

('en', 0.9999998136419443)

In [22]:
# Same French phrase + English-looking URL that was given to cld3 above
identifier.classify("Je veux que: https://site.english.com/this/is/a/url/path/component#fragment")

('fr', 0.9999992773657777)

In [23]:
# Expected: Swahili ('sw'), predicted: Malay ('ms')
identifier.classify("Naenda kwa alama leo")

('ms', 0.5526980270071106)

In [24]:
# Let's try it on Hindi text transliterated to Roman characters
identifier.classify("Main Madhuri Dixit banna chahti hoon")
# Predicts Irish ('ga')

('ga', 0.5746522148102053)

In [25]:
# Does well on Hindi in its original (Devanagari) script
identifier.classify("अगर आप हिंदी में जानकारी पढ़ना पसंद करते हैं ")

('hi', 0.9999999749526877)

### Set languages


Let's say you know the dataset can only be in a few languages. With `set_languages` you can restrict the set of candidate languages.

In [31]:
# Fresh identifier restricted to four candidate languages; transliterated Russian
# is still not recognised as Russian (the prediction is Slovak, 'sk')
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
identifier.set_languages(['ru', 'en', 'it', 'sk'])
identifier.classify("Privet, kak tebya zovut?")

('sk', 0.66767735005164)

In [29]:
# Restrict the candidates to Hindi and English only
identifier.set_languages(['hi','en'])

In [30]:
# Even with only Hindi and English allowed, transliterated Hindi is labelled English
identifier.classify("Main Madhuri Dixit banna chahti hoon")

('en', 0.999999999991774)

## Langdetect

In [50]:
#!pip install langdetect
from langdetect import detect_langs, detect

Supports these 55 languages:
```
af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
```

In [51]:
# Same French phrase + English-looking URL used with the other libraries
detect_langs("Je veux que: https://site.english.com/this/is/a/url/path/component#fragment")

[fr:0.9999969224503005]

In [52]:
# Transliterated Russian
detect_langs("Privet, kak tebya zovut?")
# Predicts Slovak, Albanian, Hungarian

[sk:0.7142799432378928, sq:0.14285822332557807, hu:0.14285767370622154]

In [53]:
# Re-running the exact same call gives a different set of candidates and probabilities
detect_langs("Privet, kak tebya zovut?")

[sk:0.4081787863462692,
 sl:0.30518415356009926,
 hu:0.14285673157416331,
 sq:0.14285645846989373]

*Be careful!* This library is non-deterministic, so its predictions can change every time you run it. Setting `DetectorFactory.seed`, as in the next cell, makes the results reproducible.

In [54]:
# Fix the detector's seed so that repeated calls give the same result
from langdetect import DetectorFactory
DetectorFactory.seed = 0

In [55]:
# First call after fixing the seed
detect_langs("Privet, kak tebya zovut?")

[hr:0.2857142151330604,
 sk:0.28571326273782804,
 sq:0.2857128308459763,
 hu:0.1428565021344586]

In [56]:
# Second call with the seed fixed: the result is identical to the previous cell
detect_langs("Privet, kak tebya zovut?")

[hr:0.2857142151330604,
 sk:0.28571326273782804,
 sq:0.2857128308459763,
 hu:0.1428565021344586]

In [57]:
# Russian in native (Cyrillic) script is detected correctly
detect_langs("Я иду на рынок сегодня")

[ru:0.9999959546567648]

In [58]:
# Transliterated Hindi for "How are you? I'm good"
detect_langs("Kya haal hai? Main achhi hoon")
# Transliterated Hindi is detected as Somali ('so')

[so:0.9999979650989328]

## Annotated Datasets

Often what we find is that, while we can gather data in one particular language very easily thanks to well-established corpora in the NLP literature, it is extremely difficult to detect transliterated text in Tweets or anywhere else on the internet because of the problems shown above.

To solve this and obtain a dataset of good size, you often need to build your own dataset and annotate it manually. To build a transliterated dataset, it is often easy to go from the native script to the Roman script using simple rules; the section below shows how this can be done. Using the ISO codes, we can convert all native-script text pieces and then either train a model to distinguish English vs. Lang (Roman script), or simply train a language model that can be fed to a model to generate text.

Once you have a model that is able to recognize transliterated text, you could use a web crawler to collect text on the internet that has a high probability of being transliterated/code-switched.

In [5]:
# I create a very simple transliterator here for Hindi
import json
# Devanagari consonant letters, in the same order as before, laid out in rows.
# NOTE(review): the list does not contain ङ, ञ, ण or any nukta/conjunct forms --
# confirm whether that is intentional.
CONSONANTS = [
    "क", "च", "ट", "त", "प",   # unaspirated voiceless stops
    "ख", "छ", "ठ", "थ", "फ",   # aspirated voiceless stops
    "ग", "ज", "ड", "द", "ब",   # unaspirated voiced stops
    "घ", "झ", "ढ", "ध", "भ",   # aspirated voiced stops
    "न", "म", "ह", "य", "र", "ल", "व", "श", "ष", "स",   # remaining consonants
]

In [6]:
class Transliterator(object):
    """Very simple rule-based transliterator from Devanagari to Roman script.

    The per-character mapping is read from a JSON file whose keys are single
    characters and whose values are either a string or a list of candidate
    strings (the first candidate is used). A few hand-written context rules
    are applied on top of that mapping; see ``convert``.
    """

    # Default location of the rules file; used when no path is supplied.
    DEFAULT_RULES_PATH = "../rules.json"

    def __init__(self, rules_path=DEFAULT_RULES_PATH):
        """Create a transliterator and load its rules.

        Args:
            rules_path: Path of the JSON rules file. Defaults to
                ``"../rules.json"``.
        """
        self.rules_path = rules_path
        self.load_rules()

    def load_rules(self, rules_path=None):
        """(Re)load the character mapping and the character classes.

        Args:
            rules_path: Optional path overriding the one given at
                construction time.

        Raises:
            OSError: If the rules file cannot be opened.
            json.JSONDecodeError: If the rules file is not valid JSON.
        """
        if rules_path is not None:
            self.rules_path = rules_path
        # getattr keeps load_rules usable on instances created without __init__.
        path = getattr(self, "rules_path", self.DEFAULT_RULES_PATH)
        with open(path, 'r', encoding='utf-8') as f:
            self.rules = json.load(f)

        self.consonants = CONSONANTS
        self.ein = ["े"]
        self.n_sounds = ["ं", "ँ"]

    def convert(self, string):
        """Transliterate ``string`` word by word.

        Words are separated on single spaces. Characters that have no entry
        in the rules (punctuation, for instance) are dropped.

        Args:
            string: Text to transliterate.

        Returns:
            The transliterated words joined by single spaces.
        """
        # Sets give constant-time membership tests inside the character loop.
        consonants = set(self.consonants)
        ein = set(self.ein)
        n_sounds = set(self.n_sounds)

        converted = []
        for word in string.split(' '):
            last = len(word) - 1
            pieces = []
            for i, letter in enumerate(word):
                op = self.rules.get(letter, "")
                if isinstance(op, list):
                    # First candidate wins; an empty candidate list maps to
                    # nothing instead of raising IndexError.
                    op = op[0] if op else ""
                if i < last:
                    following = word[i + 1]
                    # Two consonants in a row: insert "a" between them.
                    if letter in consonants and following in consonants:
                        op += "a"
                    # The "े" sign followed by a nasal mark gets an extra "i".
                    if letter in ein and following in n_sounds:
                        op += "i"
                elif letter == "ए":
                    # A word-final "ए" is always written "ye".
                    op = "ye"
                pieces.append(op)
            converted.append("".join(pieces))
        return " ".join(converted)


In [7]:
translit = Transliterator()

In [10]:
# Here we transliterate a sentence from the Hindi Wiki page for Python
translit.convert("पाइथन एक सामान्य कार्यों के लिए उपयुक्त, उच्च स्तरीय प्रोग्रामिंग भाषा, इन्टरैक्टिव, ऑब्जेक्ट ओरिएन्टेड, स्क्रिप्टिंग भाषा है। इस भाषा को इस तरह से डिजाइन किया गया है ताकि इसमें लिखे गए कोड आसानी से पढ़े और समझे जा सकें।")

'paithan ek samany karyon ke liye upayukt uchch stariy programing bhasha intaraiktiv aubjekt oriented skripting bhasha hai is bhasha ko is tarah se dijain kiya gaya hai taki isamein likhe gye kod aasani se padhe aur samajhe ja sakein'

#### Indic language detector

For your own language, you can probably find sophisticated detectors already; if you do, you can either use their models or fine-tune them. For Hindi, I found [this](https://github.com/irshadbhat/csnli) great transliterated-text detector that also back-transliterates the Roman-script text to its original script. If you're interested, you can go and play with their code; it has models for NMT between Hindi script (Devanagari) and Roman-script Hindi.