# Linguistica - Unsupervised Learning of Morphology

https://linguistica-uchicago.github.io/lxa5/

installing linguistica

In [None]:
!pip install linguistica


Collecting linguistica
  Downloading linguistica-5.2.1-py2.py3-none-any.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: linguistica
Successfully installed linguistica-5.2.1


The corpus files are stored on Google Drive.

Mounting Google Drive.

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that the corpus files stored there
# can be read like local files.
# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted drive that contains the corpus files.
# The trailing "/" is required: the corpus file names are appended to this
# string by plain concatenation when the corpora are read.
filepath = "/content/drive/My Drive/UniKonstanz/South Asian NLP/data/"


Reading the corpus.

It is better to pre-process the corpus first, because otherwise the Urdu/Arabic punctuation marks remain attached to the words in the current processing.
(Note: this does not matter for the current file, as it is in Roman script.)

In [None]:
import linguistica as lxa

# Settings shared by both corpora, kept in one place so the two analyses are
# guaranteed to run with identical values. Parameters that are not listed
# here keep Linguistica's defaults (see the output of `.parameters()`).
LXA_PARAMS = dict(
    max_affix_length=5,
    min_stem_length=3,
    min_sig_count=3,
    max_word_tokens=15000,
)

# One lexicon object per corpus: Roman Urdu news text and Roman Tamil text.
lxa_ur = lxa.read_corpus(filepath + "roman_urdu_news.txt", **LXA_PARAMS)
lxa_ta = lxa.read_corpus(filepath + "Tamil_Roman_Text.txt", **LXA_PARAMS)


# **Tamil Roman Text**

In [None]:
# Show the parameter values in effect for the Tamil corpus
# (those set explicitly when reading the corpus plus the library defaults).
lxa_ta.parameters()

{'max_word_tokens': 15000,
 'min_stem_length': 3,
 'max_affix_length': 5,
 'min_sig_count': 3,
 'n_neighbors': 9,
 'n_eigenvectors': 11,
 'min_context_count': 3,
 'max_word_types': 1000,
 'suffixing': 1,
 'keep_case': 0}

In [None]:
# List every affix found in the Tamil corpus; 'NULL' is listed among them.
lxa_ta.affixes()

{'NULL',
 'a',
 'aa',
 'aaa',
 'ae',
 'aga',
 'ala',
 'am',
 'an',
 'ana',
 'da',
 'di',
 'e',
 'ed',
 'en',
 'ga',
 'h',
 'ha',
 'hu',
 'i',
 'ing',
 'k',
 'ka',
 'ku',
 'la',
 'luku',
 'lum',
 'ly',
 'm',
 'n',
 'na',
 'nga',
 'ngala',
 'nu',
 'num',
 'o',
 'r',
 's',
 'ss',
 'than',
 'thu',
 'u',
 'uku',
 'um',
 'va',
 'w',
 'y',
 'ya',
 'ye',
 'yum'}

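**Signatures of the Tamil Roman text** (the results for the Urdu Roman text start further below, at `lxa_ur.parameters()`)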

In [None]:
# List all signatures of the Tamil corpus; each signature is a tuple of affixes.
lxa_ta.signatures()

{('NULL', 'a'),
 ('NULL', 'aa'),
 ('NULL', 'aaa'),
 ('NULL', 'am'),
 ('NULL', 'da'),
 ('NULL', 'di'),
 ('NULL', 'e'),
 ('NULL', 'ed'),
 ('NULL', 'ga'),
 ('NULL', 'h'),
 ('NULL', 'ha'),
 ('NULL', 'hu'),
 ('NULL', 'i'),
 ('NULL', 'ing'),
 ('NULL', 'k'),
 ('NULL', 'ku'),
 ('NULL', 'la'),
 ('NULL', 'luku'),
 ('NULL', 'lum'),
 ('NULL', 'ly'),
 ('NULL', 'm'),
 ('NULL', 'm', 'nu'),
 ('NULL', 'n'),
 ('NULL', 'na'),
 ('NULL', 'nga'),
 ('NULL', 'nu'),
 ('NULL', 'num'),
 ('NULL', 'r'),
 ('NULL', 's'),
 ('NULL', 'ss'),
 ('NULL', 'than'),
 ('NULL', 'thu'),
 ('NULL', 'u'),
 ('NULL', 'uku'),
 ('NULL', 'va'),
 ('NULL', 'w'),
 ('NULL', 'y'),
 ('NULL', 'ya'),
 ('NULL', 'ye'),
 ('NULL', 'yum'),
 ('a', 'e'),
 ('a', 'i'),
 ('a', 'o'),
 ('a', 'u'),
 ('ae', 'e'),
 ('aga', 'u'),
 ('ala', 'um'),
 ('am', 'm'),
 ('an', 'en'),
 ('an', 'n'),
 ('ana', 'na'),
 ('e', 'ing'),
 ('e', 'um'),
 ('ga', 'ka'),
 ('ga', 'nga'),
 ('ga', 'nga', 'ngala'),
 ('i', 'u'),
 ('i', 'y'),
 ('ku', 'u'),
 ('m', 'n'),
 ('n', 'r')}

In [None]:
# Map each Tamil signature to the set of stems that occur with it.
# NOTE(review): this displays the complete mapping, so the output is long.
lxa_ta.signatures_to_stems()

{('a', 'e'): {'#thal',
  'agal',
  'dat',
  'eruking',
  'kitt',
  'kollam',
  'nadik',
  'niray',
  'soldravang',
  'vay'},
 ('NULL', 'a'): {'*tha',
  'ahaaa',
  'apdiye',
  'athigam',
  'dekh',
  'done',
  'inka',
  'innoruthar',
  'inoruthar',
  'irukum',
  'irundha',
  'ithan',
  'kaaga',
  'kamal',
  'kandipa',
  'kollaam',
  'lam',
  'laye',
  'leval',
  'likes',
  'maranam',
  'neram',
  'oruthar',
  'parunga',
  'ready',
  'record',
  'sanda',
  'sapda',
  'sethupathy',
  'shradha',
  'solluravanga',
  'super',
  'terika',
  'thalaivaaaaaaaaaaa',
  'thalavar',
  'theriyath',
  'varusam',
  'vegam',
  'vijay',
  'villain',
  'yan',
  'yennada'},
 ('NULL', 'k'): {'102', '400', '700', 'karthi'},
 ('NULL', 'aaa'): {'anna', 'daaaaaa', 'semmaaaa', 'thalaaaaaaaaa'},
 ('NULL', 'luku'): {'enga', 'pathavanga', 'yenga'},
 ('NULL', 'aa'): {'aaaa',
  'nna',
  'paaaaaaa',
  'pakkka',
  'pothum',
  'semmma',
  'verithanam'},
 ('NULL', 'la'): {'chennai',
  'cinema',
  'gandu',
  'irukinga',
  

The parameters used for morphology learning on the Urdu corpus (`lxa_ur`). Some values were set above when the corpus was read; the others keep their default values.

In [None]:
# Show the parameter values in effect for the Urdu corpus.
lxa_ur.parameters()

{'max_word_tokens': 15000,
 'min_stem_length': 3,
 'max_affix_length': 5,
 'min_sig_count': 3,
 'n_neighbors': 9,
 'n_eigenvectors': 11,
 'min_context_count': 3,
 'max_word_types': 1000,
 'suffixing': 1,
 'keep_case': 0}

Listing all the affixes found.

In [None]:
# List every affix found in the Urdu corpus.
# NOTE(review): the recorded output is dominated by fragments such as '00>' ... '99>',
# which look like pieces of markup tags rather than morphemes -- the corpus
# probably needs cleaning first (TODO confirm against the corpus file).
lxa_ur.affixes()

{'00>',
 '01>',
 '02>',
 '03>',
 '04>',
 '05>',
 '06>',
 '07>',
 '08>',
 '09>',
 '0>',
 '10>',
 '11>',
 '12>',
 '13>',
 '14>',
 '15>',
 '16>',
 '17>',
 '18>',
 '19>',
 '1>',
 '20>',
 '21>',
 '22>',
 '23>',
 '24>',
 '25>',
 '26>',
 '27>',
 '28>',
 '29>',
 '2>',
 '30>',
 '31>',
 '32>',
 '33>',
 '34>',
 '35>',
 '36>',
 '37>',
 '38>',
 '39>',
 '3>',
 '40>',
 '41>',
 '42>',
 '43>',
 '44>',
 '45>',
 '46>',
 '47>',
 '48>',
 '49>',
 '4>',
 '50>',
 '51>',
 '52>',
 '53>',
 '54>',
 '55>',
 '56>',
 '57>',
 '58>',
 '59>',
 '5>',
 '60>',
 '61>',
 '62>',
 '63>',
 '64>',
 '65>',
 '66>',
 '67>',
 '68>',
 '69>',
 '6>',
 '70>',
 '71>',
 '72>',
 '73>',
 '74>',
 '75>',
 '76>',
 '77>',
 '78>',
 '79>',
 '7>',
 '80>',
 '81>',
 '82>',
 '83>',
 '84>',
 '85>',
 '86>',
 '87>',
 '88>',
 '89>',
 '8>',
 '90>',
 '91>',
 '92>',
 '93>',
 '94>',
 '95>',
 '96>',
 '97>',
 '98>',
 '99>',
 '9>',
 '>',
 'NULL',
 'a',
 'at',
 'atī',
 'aṉ',
 'e',
 'h',
 'iī',
 'l',
 'ne',
 'ng',
 'rz',
 's',
 't',
 'uṉ',
 'z',
 'ī',
 'ئی',
 'ا

Listing all the signatures. A signature is a set of affixes that all attach to the same set of stems.

In [None]:
# List all signatures of the Urdu corpus; each signature is a tuple of affixes.
lxa_ur.signatures()

{('00>',
  '01>',
  '02>',
  '03>',
  '04>',
  '05>',
  '06>',
  '07>',
  '08>',
  '09>',
  '0>',
  '10>',
  '11>',
  '12>',
  '13>',
  '14>',
  '15>',
  '16>',
  '17>',
  '18>',
  '19>',
  '1>',
  '20>',
  '21>',
  '22>',
  '23>',
  '24>',
  '25>',
  '26>',
  '27>',
  '28>',
  '29>',
  '2>',
  '30>',
  '31>',
  '32>',
  '33>',
  '34>',
  '35>',
  '36>',
  '37>',
  '38>',
  '39>',
  '3>',
  '40>',
  '41>',
  '42>',
  '43>',
  '44>',
  '45>',
  '46>',
  '47>',
  '48>',
  '49>',
  '4>',
  '50>',
  '51>',
  '52>',
  '53>',
  '54>',
  '55>',
  '56>',
  '57>',
  '58>',
  '59>',
  '5>',
  '60>',
  '61>',
  '62>',
  '63>',
  '64>',
  '65>',
  '66>',
  '67>',
  '68>',
  '69>',
  '6>',
  '70>',
  '71>',
  '72>',
  '73>',
  '74>',
  '75>',
  '76>',
  '77>',
  '78>',
  '79>',
  '7>',
  '80>',
  '81>',
  '82>',
  '83>',
  '84>',
  '85>',
  '86>',
  '87>',
  '88>',
  '89>',
  '8>',
  '90>',
  '91>',
  '92>',
  '93>',
  '94>',
  '95>',
  '96>',
  '97>',
  '98>',
  '99>',
  '9>',
  '>'),
 ('0>', '1>'

The signatures together with the stems that take each signature.

In [None]:
# Map each Urdu signature to the set of stems that occur with it.
# NOTE(review): the first entries of the recorded output are tag-like stems
# such as '<r10' and '<u53' paired with numeric "affixes" -- presumably markup
# left in the corpus; TODO confirm and strip it before learning.
lxa_ur.signatures_to_stems()

{('0>', '1>', '2>', '3>', '4>', '5>', '6>', '7>', '8>', '9>', '>'): {'<r10',
  '<r11',
  '<r12',
  '<r13',
  '<r14',
  '<r15',
  '<r16',
  '<r17',
  '<r18',
  '<r19',
  '<r20',
  '<r21',
  '<r22',
  '<r23',
  '<r24',
  '<r25',
  '<r26',
  '<r27',
  '<r28',
  '<r29',
  '<r30',
  '<r31',
  '<r32',
  '<r33',
  '<r34',
  '<r35',
  '<r36',
  '<r37',
  '<r38',
  '<r39',
  '<r40',
  '<r41',
  '<r42',
  '<r43',
  '<r44',
  '<r45',
  '<r46',
  '<r47',
  '<r48',
  '<r49',
  '<r50',
  '<r51',
  '<r52',
  '<r53',
  '<r54',
  '<r55',
  '<r6',
  '<r7',
  '<r8',
  '<r9',
  '<u10',
  '<u11',
  '<u12',
  '<u13',
  '<u14',
  '<u15',
  '<u16',
  '<u17',
  '<u18',
  '<u19',
  '<u20',
  '<u21',
  '<u22',
  '<u23',
  '<u24',
  '<u25',
  '<u26',
  '<u27',
  '<u28',
  '<u29',
  '<u30',
  '<u31',
  '<u32',
  '<u33',
  '<u34',
  '<u35',
  '<u36',
  '<u37',
  '<u38',
  '<u39',
  '<u40',
  '<u41',
  '<u42',
  '<u43',
  '<u44',
  '<u45',
  '<u46',
  '<u47',
  '<u48',
  '<u49',
  '<u50',
  '<u51',
  '<u52',
  '<u53

In [None]:
# Keep the signature -> stems mapping of the Urdu corpus so it can be filtered below.
stem_sigs = lxa_ur.signatures_to_stems()


In [None]:
# Print only the signatures that consist of 3 to 7 affixes, together with
# their position in the mapping and the stems that take them.
for idx, (signature, stems) in enumerate(stem_sigs.items()):
  if 2 < len(signature) < 8:
    print(idx, signature, stems)

6 ('NULL', 'uṉ', 'ī') {'karubar', 'hkumt', 'kar', 'dar', 'tbdīl', 'khrīdar'}
7 ('e', 'h', 'uṉ') {'mns3ub', 'ma2ahd', 'sha2b'}
29 ('NULL', 'وں', 'ی') {'حکومت', 'خریدار', 'کاروبار', 'دار', 'تبدیل', 'کار'}
30 ('وں', 'ہ', 'ے') {'منصوب', 'معاہد', 'شعب'}


# Polyglot - using pre-trained Morfessor models

https://polyglot.readthedocs.io/

Installing the libraries required by polyglot



In [None]:
!brew install icu4c
!export ICU_VERSION=58
!export PYICU_INCLUDES=/usr/local/Cellar/icu4c/58.2/include
!export PYICU_LFLAGS=-L/usr/local/Cellar/icu4c/58.2/lib
!pip install pyicu

/bin/bash: line 1: brew: command not found


In [None]:
!pip install pycld2


Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycld2
  Building wheel for pycld2 (setup.py) ... [?25l[?25hdone
  Created wheel for pycld2: filename=pycld2-0.41-cp310-cp310-linux_x86_64.whl size=9904066 sha256=c22eaebb3c5050c15962c681ab31138fc86a0b73f5f5af882847c4bdce532687
  Stored in directory: /root/.cache/pip/wheels/be/81/31/240c89c845e008a93d98542325270007de595bfd356eb0b06c
Successfully built pycld2
Installing collected packages: pycld2
Successfully installed pycld2-0.41


installing polyglot

In [None]:
!pip install polyglot


Collecting polyglot
  Downloading polyglot-16.7.4.tar.gz (126 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.3/126.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: polyglot
  Building wheel for polyglot (setup.py) ... [?25l[?25hdone
  Created wheel for polyglot: filename=polyglot-16.7.4-py2.py3-none-any.whl size=52562 sha256=7100189ad7e9b9fd39c486cd7d48a280db614441fad573984d0bd141cd97f84c
  Stored in directory: /root/.cache/pip/wheels/aa/92/4a/b172589446ba537db3bdb9a1f2204f27fe71217981c14ac368
Successfully built polyglot
Installing collected packages: polyglot
Successfully installed polyglot-16.7.4


In [None]:
!pip install morfessor

Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: morfessor
Successfully installed morfessor-2.0.6


In [None]:
# Download polyglot's pre-trained morphology model for Urdu ('ur').
!polyglot download morph2.ur

[polyglot_data] Downloading package morph2.ur to
[polyglot_data]     /root/polyglot_data...


In [None]:
# Download polyglot's pre-trained morphology model for Hindi ('hi').
!polyglot download morph2.hi

[polyglot_data] Downloading package morph2.hi to
[polyglot_data]     /root/polyglot_data...


In [None]:
from polyglot.text import Text, Word

# A few inflected Urdu words; the language code 'ur' is passed explicitly.
urdu_sample = "لڑکیاں کتابیں لڑکوں ملکوں شہروں گرتا گرتے لکھواتی"
processed = Text(urdu_sample, 'ur')

# Show the morpheme segmentation of every word, one word per line.
for word in processed.words:
  print(word.morphemes)

['لڑ', 'ک', 'یاں']
['کتاب', 'یں']
['لڑ', 'ک', 'وں']
['ملک', 'وں']
['شہر', 'وں']
['گر', 'تا']
['گر', 'تے']
['لکھوا', 'تی']


In [None]:

# The same kind of word list in Hindi (Devanagari). The language code 'hi' is
# passed explicitly, as in the Urdu cells, so that the choice of morphology
# model does not depend on automatic language detection.
# The commas in the text are tokenized as separate "words" and show up in the output.
processed = Text("लड़कियां, किताबें , लड़कों , मूलकों , गिरता ,गीते ,गीति, लिखवाती ", 'hi')

for w in processed.words:
  print(w.morphemes)

['लड़क', 'ियां']
[',']
['किताब', 'ें']
[',']
['लड़क', 'ों']
[',']
['मूल', 'कों']
[',']
['गिर', 'ता']
[',']
['गी', 'ते']
[',']
['गी', 'ति']
[',']
['लिख', 'वात', 'ी']


In [None]:
# Urdu text written without spaces between the words, to see how the
# morphology model segments such run-together tokens.
urdu_unspaced = "بعدازنمازمغرب کبوتراڑرہےتھے "
processed = Text(urdu_unspaced, 'ur')

for word in processed.words:
  print(word.morphemes)

['بعد', 'از', 'نماز', 'م', 'غرب']
['ک', 'بو', 'تر', 'اڑ', 'رہے', 'تھے']


In [None]:
# Install the Hugging Face `tokenizers` package.
# NOTE(review): `tokenizers` is not imported in any cell visible in this
# notebook -- confirm whether this install is still needed.
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub<0.18,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface_hub, tokenizers
Successfully installed huggingface_hub-0.17.3 tokenizers-0.14.1


In [None]:
!pip install word-piece-tokenizer


Collecting word-piece-tokenizer
  Downloading word_piece_tokenizer-1.0.1-py3-none-any.whl (119 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: word-piece-tokenizer
Successfully installed word-piece-tokenizer-1.0.1


In [None]:
from word_piece_tokenizer import WordPieceTokenizer
tokenizer = WordPieceTokenizer()

# Encode the sentence into WordPiece token ids.
ids = tokenizer.tokenize('reading a storybooks!')
# e.g. [101, 3752, 1037, 2466, 8654, 999, 102]
# NOTE(review): this id list appears to belong to the input 'storybook' (singular);
# it has not been re-checked for 'storybooks' -- TODO confirm by displaying `ids`.

# Map the ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(ids)
# ['[CLS]', 'reading', 'a', 'story', '##books', '!', '[SEP]']


# Display the tokens as the cell output.
tokens

['[CLS]', 'reading', 'a', 'story', '##books', '!', '[SEP]']