## Topic Modeling for Language Classification

In [75]:
# Imports for text clean-up, TF-IDF vectorisation and LDA topic-model visualisation
import re

import numpy as np
import pyLDAvis
import unicodedata
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

  and should_run_async(code)


In [None]:
def unicode_to_ascii(s):
    """Strip combining accent marks from a string.

    The text is decomposed with Unicode NFD normalisation, which splits an
    accented letter into its base letter followed by combining marks, and
    every character in the 'Mn' (non-spacing mark) category is then dropped.

    Note that characters with no NFD decomposition are kept as they are, so
    the result is not guaranteed to be pure ASCII despite the function name.

    Args:
        s: The string to process.

    Returns:
        A new string without non-spacing combining marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)


def preprocess_sentence(w):
    """Trim, lower-case and strip punctuation/digits from a sentence.

    Surrounding whitespace is removed and the text is lower-cased. Every
    occurrence of the characters ``? . ! , ¿ # @`` and the digits ``0-9`` is
    then deleted. The characters are removed outright, not padded with
    spaces, so whitespace that surrounded a removed character is left as-is.
    Accented letters are not altered.

    Args:
        w: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    cleaned = w.strip().lower()
    return re.sub(r"([?.!,¿#@0-9])", r"", cleaned)

In [60]:
import pandas as pd


# Lookup table read from 'family.csv'; the code below relies on its 'unit'
# and 'family' columns.
family = pd.read_csv('family.csv')

# Normalise the 'unit' column once, at load time: strip accents, lower-case,
# and keep only the first space-separated word. Doing this here rather than
# inside get_family avoids rescanning and overwriting the whole column on
# every single lookup.
family['unit'] = family['unit'].apply(
    lambda sent: unicode_to_ascii(sent).lower().split(' ')[0]
)


def get_family(language):
    """Look up the family entries whose normalised 'unit' equals `language`.

    The comparison is an exact match against the normalised 'unit' column
    (accent-stripped, lower-cased, first word only).
    NOTE(review): `language` itself is not normalised here; this presumably
    relies on callers passing lower-case, accent-free names -- confirm.

    Args:
        language: Name to match against the normalised 'unit' column.

    Returns:
        A pandas Series (possibly empty, possibly with several rows) holding
        the matching 'family' values converted to str.
    """
    matches = family['unit'] == language
    return family.loc[matches, 'family'].astype(str)

In [88]:
# Load the labelled sentences and discard every row whose LANG is 'portuguese'.
samples = pd.read_csv('datasets/samples.csv')
samples = samples.loc[samples['LANG'].ne('portuguese')]

# Labels are the LANG column; features are the cleaned TEXT column.
y = samples['LANG'].to_numpy()
X = samples['TEXT'].map(preprocess_sentence).to_numpy()

# One get_family lookup per label (each entry is whatever get_family returns).
y_family = [get_family(language) for language in y]

# Show the feature and label arrays as the cell output.
X, y

  and should_run_async(code)


(array(['yrome taroino kapu ae ayhtohpyry porohnõko mana ritonõpo apotunuru wino emero motye jamihme ritonõpo omiry mana jomiry roropa isaaro jamihme mana tykase jezu eya xine',
        'ynara tykase jezu eya xine —aparão enurupyra ro ahtao nae ro exine tykase jezu eya xine',
        'moroto awahtao xine tuaro ehtoko jomiry kurã ekarotohme imehnomo a moroto matose',
        ...,
        "'re ĩhâimana u'âsi mono hã wamama hã te te 're wa'azawi pese te duré te te 're wa'apawapto wapẽ'ẽ 're 're wanomro wẽ u'âsi mono da wahâimana nhiptete sina wanhimi'ẽ na sô 're wanhimipari u'âsi za'ra mono da ĩwẽ zô taha wa wa za wamama wi wa te 're rowaptẽrẽ u'âsi za'ra duré wanhib'apito zezu cristuhu wi zama ato sina 're aihâimana za'ra wa'aba mono da duré asiptete sina ĩwẽ na si tãma 're romhuri pese za'ra wa'aba mono da asimirowasu'u wẽ za'ra wa'aba na",
        "'re ĩhâimana u'âsi mono hã ma tô ĩ̱pisutu ĩsib'a'uwẽ norĩ te tãma 're apawapto mono da ai'udâna za'ra wa'aba te 're romhur

In [90]:
from sklearn.decomposition import LatentDirichletAllocation

# NOTE(review): make_multilabel_classification is not used in the visible
# cells; kept in case another cell relies on it.
from sklearn.datasets import make_multilabel_classification

print(X)

# Vectorise the sentences as TF-IDF weights over word bigrams and trigrams.
# These are weighted features, not raw token counts.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 3))
X_transformed = tfidf.fit_transform(X)

# Fit a 15-topic LDA model; random_state is fixed so the fit is repeatable.
lda = LatentDirichletAllocation(n_components=15, random_state=0).fit(X_transformed)

# Topic distribution of the last sample only, and the index of its
# highest-probability topic.
topics_probs = lda.transform(X_transformed[-1])
print(topics_probs.argmax())



  and should_run_async(code)


['yrome taroino kapu ae ayhtohpyry porohnõko mana ritonõpo apotunuru wino emero motye jamihme ritonõpo omiry mana jomiry roropa isaaro jamihme mana tykase jezu eya xine'
 'ynara tykase jezu eya xine —aparão enurupyra ro ahtao nae ro exine tykase jezu eya xine'
 'moroto awahtao xine tuaro ehtoko jomiry kurã ekarotohme imehnomo a moroto matose'
 ...
 "'re ĩhâimana u'âsi mono hã wamama hã te te 're wa'azawi pese te duré te te 're wa'apawapto wapẽ'ẽ 're 're wanomro wẽ u'âsi mono da wahâimana nhiptete sina wanhimi'ẽ na sô 're wanhimipari u'âsi za'ra mono da ĩwẽ zô taha wa wa za wamama wi wa te 're rowaptẽrẽ u'âsi za'ra duré wanhib'apito zezu cristuhu wi zama ato sina 're aihâimana za'ra wa'aba mono da duré asiptete sina ĩwẽ na si tãma 're romhuri pese za'ra wa'aba mono da asimirowasu'u wẽ za'ra wa'aba na"
 "'re ĩhâimana u'âsi mono hã ma tô ĩ̱pisutu ĩsib'a'uwẽ norĩ te tãma 're apawapto mono da ai'udâna za'ra wa'aba te 're romhuri mono da ma tô romhuri hã ĩ̱ma tisõ uburé ĩs

In [98]:
# Topic distribution for every document in the corpus.
topics_probs = lda.transform(X_transformed)
print(topics_probs)

# For each document, take the index of its most probable topic and use that
# number as a row label into family['family'].
# NOTE(review): nothing visible ties LDA topic numbers to rows of `family`,
# so this mapping looks arbitrary -- confirm the intended topic -> family
# correspondence before relying on family_pred.
family_pred = [family['family'][np.argmax(probs)] for probs in topics_probs]

# Sanity-check the model and corpus dimensions.
print(lda.components_.shape)
print(X.shape)

  and should_run_async(code)


[[0.00850983 0.00850983 0.00850983 ... 0.00850983 0.00850983 0.00850983]
 [0.01153065 0.01153065 0.01153065 ... 0.01153065 0.01153065 0.01153065]
 [0.01245393 0.01245393 0.01245393 ... 0.01245393 0.01245393 0.82564501]
 ...
 [0.00503063 0.00503063 0.00503063 ... 0.00503063 0.00503063 0.92957124]
 [0.00661592 0.00661592 0.00661592 ... 0.00661592 0.00661594 0.3503952 ]
 [0.00540454 0.00540454 0.00540454 ... 0.00540455 0.00540455 0.32666825]]
(15, 187399)
(2600,)


In [102]:
def get_num_words(doc):
    """Return how many tokens nltk's word_tokenize produces for `doc`.

    Args:
        doc: The text to tokenize.

    Returns:
        The number of tokens as an int.
    """
    tokens = word_tokenize(doc)
    return len(tokens)

# Number of tokens in each document, in corpus order.
doc_length = [get_num_words(doc) for doc in X]

# pyLDAvis expects every topic row to be a probability distribution over
# terms, whereas lda.components_ holds un-normalised topic-word weights, so
# normalise each row to sum to 1.
topic_term_dists = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

# tfidf.vocabulary_ maps term -> column index. Its key order is NOT the column
# order, and its values are indices rather than frequencies, so order the
# terms by column index to line the labels up with the columns of
# topic_term_dists.
vocab = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

# Corpus-wide weight of each term: the column sums of the document-term
# matrix. These are summed TF-IDF weights, not raw counts, because the
# matrix was built with TfidfVectorizer.
term_frequency = np.asarray(X_transformed.sum(axis=0)).ravel()

prep_data = pyLDAvis.prepare(topic_term_dists, topics_probs, doc_length, vocab, term_frequency)
pyLDAvis.display(prep_data)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


AttributeError: 'HTML' object has no attribute 'save_html'

In [104]:
# Write the prepared pyLDAvis visualisation to the file 'lda.html'
# (relative to the current working directory).
pyLDAvis.save_html(prep_data, 'lda.html')

  and should_run_async(code)
