# Language Identification App Using Gradio

Anggota Kelompok:

2440016804 - Rio Pramana

2440087214 - Debora

2440030323 - Enrico Fernandez

## Install and Import Libraries

In [1]:
pip install -q gradio

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -q datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pickle
import pandas as pd
import numpy as np
import re
import warnings
import gradio as gr
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
from pprint import pprint
warnings.simplefilter("ignore")

## Define Functions to Load Model and Predict Input

In [4]:
# Module-level placeholders; they are rebound once load_all_models() is called.
lr_model = mnb_model = None


def load_all_models(lr_path="lr_model.pkl", mnb_path="mnb_model_tfidf.pkl"):
    """Load the two pickled models from disk.

    Args:
        lr_path: Path of the pickle file holding the first model
            (defaults to "lr_model.pkl").
        mnb_path: Path of the pickle file holding the second model
            (defaults to "mnb_model_tfidf.pkl").

    Returns:
        tuple: ``(lr, mnb)`` - the objects unpickled from ``lr_path`` and
        ``mnb_path``, in that order.

    Raises:
        OSError: If either file cannot be opened (e.g. FileNotFoundError).
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point these paths at model files you created or fully trust.
    with open(lr_path, "rb") as lr_file:
        loaded_lr = pickle.load(lr_file)
    with open(mnb_path, "rb") as mnb_file:
        loaded_mnb = pickle.load(mnb_file)
    return loaded_lr, loaded_mnb

def clean_text(text):
    """Lower-case ``text`` and delete noise fragments from it.

    The following fragments are removed (replaced by the empty string):
      * ``@`` followed by one or more ASCII letters/digits,
      * any single character from the punctuation set in the pattern
        (``~`` and the backslash are not part of that set),
      * ``scheme://...`` sequences up to the next whitespace,
      * ``rt`` at the very start of the string,
      * ``http`` together with the one character that follows it.

    Whitespace is left untouched, so removed fragments can leave double spaces.
    """
    noise = re.compile(
        r'(@[A-Za-z0-9]+)|([!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}])|(\w+://\S+)|^rt|http.+?'
    )
    return noise.sub('', text.lower())

def transform_label_to_text(label):
    """Turn a two-letter language code into a name the user understands.

    Codes that are not in the table are returned unchanged.
    """
    language_names = {
        'ar': "Arabic",
        'bg': "Bulgarian",
        'de': "German",
        'el': "Modern Greek",
        'en': "English",
        'es': "Spanish",
        'fr': "French",
        'hi': "Hindi",
        'it': "Italian",
        'ja': "Japanese",
        'nl': "Dutch",
        'pl': "Polish",
        'pt': "Portuguese",
        'ru': "Russian",
        'sw': "Swahili",
        'th': "Thai",
        'tr': "Turkish",
        'ur': "Urdu",
        'vi': "Vietnamese",
        'zh': "Chinese",
    }
    return language_names.get(label, label)
    
def predict_input(model_choice, input):
    """Predict the language of ``input`` with the model named by ``model_choice``.

    Args:
        model_choice: "Logistic Regression (One vs All)" or
            "Multinomial Naive Bayes".
        input: Raw text to classify. The name shadows the builtin but is kept
            so existing keyword callers keep working.

    Returns:
        The result of ``transform_label_to_text`` for the predicted label.

    Raises:
        ValueError: If ``model_choice`` is not one of the two supported names.
    """
    # Pick the model first: previously an unsupported choice (including None)
    # fell through both branches and crashed with UnboundLocalError on `model`.
    if model_choice == "Logistic Regression (One vs All)":
        model = lr_model
    elif model_choice == "Multinomial Naive Bayes":
        model = mnb_model
    else:
        raise ValueError(f"Unknown model choice: {model_choice!r}")

    cleaned = clean_text(input)
    # The model receives a one-element batch; its output is decoded back to the
    # original label through the module-level encoder `le`.
    encoded = model.predict([cleaned])
    decoded = le.inverse_transform(encoded)
    return transform_label_to_text(decoded[0])

In [5]:
# Set up everything predict_input() reads at request time: the two pickled
# models and a label encoder used to decode their predictions.
lr_model, mnb_model = load_all_models()

# The training split is only needed as a source of labels to fit the encoder.
# NOTE(review): presumably these are the same labels the pickled models were
# trained on, so the encoding matches - confirm.
ds_train = load_dataset('papluca/language-identification', split='train').to_pandas()
le = LabelEncoder()
y_train = le.fit_transform(ds_train["labels"])

Using custom data configuration papluca--language-identification-a54c87a573b2eb6b
Reusing dataset csv (C:\Users\User\.cache\huggingface\datasets\papluca___csv\papluca--language-identification-a54c87a573b2eb6b\0.0.0\51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


In [6]:
# Optional smoke test: uncomment one of the calls below to check that prediction runs without errors
# predict_input("Logistic Regression (One vs All)", "Does this work")
# predict_input("Multinomial Naive Bayes", "Does this work")

## Building and Deploying the App with Gradio

In [7]:
# Components come from the top-level gradio namespace: the legacy `gr.inputs`
# module is deprecated in Gradio 3 and removed in Gradio 4, so the unpinned
# install at the top of this notebook would otherwise break this cell.
model_choice = gr.Dropdown(
    choices=["Logistic Regression (One vs All)", "Multinomial Naive Bayes"],
    # Pre-select a model so the callback never receives an empty choice.
    value="Logistic Regression (One vs All)",
)
app_inputs = gr.Textbox(lines=1, placeholder="Enter text here...")
demo = gr.Interface(fn=predict_input, inputs=[model_choice, app_inputs],
                    outputs='text',
                    title='Language Identification',
                    description = 'This app is used to identify which language your input belongs to. You can choose between 2 models, each of which has an accuracy score of 99%. The result of the language identification will be shown on the output box on the right. Thank you! <3 \n\n This app can identify 20 languages, which are: arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), urdu (ur), vietnamese (vi), and chinese (zh)')

# share=True additionally serves the app through a temporary public URL.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7863/
Running on public URL: https://35899.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<gradio.routes.App at 0x1f81784c730>,
 'http://127.0.0.1:7863/',
 'https://35899.gradio.app')