In [2]:
#Install required libraries
!pip install gradio sklearn-crfsuite indic-nlp-library --quiet

In [3]:
import gradio as gr
import urllib.request
import os
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn_crfsuite import CRF
from nltk import word_tokenize, pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Universal Dependencies training treebanks and the local file names they
# are cached under.
hindi_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Hindi-HDTB/master/hi_hdtb-ud-train.conllu"
english_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu"
hindi_path = "hindi_pos_data.conllu"
english_path = "english_pos_data.conllu"


def _download_if_missing(url, path):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The data is first written to ``path + ".part"`` and only renamed to
    ``path`` once the transfer has finished, so an interrupted download can
    never leave a truncated file that later runs would mistake for a
    complete cached copy.

    Args:
        url: Address of the file to fetch.
        path: Destination file name.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer;
        the partial file is removed before the exception propagates.
    """
    if os.path.exists(path):
        return
    partial_path = path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


for _url, _path in ((hindi_url, hindi_path), (english_url, english_path)):
    _download_if_missing(_url, _path)


In [5]:
#Load data
def read_conllu(file_path):
    """Read a CoNLL-U file into a list of tagged sentences.

    Every token line contributes a ``(word, tag)`` tuple built from its
    second and fourth tab-separated columns. Blank lines and lines starting
    with ``#`` end the sentence being collected.

    Lines whose first column contains ``-`` (multiword-token ranges such as
    ``1-2``) or ``.`` (empty nodes such as ``1.1``) are skipped: they do not
    carry a real tag, and the range's surface form would duplicate the
    syntactic words listed right after it.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL-U file.

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if line.startswith("#") or stripped == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            parts = stripped.split("\t")
            if len(parts) <= 3:
                # Too few columns to hold both a word and a tag.
                continue
            token_id = parts[0]
            if "-" in token_id or "." in token_id:
                continue
            sentence.append((parts[1], parts[3]))
    # A file that does not end with a blank line still has a pending sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

# Parse both downloaded treebanks (Hindi first, then English).
hindi_sentences, english_sentences = (
    read_conllu(corpus_path) for corpus_path in (hindi_path, english_path)
)

In [6]:
#Feature extraction
def word_features(sentence, index):
    """Build the CRF feature dict for the token at ``index``.

    Args:
        sentence: Sequence of tuples whose first element is the word; any
            further elements (e.g. the tag) are ignored.
        index: Position of the token inside ``sentence``.

    Returns:
        dict with the keys ``word``, ``is_first``, ``is_last``,
        ``is_capitalized``, ``is_numeric``, ``prev_word`` and ``next_word``.
        ``prev_word`` / ``next_word`` are ``''`` at the sentence boundaries.
    """
    word = sentence[index][0]
    last_index = len(sentence) - 1
    features = {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slicing instead of word[0] so an empty word yields False rather
        # than raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_numeric': word.isdigit(),
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last_index else sentence[index + 1][0],
    }
    return features

def extract_features(sentences):
    """Return, for every sentence, the list of per-token feature dicts.

    Each token's features come from ``word_features(sentence, position)``;
    the nesting of the result mirrors the nesting of ``sentences``.
    """
    feature_sequences = []
    for sentence in sentences:
        token_features = []
        for position in range(len(sentence)):
            token_features.append(word_features(sentence, position))
        feature_sequences.append(token_features)
    return feature_sequences

def extract_labels(sentences):
    """Return, for every sentence, the list of its tags.

    Each sentence is expected to be an iterable of ``(word, tag)`` pairs;
    only the second element of each pair is kept.
    """
    label_sequences = []
    for sentence in sentences:
        tags = []
        for _word, tag in sentence:
            tags.append(tag)
        label_sequences.append(tags)
    return label_sequences

# Feature sequences (X) and gold tag sequences (y) for each language.
X_hi, y_hi = extract_features(hindi_sentences), extract_labels(hindi_sentences)
X_en, y_en = extract_features(english_sentences), extract_labels(english_sentences)

In [7]:
#Train Hindi & English CRF models
# Both taggers are trained with the same hyperparameters.
_CRF_PARAMS = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# Hindi model
crf_hi = CRF(**_CRF_PARAMS)
crf_hi.fit(X_hi, y_hi)

# English model
crf_en = CRF(**_CRF_PARAMS)
crf_en.fit(X_en, y_en)

In [8]:
# Prediction function
def tag_sentence(text, language="English"):
    """Tag a whitespace-tokenized sentence with the selected language model.

    Args:
        text: Sentence to tag. ``None``, empty or whitespace-only input
            produces a prompt message instead of tags.
        language: ``"Hindi"`` selects ``crf_hi``; any other value selects
            ``crf_en``. Defaults to ``"English"`` so the function also works
            when the caller supplies only the text.

    Returns:
        str: One ``"word → tag"`` line per token, or
        ``"Please enter some text."`` when there is nothing to tag.
    """
    if not text or not text.strip():
        return "Please enter some text."

    # str.split() with no arguments already ignores surrounding whitespace.
    words = text.split()
    # word_features only reads the word, so the tag slot is left empty.
    sentence = [(word, "") for word in words]
    features = [word_features(sentence, i) for i in range(len(sentence))]

    model = crf_hi if language == "Hindi" else crf_en
    predicted_tags = model.predict_single(features)

    return "\n".join([f"{w} → {t}" for w, t in zip(words, predicted_tags)])

In [11]:
# CSS styling passed to gr.Interface through its `css` argument.
# The rules hide the page footer, set a white page background with an Arial
# font, restyle `button` elements (blue, dark blue on hover) and `input`
# elements, and define layout/spacing for an `.input-output-container` class.
# NOTE(review): nothing in the visible notebook assigns the
# `.input-output-container` class to a component — confirm those rules apply.
custom_css = """

footer {
    display: none !important;
}


/* Set the background color to white */
body {
    background-color: white;
    font-family: Arial, sans-serif;
    margin: 0;
    padding: 0;
}

/* Style for the button */
button {
    background-color: blue;
    color: white;
    border: none;
    padding: 10px 20px;
    font-size: 16px;
    cursor: pointer;
    border-radius: 5px;
    margin-top: 10px;
}

button:hover {
    background-color: darkblue;
}

/* Input and output fields stacked vertically */
.input-output-container {
    display: flex;
    flex-direction: column;
    gap: 10px;
    margin: 20px;
}

/* Styling for input fields */
input {
    padding: 10px;
    font-size: 16px;
    border: 1px solid #ccc;
    border-radius: 5px;
    width: 100%;
}

/* Adding margin between input/output fields */
.input-output-container input, .input-output-container select {
    margin-top: 10px;
}


"""

In [13]:
# Launch Gradio Interface
# The inputs are passed positionally to tag_sentence as (text, language), so
# the language dropdown must be present: without it the UI offers no way to
# pick the Hindi model even though the description asks the user to select a
# language.
# share=True additionally exposes the app through a temporary public URL.
gr.Interface(
    fn=tag_sentence,
    inputs=[
        gr.Textbox(lines=2, label="Enter Input"),
        gr.Dropdown(choices=["English", "Hindi"], label="Select Language", value="English"),
    ],
    outputs=gr.Textbox(label="This is Output"),
    title="Multilingual POS Tagger",
    description="Enter a sentence and select a language to get POS tags",
    css=custom_css,
    theme="default"
).launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6046c84c309eedf6b5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


