In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from conllu import parse
import os

def read_hindi_dataset(f_name):
    with open(f_name, encoding='utf-8') as f:
        data = f.read()
    sentences = parse(data)
    return sentences

def generate_sentence_features(sent):
    sent_features = []
    for word_idx, word_info in enumerate(sent):
        word_features = {}
        word_features["word_" + word_info['form']] = 1
        if word_idx != 0:
            word_features["left_word_" + sent[word_idx - 1]['form']] = 1
        if word_idx != len(sent) - 1:
            word_features["right_word_" + sent[word_idx + 1]['form']] = 1
        sent_features.append(word_features)
    return sent_features

def prep_data(sentences):
    all_features = []
    all_labels = []
    for sentence in sentences:
        sent_features = generate_sentence_features(sentence)
        assert len(sent_features) == len(sentence)
        all_features.extend(sent_features)
        all_labels.extend([word_info['upostag'] for word_info in sentence])
    return all_features, all_labels

def test_sentence(classifier, vectorizer, sentence):
    sentence_features = generate_sentence_features(sentence)
    print("Features for the input sentence:", sentence_features)
    
    X_sentence = vectorizer.transform(sentence_features)
    predictions = classifier.predict(X_sentence)
    
    return predictions



train_dataset_path = "C:\\Users\\madhuri\\Downloads\\Universal Dependencies 2.12\\ud-treebanks-v2.12\\ud-treebanks-v2.12\\UD_Hindi-HDTB\\hi_hdtb-ud-train.conllu"
print("Reading Hindi train dataset from:", train_dataset_path)


if not os.path.exists(train_dataset_path):
    raise FileNotFoundError(f"File not found: {train_dataset_path}")

train_sentences_hindi = read_hindi_dataset(train_dataset_path)


valid_dataset_path = "C:\\Users\\madhuri\\Downloads\\Universal Dependencies 2.12\\ud-treebanks-v2.12\\ud-treebanks-v2.12\\UD_Hindi-HDTB\\hi_hdtb-ud-test.conllu"
print("Reading Hindi valid dataset from:", valid_dataset_path)


if not os.path.exists(valid_dataset_path):
    raise FileNotFoundError(f"File not found: {valid_dataset_path}")

valid_sentences_hindi = read_hindi_dataset(valid_dataset_path)


train_features, train_labels = prep_data(train_sentences_hindi)
valid_features, valid_labels = prep_data(valid_sentences_hindi)


vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
X_valid = vectorizer.transform(valid_features)


classifier = LinearSVC(C=1, verbose=1)
print("Training the classifier...")
classifier.fit(X_train, train_labels)


print("Making predictions on the valid set...")
valid_predictions = classifier.predict(X_valid)


valid_accuracy = accuracy_score(valid_labels, valid_predictions)
print("Classifier Accuracy on the Valid Set:", valid_accuracy)


user_input_sentence = "यह एशिया की सबसे बड़ी मस्जिदों में से एक है । इसे नवाब शाहजेहन ने बनवाया था । "
user_input_tokens = user_input_sentence.split()


dummy_sentence = [{'form': token} for token in user_input_tokens]


user_input_predictions = test_sentence(classifier, vectorizer, dummy_sentence)
print("Predictions for the input sentence:", user_input_predictions)
print("End of script")


Reading Hindi train dataset from: C:\Users\madhuri\Downloads\Universal Dependencies 2.12\ud-treebanks-v2.12\ud-treebanks-v2.12\UD_Hindi-HDTB\hi_hdtb-ud-train.conllu
Reading Hindi valid dataset from: C:\Users\madhuri\Downloads\Universal Dependencies 2.12\ud-treebanks-v2.12\ud-treebanks-v2.12\UD_Hindi-HDTB\hi_hdtb-ud-test.conllu
Training the classifier...




[LibLinear]Making predictions on the valid set...
Classifier Accuracy on the Valid Set: 0.9514817950889077
Features for the input sentence: [{'word_यह': 1, 'right_word_एशिया': 1}, {'word_एशिया': 1, 'left_word_यह': 1, 'right_word_की': 1}, {'word_की': 1, 'left_word_एशिया': 1, 'right_word_सबसे': 1}, {'word_सबसे': 1, 'left_word_की': 1, 'right_word_बड़ी': 1}, {'word_बड़ी': 1, 'left_word_सबसे': 1, 'right_word_मस्जिदों': 1}, {'word_मस्जिदों': 1, 'left_word_बड़ी': 1, 'right_word_में': 1}, {'word_में': 1, 'left_word_मस्जिदों': 1, 'right_word_से': 1}, {'word_से': 1, 'left_word_में': 1, 'right_word_एक': 1}, {'word_एक': 1, 'left_word_से': 1, 'right_word_है': 1}, {'word_है': 1, 'left_word_एक': 1, 'right_word_।': 1}, {'word_।': 1, 'left_word_है': 1, 'right_word_इसे': 1}, {'word_इसे': 1, 'left_word_।': 1, 'right_word_नवाब': 1}, {'word_नवाब': 1, 'left_word_इसे': 1, 'right_word_शाहजेहन': 1}, {'word_शाहजेहन': 1, 'left_word_नवाब': 1, 'right_word_ने': 1}, {'word_ने': 1, 'left_word_शाहजेहन': 1, 'right_word