1. Install Required Libraries

In [1]:
!pip install pandas scikit-learn nltk indic-nlp-library




[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.1.0-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=5.1.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-9.1.0-py3-none-any.whl.metadata (5.8 kB)
Collecting docutils>=0.19 (from sphinx-argparse->indic-nlp-library)
  Downloading docutils-0.22.4-py3-none-any.whl.metadata (15 kB)
Collecting sphinxcontrib-applehelp>=1.0.7 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_applehelp-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-devhelp>=1.0.6 (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-libra

2. Import Libraries

In [2]:
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

3. Create Sample Hindi Dataset

In [3]:
data = {
    "text": [
        "यह फिल्म बहुत अच्छी है",
        "मुझे यह गाना पसंद आया",
        "यह खाना बेकार है",
        "मुझे यह मोबाइल बिल्कुल पसंद नहीं है",
        "सेवा बहुत शानदार थी",
        "यह अनुभव बहुत खराब था"
    ],
    "label": ["positive", "positive", "negative", "negative", "positive", "negative"]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,यह फिल्म बहुत अच्छी है,positive
1,मुझे यह गाना पसंद आया,positive
2,यह खाना बेकार है,negative
3,मुझे यह मोबाइल बिल्कुल पसंद नहीं है,negative
4,सेवा बहुत शानदार थी,positive
5,यह अनुभव बहुत खराब था,negative


4. Text Preprocessing (Cleaning Hindi Text)

In [4]:
def clean_text(text):
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # keep Hindi characters only
    text = text.lower()
    return text

df['cleaned_text'] = df['text'].apply(clean_text)
df


Unnamed: 0,text,label,cleaned_text
0,यह फिल्म बहुत अच्छी है,positive,यह फिल्म बहुत अच्छी है
1,मुझे यह गाना पसंद आया,positive,मुझे यह गाना पसंद आया
2,यह खाना बेकार है,negative,यह खाना बेकार है
3,मुझे यह मोबाइल बिल्कुल पसंद नहीं है,negative,मुझे यह मोबाइल बिल्कुल पसंद नहीं है
4,सेवा बहुत शानदार थी,positive,सेवा बहुत शानदार थी
5,यह अनुभव बहुत खराब था,negative,यह अनुभव बहुत खराब था


5. Convert Text to TF-IDF Features

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']


6. Train-Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


7. Apply Multinomial Naive Bayes

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


8. Evaluate Model

In [8]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


9. Test on New Hindi Sentence

In [9]:
new_sentence = ["यह फिल्म शानदार और बहुत अच्छी है"]
new_clean = [clean_text(new_sentence[0])]
new_vector = vectorizer.transform(new_clean)

prediction = model.predict(new_vector)
print("Sentence:", new_sentence[0])
print("Predicted Sentiment:", prediction[0])


Sentence: यह फिल्म शानदार और बहुत अच्छी है
Predicted Sentiment: negative
