In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from stopwordsiso import stopwords

In [4]:
import stanza
stanza.download('mr')

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.1.json: 365kB [00:00, 9.83MB/s]
2023-09-28 12:34:19 INFO: Downloading default packages for language: mr (Marathi) ...
Downloading https://huggingface.co/stanfordnlp/stanza-mr/resolve/v1.5.1/models/d
2023-09-28 12:36:33 INFO: Finished downloading models and saved to /home/vaxraxd/stanza_resources.


In [2]:
import pandas as pd

df=pd.read_csv("../dataset/marathi.csv")
X = df.iloc[:, 0]
y = df.iloc[:, 1]

In [3]:
import re

def remove_emojis(data):
    emoj = re.compile("["
        u"\U00002700-\U000027BF"  # Dingbats
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols
        u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def remove_unicode_chars(data):
    pattern = re.compile(u"[\u200c-\u200f\u202a-\u202f\u2066-\u2069]")
    return pattern.sub("", data)

In [4]:
def Preprocessing_for_Marathi_Language(marathi_text):

  #Remove Emojis
  marathi_text=remove_emojis(marathi_text)

  #Removing Punctuations
  punctuations = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
  punctuation_removed_text = marathi_text
  for ele in marathi_text:
      if ele in punctuations:
          punctuation_removed_text = punctuation_removed_text.replace(ele, " ")

  #Tokenization
  tokenized_text = punctuation_removed_text.split(" ")

  #Remove Spaces
  tokenized_text=list(filter(("").__ne__,tokenized_text))
  # print(tokenized_text)

  #Filter only marathi words
  final_words=list()
  for word in tokenized_text:
    word=remove_unicode_chars(word)
    if len(word)==0:
      continue
    if any(char.isdigit() for char in word):
      continue
    if not ('a' <= word[0] <= 'z' or 'A' <= word[0] <= 'Z' or word[0].isdigit() or '\n' in word):
      final_words.append(word)
  final_marathi_text=" ".join(final_words)

  #Removing Stopwords
  stopwords_removed_text=list()
  stopwords_collection = stopwords('mr')
  for i in final_words:
      if i not in stopwords_collection:
          stopwords_removed_text.append(i)

  #Remove Spaces
  final_list=list()
  for token in stopwords_removed_text:
    if token!="":
      final_list.append(token)
  return " ".join(final_list)

In [7]:
for i in range(len(X)):
  X[i]=Preprocessing_for_Marathi_Language(X[i])

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [10]:
vectorizer = TfidfVectorizer(decode_error="ignore")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [12]:
y_pred = clf.predict(X_test_tfidf)
print(accuracy_score(y_test,y_pred))

0.6860349950478706


In [15]:
!pip install joblib
import joblib 
joblib.dump(clf, 'sent_analysis.pkl')

Defaulting to user installation because normal site-packages is not writeable


['sent_analysis.pkl']

In [13]:
import numpy as np

def analyzeSentiment(text):
  data = np.array([Preprocessing_for_Marathi_Language(text)])
  news = pd.Series(data)
  a_pred = clf.predict(vectorizer.transform(news))
  print(a_pred)

analyzeSentiment("  बंगळूरु परिवहन सेवेला #Budget2020 मध्ये  वीस टक्के भागभांडवल दिलं जातं पण,मुंबई, पुणे, नागपूरच्या परिवहन सेवेचा साधा उल्लेखही नाही. गुजरातच्या गिफ्ट सिटीमध्ये आंतरराष्ट्रीय सोने-चांदी बाजाराची घोषणा होते, हे सारं मुंबई आणि महाराष्ट्राचं महत्त्व कमी करण्यासाठी तर नाही ना?")

[-1]
