In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/Anonote/finalized_data/Sinhala_Comment_v1.csv')

In [4]:
df.head()

Unnamed: 0,comment,positive/negative
0,දිනිති ගේ එකටකාරකමට මම හරි කැමකියි එයා ගොඩක්...,POSITIVE
1,මොනා උනත් මම නම් හරි ආස නිලියක් මෙයා දැන් ස්වර...,POSITIVE
2,මෙහෙමයි ලොකේ එක එක විදියේ මිනිස්සු ඉන්නවාඉතිම්...,POSITIVE
3,මං ආදරේ කරන කෙල්ලෙක් ඒකි කියන්න ඕන දේ කෙලින් ක...,POSITIVE
4,හරිම දක්ෂ කාන්තාවක් ආඩම්බර වෙන්න බොහෝ තියෙනවා ...,POSITIVE


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   comment            307 non-null    object
 1   positive/negative  307 non-null    object
 2   P/N                307 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 7.3+ KB


In [10]:
df['P/N'] = df['positive/negative'].map({'POSITIVE' : 1, 'NEGATIVE' : 0})

In [11]:
df['P/N'].value_counts()

Unnamed: 0_level_0,count
P/N,Unnamed: 1_level_1
0,196
1,111


There is a value Conut difference but I do not want to remove dataa, I will add more data and make them close in future

In [12]:
import spacy

In [13]:
nlp = spacy.blank('si')

In [14]:
from collections import Counter

def most_repeated_words(texts):

  word_counts = Counter()

  for text in texts:
    doc = nlp(text)
    words = [token.text for token in doc]
    word_counts.update(words)

  for x in range(20):
    print(word_counts.most_common(20)[x])

In [15]:
most_repeated_words(df['comment'])

('ලස්සනයි', 50)
('මේ', 27)
('ගොන්', 25)
('නම්', 20)
('පිස්සු', 20)
('හරිම', 18)
('වගේ', 16)
('ඔයාට', 14)
('එක', 14)
('දිනිති', 13)
(',', 13)
('අනේ', 13)
('නෑ', 13)
('හරි', 11)
('ලස්සන', 11)
('ඔයා', 11)
('කියලා', 11)
('මම', 10)
('නැහැ', 9)
('ඉන්න', 9)


In [16]:
def remove_punctuations(text):
  doc = nlp(text)
  words = []
  for token in doc:
    if(not token.is_punct):
      words.append(token.text)

  return ' '.join(words)

In [17]:
df['comment'] = df['comment'].apply(remove_punctuations)

In [18]:
# text has already cleared using spacy at the collecting level

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df['comment'],
    df['P/N'],
    test_size=0.2,
    random_state=42,
    stratify=df['P/N'],
)

In [20]:
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93        40
           1       0.94      0.77      0.85        22

    accuracy                           0.90        62
   macro avg       0.92      0.87      0.89        62
weighted avg       0.91      0.90      0.90        62



In [21]:
clf.predict(['ඔයා හරිම හොද ලමයෙක් සුලිත අපි ඔයාගෙ අනාගතය ට සුබ පතනවා'])

array([1])

In [22]:
clf.predict(['ඔයා හරිම කැතයි. කවද හරි කන්නඩියෙන් මුන බලල තියෙනවද'])

array([0])

In [23]:
clf.predict(['කවද හරි කන්නඩියෙන් මුන බලල තියෙනවද'])

array([0])

In [24]:
path = '/content/drive/MyDrive/Dataset/Anonote/models/sinhala_comment_classifier_v1.pkl'

with open(path, 'wb') as file:
    pickle.dump(clf, file)

In [26]:
from sklearn.linear_model import LogisticRegression
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90        40
           1       1.00      0.59      0.74        22

    accuracy                           0.85        62
   macro avg       0.91      0.80      0.82        62
weighted avg       0.88      0.85      0.84        62



This Results wores than the Binomial model