In [None]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the labelled Singlish comments.
# The file's first column is unnamed and (judging by the displayed head) holds
# the row numbers of a previously saved DataFrame index. Read it back as the
# index so it does not show up as a spurious "Unnamed: 0" data column.
df = pd.read_csv(
    '/content/drive/MyDrive/Dataset/Anonote/finalized_data/Singlish_Comments_v2.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows to confirm the expected columns were loaded.
df.head(n=5)

Unnamed: 0,Comment,positive/negative
0,Anee eya hari hodai,POSITIVE
1,Oya nam mara athal akki,POSITIVE
2,Api nihathamaniwa jiwath wenna ona,POSITIVE
3,Ane manda monawa kiyannada kiyala,POSITIVE
4,respect karanawa,POSITIVE


In [None]:
# Encode the string sentiment label as the integer target used for training:
# POSITIVE -> 1, NEGATIVE -> 0.
label_to_id = {"POSITIVE": 1, "NEGATIVE": 0}
df['P/B'] = df['positive/negative'].map(label_to_id)

# Series.map leaves NaN for any value that is not a key of the dict, and those
# NaN targets would otherwise be passed on to the train/test split and the
# model fit. Stop here and report the offending labels instead.
unmapped = df.loc[df['P/B'].isna(), 'positive/negative'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected sentiment labels (expected POSITIVE/NEGATIVE): {list(unmapped)}"
    )

# Class balance of the encoded target.
df['P/B'].value_counts()

Unnamed: 0_level_0,count
P/B,Unnamed: 1_level_1
0,252
1,151


In [None]:
# The text was already cleaned with spaCy at the data-collection stage.

In [None]:
# Stratified hold-out split: 20% of the comments are reserved for testing and
# the split is stratified on the encoded label. The fixed random_state makes
# the shuffle repeatable.
comments = df['Comment']
labels = df['P/B']
X_train, X_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.2, stratify=labels, random_state=42
)

In [None]:
# Baseline model: TF-IDF features fed into a multinomial Naive Bayes classifier.
# The step names ('vectorizer', 'classifier') are kept as-is because they are
# stored inside the pickled pipeline.
nb_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]

# Pipeline.fit returns the fitted pipeline itself, so it can be bound directly.
clf = Pipeline(steps=nb_steps).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91        51
           1       0.86      0.83      0.85        30

    accuracy                           0.89        81
   macro avg       0.88      0.88      0.88        81
weighted avg       0.89      0.89      0.89        81



In [None]:
# Spot-check the currently bound `clf` on one hand-written comment
# (when the notebook is run top to bottom this is the Naive Bayes pipeline).
sample_comment = ["palayan yanna moda gani"]
print(clf.predict(sample_comment))

[0]


In [None]:
# Spot-check the currently bound `clf` on another hand-written comment.
sample_comment = ["uba wage moda harakek"]
print(clf.predict(sample_comment))

[0]


In [None]:
# Spot-check the currently bound `clf`; the recorded prediction for this
# comment is class 1.
sample_comment = ["oya harima hoda lamayek"]
print(clf.predict(sample_comment))

[1]


In [None]:
# Spot-check the currently bound `clf`; the recorded prediction for this
# comment is class 0.
sample_comment = ["oyawa mata penna ba"]
print(clf.predict(sample_comment))

[0]


In [None]:
# Spot-check the currently bound `clf` on a longer hand-written comment.
sample_comment = ["deshani oya nam harima hoda lamayek"]
print(clf.predict(sample_comment))

[1]


In [None]:
# Persist the pipeline currently bound to `clf` (vectorizer + classifier) as
# the v2 model artifact.
path = '/content/drive/MyDrive/Dataset/Anonote/models/singlish_comment_classifier_v2.pkl'

# Binary mode is required for pickle output.
with open(path, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Second candidate: the same TF-IDF features with a logistic-regression
# classifier (LogisticRegression is imported in the notebook's import cell).
# NOTE: this rebinds `clf` and `y_pred`, replacing the pipeline fitted earlier;
# any cell executed after this one sees the logistic-regression model.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

clf.fit(X_train, y_train)

# Evaluate on the same held-out test split for a like-for-like comparison.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93        51
           1       0.96      0.80      0.87        30

    accuracy                           0.91        81
   macro avg       0.93      0.89      0.90        81
weighted avg       0.92      0.91      0.91        81



In [None]:
# Persist the pipeline currently bound to `clf` as the v3 model artifact
# (when run top to bottom this is the logistic-regression pipeline).
path = '/content/drive/MyDrive/Dataset/Anonote/models/singlish_comment_classifier_v3.pkl'

# Binary mode is required for pickle output.
with open(path, mode='wb') as model_file:
    pickle.dump(clf, model_file)