In [1]:
import json
import nltk
import spacy
import numpy as np
import requests
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

!pip install https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.1.0/nb_core_news_sm-3.1.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.1.0/nb_core_news_sm-3.1.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/nb_core_news_sm-3.1.0/nb_core_news_sm-3.1.0.tar.gz (16.1 MB)
                                              0.0/16.1 MB ? eta -:--:--
     -                                        0.4/16.1 MB 13.9 MB/s eta 0:00:02
     ---                                      1.6/16.1 MB 20.1 MB/s eta 0:00:01
     ------                                   2.7/16.1 MB 21.9 MB/s eta 0:00:01
     ---------                                3.7/16.1 MB 21.7 MB/s eta 0:00:01
     ------------                             5.2/16.1 MB 23.8 MB/s eta 0:00:01
     ----------------                         6.5/16.1 MB 24.6 MB/s eta 0:00:01
     ----------------                         6.8/16.1 MB 21.7 MB/s eta 0:00:01
     ------------------                       7.5/16.1 MB 20.9 MB/s eta 0:00:01
     ---------------------     

In [2]:
def data_analysis(j_data) -> list:
    """Print a short summary of the parsed records and return their (text, label) pairs.

    Two lines are printed: the number of records, and the union of all keys
    found across the records.

    In a notebook, assign the result or end the call with ``;`` if you do not
    want the returned list echoed as the cell output.

    Args:
        j_data: Sized iterable of dict records; every record must provide the
            'text' and 'label' keys.

    Returns:
        A list of ``(text, label)`` tuples, in input order (empty for empty input).

    Raises:
        KeyError: If a record has no 'text' or no 'label' key.
    """
    msg_data = []
    keys = set()
    print("Data size:", len(j_data))

    for item in j_data:
        keys.update(item.keys())
        msg_data.append((item['text'], item['label']))

    print(keys)

    # Return the collected pairs so the declared `-> list` contract actually holds.
    return msg_data

In [3]:
# NoReC sentence-level binary sentiment data: separate train and test files on GitHub.
train_url = "https://raw.githubusercontent.com/ltgoslo/norec_sentence/main/binary/train.json"
test_url = "https://raw.githubusercontent.com/ltgoslo/norec_sentence/main/binary/test.json"

# Without a timeout, requests waits indefinitely on a stalled connection and
# the cell never finishes; bound each download instead.
REQUEST_TIMEOUT_S = 30

train_resp = requests.get(train_url, timeout=REQUEST_TIMEOUT_S)
test_resp = requests.get(test_url, timeout=REQUEST_TIMEOUT_S)

# Fail loudly on HTTP errors rather than trying to parse an error page as JSON.
train_resp.raise_for_status()
test_resp.raise_for_status()

json_train_data = json.loads(train_resp.text)
json_test_data = json.loads(test_resp.text)

# Print size and key summary for both splits. The trailing semicolon keeps the
# cell from echoing whatever the last call returns.
data_analysis(json_train_data)
data_analysis(json_test_data);

Data size: 3894
{'sent_id', 'text', 'label'}


In [4]:
def data_read(j_data) -> list:
    """Extract ``[text, label]`` pairs from the parsed JSON records.

    Args:
        j_data: Iterable of dict records, each with 'text' and 'label' keys.

    Returns:
        A list with one two-element list ``[text, label]`` per record, in
        input order.
    """
    pairs = []
    for record in j_data:
        pairs.append([record["text"], record["label"]])
    return pairs

In [5]:
def pre_Processing(data, nlp=None):
    """Lemmatize and filter every text, returning cleaned sentences and their labels.

    Each text is run through a spaCy pipeline. Tokens flagged as punctuation,
    currency symbols, digits, whitespace, stop words or number-like are
    dropped; the lower-cased lemmas of the remaining tokens are joined with
    single spaces.

    Args:
        data: Iterable of indexable pairs where item 0 is the text and item 1
            is the sentiment label.
        nlp: Optional, already-loaded spaCy pipeline. When omitted, the
            "nb_core_news_sm" model is loaded. Passing a pipeline avoids
            reloading the model on every call.

    Returns:
        A ``(sentences, sentiments)`` tuple of two lists with one entry per
        input pair, in input order. A sentence may be the empty string when
        all of its tokens were filtered out.
    """
    if nlp is None:
        nlp = spacy.load("nb_core_news_sm")  # Norwegian lemmatization model.

    # Walk the input once so single-pass iterables (e.g. generators) also work.
    texts = []
    sentiments = []
    for batch in data:
        texts.append(batch[0])
        sentiments.append(batch[1])

    sentences = []
    # nlp.pipe processes the texts in batches, which is faster than invoking
    # the pipeline separately for each text; docs come back in input order.
    for doc in nlp.pipe(texts):
        lemmatized = [
            token.lemma_.lower()
            for token in doc
            if not (token.is_punct
                    or token.is_currency
                    or token.is_digit
                    or token.is_space
                    or token.is_stop
                    or token.like_num)
        ]
        sentences.append(' '.join(lemmatized))

    return sentences, sentiments

In [6]:
# Preprocess the train and test sets. They are downloaded as separate files,
# so no splitting happens here.
X_train, y_train = pre_Processing(data_read(json_train_data))
X_test, y_test = pre_Processing(data_read(json_test_data))

# Vectorize the text as bag-of-words counts. The vocabulary is fitted on the
# training sentences only and then reused for the test sentences.
# (TfidfVectorizer would be an alternative, but it is not imported in this notebook.)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fixed seed so the decision tree gives the same result on every run;
# without it the tree's choice among equally good splits is random.
RANDOM_SEED = 42

# Train and evaluate the classifiers
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_SEED)
}

for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_vec, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test_vec)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"{name}:\nAccuracy: {accuracy}\n{report}\n")

Naive Bayes:
Accuracy: 0.7118353344768439
              precision    recall  f1-score   support

    Negative       0.58      0.27      0.37       182
    Positive       0.73      0.91      0.81       401

    accuracy                           0.71       583
   macro avg       0.66      0.59      0.59       583
weighted avg       0.69      0.71      0.68       583


Logistic Regression:
Accuracy: 0.7152658662092625
              precision    recall  f1-score   support

    Negative       0.59      0.28      0.38       182
    Positive       0.74      0.91      0.82       401

    accuracy                           0.72       583
   macro avg       0.66      0.60      0.60       583
weighted avg       0.69      0.72      0.68       583


Decision Tree:
Accuracy: 0.6380789022298456
              precision    recall  f1-score   support

    Negative       0.41      0.35      0.38       182
    Positive       0.72      0.77      0.74       401

    accuracy                           0.64 