In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, SGDRegressor
pd.options.display.max_colwidth = 0

# Load the article-level annotation set (one row per article, with an
# "article" text column and an "avg_score" target column).
dataset = pd.read_csv("/content/drive/MyDrive/Master Thesis/annotation_set.csv")

# Remove single-digit bracketed markers such as "[3]" from the article text.
# The pattern is a raw string: in a normal literal "\[" is an invalid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on newer Pythons.
dataset["article"] = dataset["article"].str.replace(r"\[[0-9]\]", "", regex=True)

# Fixed positional split: rows 0-119 train, 120-134 dev, 135-149 test.
train = dataset[0:120]
dev = dataset[120:135]
test = dataset[135:150]

# TF-IDF features fed into a ridge regressor with built-in CV over alpha.
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', RidgeCV())])

text_clf = text_clf.fit(train["article"], train["avg_score"])
pred = text_clf.predict(test["article"])

# Regression metrics on the held-out test split.
mae = metrics.mean_absolute_error(test["avg_score"], pred)
mse = metrics.mean_squared_error(test["avg_score"], pred)
r2 = metrics.r2_score(test["avg_score"], pred)
print(mae)
print(mse)
print(r2)

87
13
8
12
0.9855619310624435
1.7642251684688905
0.08235497694621452


In [None]:
# Print, for each split (train, dev, test), how many avg_score values fall in
# each of four buckets: [1, 2), [2, 3), [3, 4) and [4, 5].
#
# The lower three buckets are half-open so that no value between e.g. 1.99 and
# 2.0 is silently left uncounted; the top bucket stays closed on both ends.
# Output order is unchanged: four counts per split, one per line.
for split in (train, dev, test):
    scores = split["avg_score"]
    for low in (1, 2, 3):
        print(((scores >= low) & (scores < low + 1)).sum())
    print(scores.between(4, 5).sum())

87
13
8
12
10
2
2
1
10
2
1
2


In [None]:
from sklearn.dummy import DummyRegressor

# Baseline regressor: DummyRegressor with strategy="mean" ignores the input
# features, so the scores below serve as a reference point for the real model.
dummy_clf = DummyRegressor(strategy="mean")

dummy_clf = dummy_clf.fit(train["article"], train["avg_score"])
pred = dummy_clf.predict(test["article"])

# All three metrics must be computed against the same split the predictions
# were made on (test). Comparing test predictions with dev targets runs
# without error only because both splits happen to have the same length.
mae = metrics.mean_absolute_error(test["avg_score"], pred)
mse = metrics.mean_squared_error(test["avg_score"], pred)
r2 = metrics.r2_score(test["avg_score"], pred)
print(mae)
print(mse)
print(r2)

0.8433611111111112
1.9379126736111112
-0.007986934907764143


In [None]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Load the sentence-annotated set and turn it into one row per line of text.
sent_dataset = pd.read_csv("/content/drive/MyDrive/Master Thesis/annotation_set_sent.csv")
sent_dataset["sentence"] = sent_dataset["article"].str.split("\n")
sent_dataset = sent_dataset.explode("sentence")

# Take the label from the sentence's OWN "[n]" marker. Extracting from the
# "article" column would give every sentence of an article the same label
# (the first marker found in the whole article).
sent_dataset["sent_score"] = sent_dataset["sentence"].str.extract(
    r"\[([0-9])\]", expand=False
)

# Lines without a marker have no label and cannot be used for training or
# evaluation. NOTE(review): assumes every annotated sentence carries exactly
# one "[n]" marker -- confirm against the annotation format.
sent_dataset = sent_dataset.dropna(subset=["sent_score"]).reset_index(drop=True)

# Strip the marker from the text so the label is not part of the model input,
# mirroring the cleaning applied to the article-level data.
sent_dataset["sentence"] = sent_dataset["sentence"].str.replace(
    r"\[[0-9]\]", "", regex=True
)

# Fixed positional split over the labelled sentences.
train = sent_dataset[0:1200]
dev = sent_dataset[1200:1350]
test = sent_dataset[1350:1500]

# TF-IDF features fed into a multinomial naive Bayes classifier.
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', MultinomialNB())])

text_clf = text_clf.fit(train["sentence"], train["sent_score"])
pred = text_clf.predict(test["sentence"])

print(metrics.classification_report(test["sent_score"], pred, digits=3))

1    579
0    442
5    109
3    31 
4    20 
2    19 
Name: sent_score, dtype: int64
1    100
5    20 
0    20 
4    10 
Name: sent_score, dtype: int64
1    100
5    30 
0    20 
Name: sent_score, dtype: int64
              precision    recall  f1-score   support

           0      0.364     0.600     0.453        20
           1      0.744     0.870     0.802       100
           5      0.000     0.000     0.000        30

    accuracy                          0.660       150
   macro avg      0.369     0.490     0.418       150
weighted avg      0.544     0.660     0.595       150



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Show the sentence-label distribution of each split (train, dev, test),
# one value_counts() table after another.
print(
    *(part["sent_score"].value_counts() for part in (train, dev, test)),
    sep="\n",
)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline classifier using the "most_frequent" strategy; fitted on the
# training sentences and scored on the test sentences for comparison with
# the real model's classification report.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(
    train["sentence"], train["sent_score"]
)
pred = dummy_clf.predict(test["sentence"])
print(
    metrics.classification_report(test["sent_score"], pred, digits=3)
)

              precision    recall  f1-score   support

           0      0.000     0.000     0.000        20
           1      0.667     1.000     0.800       100
           5      0.000     0.000     0.000        30

    accuracy                          0.667       150
   macro avg      0.222     0.333     0.267       150
weighted avg      0.444     0.667     0.533       150



  _warn_prf(average, modifier, msg_start, len(result))
