In [9]:
!pip install textstat
!pip install pattern
import textstat
import string 
import re
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from pattern.nl import sentiment

# Switch textstat to its "nl" language setting before any score is computed.
textstat.set_lang("nl")

# Read the three splits in one pass; the order of the paths decides which
# file ends up bound to train, dev and test respectively.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Master Thesis/train_lilah.csv",
        "/content/drive/MyDrive/Master Thesis/dev_lilah.csv",
        "/content/drive/MyDrive/Master Thesis/test_lilah.csv",
    )
)

def cap_count(article):
    """Count the ASCII capital letters (A-Z) in ``article``.

    Args:
        article: Text to scan.

    Returns:
        int: Number of characters between ``A`` and ``Z`` inclusive.
        Accented or other non-ASCII capitals are not counted.
    """
    return sum(1 for char in article if "A" <= char <= "Z")

# Store the capital-letter count of every article on each split.
for frame in (train, dev, test):
    frame["cap_count"] = frame["article"].apply(cap_count)

def punc_count(article):
    """Count the ASCII punctuation characters in ``article``.

    Every character listed in ``string.punctuation`` counts once per
    occurrence.

    Args:
        article: Text to scan.

    Returns:
        int: Number of punctuation characters found.
    """
    # The punctuation set must be escaped before it is embedded in a character
    # class: unescaped, its "\]" pair is parsed as an escaped "]" and the
    # backslash itself is never matched.
    punctuation_class = "[{0}]".format(re.escape(string.punctuation))
    return len(re.findall(punctuation_class, article))

# Store the punctuation count of every article on each split.
for frame in (train, dev, test):
    frame["punc_count"] = frame["article"].apply(punc_count)

def add_readability(article):
    """Return textstat's Flesch reading-ease score for ``article``.

    Args:
        article: Text to score.

    Returns:
        Whatever ``textstat.flesch_reading_ease`` returns for the text,
        using the language currently configured in textstat.
    """
    reading_ease = textstat.flesch_reading_ease(article)
    return reading_ease

# Store the readability score of every article on each split.
for frame in (train, dev, test):
    frame["readability"] = frame["article"].apply(add_readability)

def apply_polarity(article):
    """Return the first component of pattern.nl's ``sentiment`` result.

    Args:
        article: Text to analyse.

    Returns:
        Element 0 of ``sentiment(article)`` — presumably the polarity score,
        going by this function's name; confirm against the pattern docs.
    """
    scores = sentiment(article)
    return scores[0]

# Store the first sentiment component of every article on each split.
for frame in (train, dev, test):
    frame["polarity"] = frame["article"].apply(apply_polarity)

def apply_sentiment(article):
    """Return the second component of pattern.nl's ``sentiment`` result.

    Args:
        article: Text to analyse.

    Returns:
        Element 1 of ``sentiment(article)``.

    NOTE(review): pattern appears to document this second element as the
    subjectivity score, so the "sentiment" name may be misleading — confirm.
    """
    scores = sentiment(article)
    return scores[1]

# Store the second sentiment component of every article on each split.
for frame in (train, dev, test):
    frame["sentiment"] = frame["article"].apply(apply_sentiment)

# Whitespace-tokenise each article once per split and derive both word
# features from the same token lists.
for frame in (train, dev, test):
    tokens = frame["article"].str.split()
    token_counts = tokens.apply(len)
    unique_counts = tokens.apply(set).apply(len)
    # Despite the column name this is a ratio: distinct tokens / all tokens.
    frame["no_unique_words"] = unique_counts / token_counts
    frame["article_length"] = token_counts

def excl_count(article):
    """Count the exclamation marks in ``article``.

    Args:
        article: Text to scan.

    Returns:
        int: Number of ``!`` characters found.
    """
    return sum(1 for _ in re.finditer(r'!', article))

# Store the exclamation-mark count of every article on each split.
for frame in (train, dev, test):
    frame["excl_count"] = frame["article"].apply(excl_count)

def s_quote_count(article):
    """Count the single-quote characters (') in ``article``.

    Args:
        article: Text to scan.

    Returns:
        int: Number of ASCII apostrophe / single-quote characters found.
    """
    return sum(1 for char in article if char == "'")

# Store the single-quote count of every article on each split.
for frame in (train, dev, test):
    frame["s_quote_count"] = frame["article"].apply(s_quote_count)

def d_quote_count(article):
    """Count the double-quote characters (") in ``article``.

    Args:
        article: Text to scan.

    Returns:
        int: Number of ASCII double-quote characters found.
    """
    quote_pattern = re.compile(r'"')
    return len(quote_pattern.findall(article))

# Store the double-quote count of every article on each split.
for frame in (train, dev, test):
    frame["d_quote_count"] = frame["article"].apply(d_quote_count)

def all_caps_count(article):
    """Count the words in ``article`` written entirely in ASCII capitals.

    A word counts when it is a run of one or more A-Z letters with a word
    boundary on both sides, so single-letter capitals such as "A" count too.
    Tokens that mix capitals with lowercase letters, digits or underscores
    do not match.

    Args:
        article: Text to scan.

    Returns:
        int: Number of all-capital words found.
    """
    return sum(1 for _ in re.finditer(r"\b[A-Z]+\b", article))

# Store the all-capitals word count of every article on each split.
for frame in (train, dev, test):
    frame["all_caps_count"] = frame["article"].apply(all_caps_count)




In [10]:
# Fixed seed for LinearSVC: with loss="hinge" the dual solver is used, which
# shuffles the data randomly, so without a seed the scores can differ per run.
RANDOM_STATE = 42

# TF-IDF over the raw article text plus four individually standardised
# numeric features; every other column is dropped.
# NOTE(review): "pos_count" is not created by any feature step visible in this
# notebook — presumably it comes from the CSV files or an earlier cell. Confirm,
# otherwise fitting will fail on the missing column after a kernel restart.
column_transformer = ColumnTransformer(
    [("art_tfidf", TfidfVectorizer(), "article"),
     ("feature_1", StandardScaler(), ["s_quote_count"]),
     ("feature_2", StandardScaler(), ["pos_count"]),
     ("feature_3", StandardScaler(), ["d_quote_count"]),
     ("feature_4", StandardScaler(), ["cap_count"])], remainder="drop"
)

text_clf = Pipeline([('features', column_transformer),
                     ('clf', LinearSVC(C=2.0, loss="hinge",
                                       random_state=RANDOM_STATE))])

# The column transformer picks its own inputs, so whole frames are passed in;
# the "trust" column is the label.
text_clf = text_clf.fit(train, train["trust"])
pred = text_clf.predict(test)
print(classification_report(test["trust"], pred, digits=3))



              precision    recall  f1-score   support

     trusted      0.835     0.878     0.856      1250
   untrusted      0.872     0.826     0.848      1250

    accuracy                          0.852      2500
   macro avg      0.853     0.852     0.852      2500
weighted avg      0.853     0.852     0.852      2500

