### Install Packages

In [None]:
!pip install scikit-learn
!pip install pandas
!pip install nltk

In [140]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
import pickle
import json
import pandas as pd
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

### Setup Cleaning Process

In [153]:
stopword_list = set(stopwords.words('german'))
stemmer = SnowballStemmer("english")

def clean_text(text):
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stopword_list)
    text = ' '.join(stemmer.stem(word) for word in text.split())
    return text

### Select Content that should be used to classify data

In [154]:

# we need to choose the data that can be used to classify the data
data_for_classification = [["content","content_trafilatura_big_stemmed.json"],["title","title_big_stemmed.json"]]

# the data that was chosen to be used as training data will be stored in this variable called "used_data"
# 0 = title of the websites  // 1 = content of the websites
used_data = data_for_classification[1]
undersampling = False
oversampling = True

'''
Welche Parameter nutzen?

Für Content: [0], undersampling = False, oversampling = True
Für Title: [1], undersampling = False, oversampling = True


'''

'\nWelche Parameter nutzen?\n\nFür Content: [0], undersampling = False, oversampling = True\nFür Title: [1], undersampling = False, oversampling = True\n\n\n'

### Load assessments-Text-File

In [155]:
# load assessments
file_path = "assessments_with_bl.txt"
with open(file_path) as f:
    assessments = f.readlines()
print(len(assessments))

4470


In [156]:
# transform assessments into an array
assessments = [x.strip() for x in assessments]

# delete header
assessments.pop(0)

# store assessments, that do not occur more than once
assessments_without_dublicates = []

# store ids that are allready in assessments_without_dublicates
doc_ids = set()

# fill assessments_without_dublicates
for doc in assessments:
  if (doc.split(' ')[2] in doc_ids) is False:
     assessments_without_dublicates.append(doc)
     doc_ids.add(doc.split(' ')[2])

# so finally after this iteration the assessments array does only consist of individual documents
assessments = assessments_without_dublicates

# we don't want to use the test data for training 
# so we need to delete these documents from the training data
with open("test_document_ids.txt","r") as f:
    test_document_ids = [line.rstrip() for line in f]

# we will store the ids of the assessments that are also part of the training data
duplicate_indexes = []

# iteration to see which ids of the assessments are also part of the training data
i = 0
for d in assessments:
  for igd in test_document_ids:
    if d.split(' ')[2] == igd:
       duplicate_indexes.append(i)
  i += 1

# each assessment that is not part of the training data will now be stored in this array
assessments_with_removal = []

# iteration to see which assessment  is not part of the training data
for i in range(len(assessments)):
    if not i in duplicate_indexes:
        assessments_with_removal.append(assessments[i])

# so finally after this iteration the assessments array does only consist of documents that aren't used for testing
assessments = assessments_with_removal

print(len(assessments))

4092


### Create Arrays with correct and not correct documents

In [157]:
correct_documents = []
not_correct_documents = []

# load the content data
with open(used_data[1]) as json_file:
    content_data = json.load(json_file)

# create index variable to print progress in console
index = 0

# add the content of each assessment either to the array of correct_documents or not_correct_documents
for doc in assessments:
  index += 1
  try:
    # extract the content of an id
    content = content_data[doc.split(' ')[2]]

    # optional:
    #content = clean_text(content)

    # store the correctness assessment in a variable
    correctness = doc.split(' ')[3]

    # store the content in one of the two arrays
    if correctness == "1" and content != None and correctness!="0": # yes
        correct_documents.append(content)
    if correctness == "2" and content != None and correctness!="0": # no
        not_correct_documents.append(content)
    if correctness != "0" and content != None:
        print("added: ",index, doc.split(' ')[2], correctness)
    else:
        print("error: ", index, doc.split(' ')[2], correctness)
  except:
    print("error: ",doc.split(' ')[2])
    pass

print(len(correct_documents), len(not_correct_documents))

a1-8c57-68c8e94eb333 1
added:  3707 b7176a74-18ef-4269-a821-848f5d3c140c 1
added:  3708 bb64dc0f-18cf-4e29-b66b-0a7df20ee2b6 1
added:  3709 bd6bbd11-07a9-4344-ada0-197b33ea8b24 1
added:  3710 bf70bef4-2939-4833-b274-c47a8946d858 1
added:  3711 c9c9f069-43ae-4774-bbdd-0e9653314cae 1
added:  3712 ce68a077-e078-4fc8-b422-5ea05c435804 1
added:  3713 d75512f1-de35-421d-b51f-6a5a0d31f044 1
added:  3714 d7c5cd60-6304-4261-9780-b28c8d6470e5 1
added:  3715 d8925b00-0423-4857-a854-59e811f0dea3 1
added:  3716 f9f88f0b-0cee-418a-b5bd-ea73798dede7 1
added:  3717 ff441a4c-d02b-4a58-86db-443afa5b82ed 1
added:  3718 199ee59f-3a1c-468d-b4fe-44f6c312f88d 1
added:  3719 37bb66bc-0c8a-40ba-9854-0546318c7a46 1
added:  3720 461a96f8-ffda-4f1e-858e-f59f8c524c2f 1
added:  3721 5b3b6b2c-e718-44b7-9d26-31862ded9866 1
added:  3722 7a3d08eb-03c4-4669-a80f-9f8791649532 1
added:  3723 7da8f706-b9d2-4f54-93c0-0d85ad814b2c 1
added:  3724 927af965-21f7-4ed7-9205-3b5d9db180bb 1
added:  3725 94888d55-c015-4c69-8d4e-1d1b

### Optional: Use Oversampling / Undersampling

In [158]:
def balance_sample(undersampling=False, oversampling=False):
    if undersampling == True:
        while len(not_correct_documents) > len(correct_documents):
            not_correct_documents.pop(random.randrange(len(not_correct_documents)))
        print("Amount of correct | not correct data after undersampling: ", len(correct_documents), len(not_correct_documents))
    if oversampling == True:
        while len(not_correct_documents)!=len(correct_documents):
            correct_documents.append(random.choice(correct_documents))
        print("Amount of correct | not correct data after oversampling: ", len(correct_documents), len(not_correct_documents))

balance_sample(undersampling,oversampling)

Amount of correct | not correct data after oversampling:  2026 2026


### Convert these two arrays into data that is understandable for sklearn (Pandas Array)

In [159]:
# We will now create two arrays: one for the labels of each content and one with the content itself
labeled_data = []

for content in not_correct_documents:
    labeled_data.append([content,"fake"])
for content in correct_documents:
    labeled_data.append([content,"real"])

df = pd.DataFrame.from_records(labeled_data)

# After we created these pairs, we can store them in normal arrays
txt = df[0].tolist()
labels = df[1].tolist()

### Evaluate a classifier trained on the training data (5 Folds)

In [160]:
# Create tf-idf Converter, trained on the training data
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
txt = tfidfconverter.fit_transform(txt)
txt = txt.toarray()

# Used classifier: SVC
classifier = svm.SVC(decision_function_shape='ovo', probability=True)
classifier.fit(txt, labels)

scores = cross_val_score(classifier, txt, labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2), scores)

Accuracy: 0.71 (+/- 0.17) [0.74106042 0.69173859 0.63703704 0.62716049 0.8617284 ]


### Store results of classification

In [161]:
# topics of the documents in correct order
topics_y = [9,9,9,9,9,9,9,9,20,20,20,20,20,20,20,20,25,25,25,25,25,25,25,25,30,30,30,30,30,30,30,30,47,47,47,47,47,47,47,47,49,49,49,49,49,49,49,49]

# correct labels of the test_data
test_y = ["fake","fake","real","fake","fake","real","real","real","real","real","fake","real","real","fake","fake","fake","fake","real","real","fake","fake","real","real","fake","fake","fake","fake","real","real","real","fake","real","real","fake","fake","real","real","fake","fake","real","real","real","fake","fake","real","real","fake","fake"]

test_X = []
for doc in test_document_ids:
  test_X.append(content_data[doc])

# these variables are used to calculate the accuracy
index = 0
correct = 0

# this variable will be used to store the predictions to export it to json
predictions = {}

for x in test_X:
    print("ass: / cltf: ",test_y[index],classifier.predict(tfidfconverter.transform([x]).toarray())[0],"clf_pr: ",max(classifier.predict_proba(tfidfconverter.transform([x]).toarray()).tolist()[0]))
    predictions[test_document_ids[index]] = {"topic":topics_y[index],"prediction":classifier.predict(tfidfconverter.transform([x]).toarray())[0],"certainity": max(classifier.predict_proba(tfidfconverter.transform([x]).toarray().tolist())[0])}
    if classifier.predict(tfidfconverter.transform([x]).toarray())[0] == test_y[index]:
        correct += 1
    index += 1

print(correct/index)

with open(used_data[0]+"_predictions"+".json", 'w') as outfile:
    json.dump(predictions, outfile)

ass: / cltf:  fake fake clf_pr:  0.8124570353429367
ass: / cltf:  fake real clf_pr:  0.6246458245157465
ass: / cltf:  real fake clf_pr:  0.65327416703563
ass: / cltf:  fake real clf_pr:  0.7182410324464337
ass: / cltf:  fake real clf_pr:  0.7502390223331404
ass: / cltf:  real real clf_pr:  0.968444459616774
ass: / cltf:  real real clf_pr:  0.9769294815751645
ass: / cltf:  real real clf_pr:  0.656915567420694
ass: / cltf:  real fake clf_pr:  0.5637103068227394
ass: / cltf:  real fake clf_pr:  0.8532794571585494
ass: / cltf:  fake fake clf_pr:  0.9693289445587189
ass: / cltf:  real real clf_pr:  0.7627528459181944
ass: / cltf:  real real clf_pr:  0.8932453287593773
ass: / cltf:  fake fake clf_pr:  0.9911190585571944
ass: / cltf:  fake fake clf_pr:  0.6182099984716911
ass: / cltf:  fake fake clf_pr:  0.9693709807826436
ass: / cltf:  fake fake clf_pr:  0.7829155957664373
ass: / cltf:  real real clf_pr:  0.9303596663997448
ass: / cltf:  real real clf_pr:  0.960772196424845
ass: / cltf:  fak