In [1]:
import pandas as pd
import psycopg2
import numpy as np
import sklearn
# import spacy

In [2]:
# Read csv-file
filepath = '../Data_git_ignore/clean_corpus.csv'
# filepath = '../Data_git_ignore/clean-100k.csv'
df = pd.read_csv(filepath, index_col = [0])

# Select colums to use for ML
df = df[['type', 'content']]

df.head()

Unnamed: 0,type,content
4427,fake,How To Easily Understand The Difference Betwee...
5553,fake,Butter Chocolate Cheese Cake\n\n% of readers t...
768,fake,Chipotle E. coli O26 Outbreak Update – 19 in W...
58,fake,"U.S. Repeals Propaganda Ban, Spreads Governmen..."
3757,fake,Darwin and the Voyage: 11 ~ Elephants and Hors...


In [7]:
df['content'].describe()

count                                                 70220
unique                                                50505
top       Tor\n\nTor is an encrypted anonymising network...
freq                                                   5779
Name: content, dtype: object

In [4]:
### SELECT TRAIN, TEST, VALIDATE ###

# This can generate a dataset with random purmutation and a max size for each type(can be smaller if desired max is not possible)

# max size for type
max_size = 7500
# traning_set ratio - splits data into traning=ratio,  test and validate=(1-ratio)/2 ex. train=80%, test=10%, validate=10%
ratio=0.7
# Labels to include - ['fake', 'satire', 'bias', 'conspiracy', 'state', 'junksci', 'hate', 'clickbait', 'unreliable', 'political', 'reliable'] - all labels
use_types = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate', 'clickbait', 'unreliable', 'political', 'reliable']
# Random seed
rnd = 1

# initialize dataframes
train    = pd.DataFrame(columns = df.columns)
test     = pd.DataFrame(columns = df.columns)
validate = pd.DataFrame(columns = df.columns)

# add type to test splits
for t in use_types:

    # type size
    type_size = df['type'].loc[df['type'] == t].value_counts().min()

    # set size of type slice
    if type_size < max_size:
        tmp = df.loc[df['type'] == t].sample(n = type_size, random_state=rnd)
    else:
        tmp = df.loc[df['type'] == t].sample(n = max_size, random_state=rnd)

    # split current type
    train_tmp, test_tmp, validate_tmp = np.split(tmp, [int(ratio * len(tmp)), int(((1-ratio)/2 + ratio) * len(tmp))])

    # add tmp to dataframes
    train    = pd.concat([train, train_tmp])
    test     = pd.concat([test, test_tmp])
    validate = pd.concat([validate, validate_tmp])
    
    # print split shape
    print("=>", t, tmp.shape, train_tmp.shape, validate_tmp.shape, validate_tmp.shape)

print("\n[Final split]\ntrain, test, validate ==>", train.shape, test.shape, validate.shape)

=> fake (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> satire (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> bias (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> conspiracy (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> junksci (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> hate (3619, 2) (2533, 2) (543, 2) (543, 2)
=> clickbait (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> unreliable (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> political (7500, 2) (5250, 2) (1125, 2) (1125, 2)
=> reliable (6601, 2) (4620, 2) (991, 2) (991, 2)

[Final split]
train, test, validate ==> (49153, 2) (10533, 2) (10534, 2)


In [21]:
df.loc[]

Unnamed: 0,type,content
4427,fake,How To Easily Understand The Difference Betwee...
5553,fake,Butter Chocolate Cheese Cake\n\n% of readers t...
768,fake,Chipotle E. coli O26 Outbreak Update – 19 in W...
58,fake,"U.S. Repeals Propaganda Ban, Spreads Governmen..."
3757,fake,Darwin and the Voyage: 11 ~ Elephants and Hors...
...,...,...
2587,reliable,Are Cats or Dogs More Protective for Children’...
9091,reliable,The IRS “scandal” stampeded Official Washingto...
869,reliable,9/11 & Bush's Negligence By Robert Parry\n\nMa...
4232,reliable,"(Screenshot: YouTube/Desiring God)\n\n""If it i..."


In [136]:
# Split into labels and data
X_train    = train['content']
y_train    = train['type']
X_test     = test['content']
y_test     = test['type']
X_validate = validate['content']
y_validate = validate['type']

In [195]:
# Setup scikit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.pipeline import Pipeline

# models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier


# Naive Bayes
clf_NB = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('NB', MultinomialNB()),])
                   
# Support vector machine
clf_svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('svm', SGDClassifier()),])

# clf_svm.fit(X_train, y_train)
# y_pred = clf_svm.predict(X_train)
# f1_score(y_train , y_pred, zero_division=1)

In [196]:
# All classifier-pipelines and their names
classifiers = [(clf_NB, 'Naive Bayes'), (clf_svm, 'Support vector machine')]

# Test multiple classifiers
for clf_tuple in classifiers:
    clf = clf_tuple[0]
    clf_name = clf_tuple[1]
    
    # Train model
    clf.fit(X_train, y_train)
    
    print("="*30)
    print(clf_name)
    
    print('****Results****')
    train_predictions = clf.predict(X_train)
    train_acc = f1_score(y_train, train_predictions)
    print("Train - Accuracy: {:.4%}".format(train_acc))
    
    test_predictions = clf.predict(X_test)
    test_acc = f1_score(y_test, test_predictions)
    print("Test  - Accuracy: {:.4%}".format(test_acc))
    
print("="*30)

Naive Bayes
****Results****
Train - Accuracy: 0.0000%
Test  - Accuracy: 0.0000%


ValueError: The number of classes has to be greater than one; got 1 class

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

ModuleNotFoundError: No module named 'tensorflow'

In [133]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# All classifiers to test
classifiers = [
    MultinomialNB()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train_tfidf, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_train)
    acc = f1_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

ValueError: Found input variables with inconsistent numbers of samples: [700, 4620]

In [134]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.7494949494949495

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape


from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()


X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(4620, 68823)

In [137]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


X_train_counts = count_vect.fit_transform(train.data)
X_train_counts.shape

AttributeError: 'DataFrame' object has no attribute 'data'

In [138]:
from sklearn.feature_selection import chi2
import numpy as np

N = 2
for Product, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Product))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

NameError: name 'category_to_id' is not defined

In [139]:
reg_log = LogisticRegression()
reg_log.fit(X_train_tfidf, y_train)
y_pred = reg_log.predict(X_test_tfidf)

NameError: name 'LogisticRegression' is not defined

In [140]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [990, 860]

In [141]:
print(test.iloc[0])

type                                                    fake
content    our guide to the  NUM  geminid meteors watchin...
Name: 3342, dtype: object


In [142]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [143]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for Product, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(Product))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

NameError: name 'category_to_id' is not defined

In [144]:
# Create test split

In [145]:
# spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')

In [None]:
import spacy

# Create an empty model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [148]:
textcat.add_label("reliable")
textcat.add_label("fake")

1

In [149]:
train_texts = spam['content'].values
train_labels = [{'cats': {'reliable': label == 'reliable',
                          'fake': label == 'fake'}} 
                for label in spam['type']]

NameError: name 'spam' is not defined

In [150]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

NameError: name 'train_texts' is not defined

In [151]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

NameError: name 'train_data' is not defined

In [152]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

NameError: name 'train_data' is not defined

In [153]:


texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)



[[0.5 0.5]
 [0.5 0.5]]


# word vectors

In [154]:
import numpy as np
import spacy

# Need to load the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

In [156]:
# Disabling other pipes because we don't need them and it'll speed up this part a bit
text = "These vectors can be used as features for machine learning models."
with nlp.disable_pipes():
    vectors = np.array([token.vector for token in  nlp(text)])

In [157]:
vectors.shape

(12, 300)

In [158]:
import pandas as pd

# Loading the spam data
# ham is the label for non-spam messages

with nlp.disable_pipes():
    doc_vectors = np.array([nlp(text).vector for text in spam.content])
    
doc_vectors.shape

NameError: name 'spam' is not defined

In [159]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['reliable', 'reliable']


In [160]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(doc_vectors, spam.type,
                                                    test_size=0.1, random_state=1)

NameError: name 'doc_vectors' is not defined

In [161]:
from sklearn.svm import LinearSVC

# Set dual=False to speed up training, and it's not needed
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)
print(f"Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

ValueError: could not convert string to float: 'how to easily understand the difference between the day of christ and the day of lord in the end times'

In [162]:


def cosine_similarity(a, b):
    return a.dot(b)/np.sqrt(a.dot(a) * b.dot(b))


a = nlp("REPLY NOW FOR FREE TEA").vector
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.").vector
cosine_similarity(a, b)

0.70300317

In [163]:
connection = psycopg2.connect(user = "postgres",
                                      password = "root",
                                      host = "localhost",
                                      port = "5432",
                                      database = "postgres")

# usage: specify file location, sample size and seed(used by random)
filepath = 'train.tsv'
filepathTest = 'test.tsv'

#filepath = 'news_sample.csv' # <- overwrite for setup
s = 200                    # desired sample size(seems to have slack ie. not exact)
stest = 200                    # desired sample size(seems to have slack ie. not exact)

seed = 1                     # seed used by Pseudorandom number generator

df_train = pd.read_csv(filepath, header = 0, sep='\t').sample(n=s, random_state=seed)
df_train["content"] = df_train["content"].astype(str)

df_test = pd.read_csv(filepathTest, header=0, sep='\t').sample(n=stest, random_state=seed)
df_train["content"] = df_train["content"].astype(str)

df_test = df_test.reset_index(drop=True)
df_test = df_train.reset_index(drop=True)

# df = pd.read_sql_query("""Select content, type_id
# from article""", connection)

FileNotFoundError: [Errno 2] File train.tsv does not exist: 'train.tsv'

In [164]:
a = nlp("REPLY NOW FOR FREE TEA").vector
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.").vector
cosine_similarity(a, b)

0.70300317

In [165]:
df_test.head()

NameError: name 'df_test' is not defined

In [166]:
# VIGTIG NOTE: måden jeg opdeler sættet på er MEGET dårlig, på det endelige skal vi shuffle og derefter opdele! 

In [167]:
### Nearest hood (neighbors)

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
x = v.fit_transform(df["content"])

In [None]:
from sklearn.neighbors import NearestCentroid

X = np.array(x.toarray()[int(len(df["type_id"])/2):])
y = np.array(df["type_id"][int(len(df["type_id"])/2):])
clf = NearestCentroid()
clf.fit(X, y)

In [171]:
test_X = x.toarray()[:int(len(df["type_id"])/2)]
test_y = df["type_id"][:int(len(df["type_id"])/2)]

predictions = clf.predict(np.array(test_X))

failCounter = 0
for i in range(int(len(df["type_id"])/2)):
    if (predictions[i] != test_y[i]):
        failCounter += 1
        
print("Number of mislabeled points out of a total %d points : %d" % (len(test_X), failCounter))

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [172]:
### Naive bayes

In [173]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X_train, X_test, y_train, y_test = train_test_split(x.toarray(), df["type_id"], test_size=0.5, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [174]:
### Decision trees

In [175]:
from sklearn import tree

X = x.toarray()[int(len(df["type_id"])/2):]
y = df["type_id"][int(len(df["type_id"])/2):]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_X, train_y)

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [176]:
test_X = x.toarray()[:int(len(df["type_id"])/2)]
test_y = df["type_id"][:int(len(df["type_id"])/2)]

predictions = clf.predict(np.array(train_X))

failCounter = 0
for i in range(int(len(df["type_id"])/2)):
    if (predictions[i] != test_y[i]):
        failCounter += 1
        
print("Number of mislabeled points out of a total %d points : %d" % (len(test_X), failCounter))

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [177]:
### Support Vector Machines

In [178]:
from sklearn import svm
X = x.toarray()[int(len(df["type_id"])/2):]
y = df["type_id"][int(len(df["type_id"])/2):]
clf = svm.SVC(gamma='scale')
clf.fit(X, y)

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [179]:
### note testen gik meget godt, men det betyder ikke man har en god model ;)
clf.predict(np.array(train_X))

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [180]:
test_X = x.toarray()[:int(len(df["type_id"])/2)]
test_y = df["type_id"][:int(len(df["type_id"])/2)]

predictions = clf.predict(np.array(train_X))

failCounter = 0
for i in range(int(len(df["type_id"])/2)):
    if (predictions[i] != test_y[i]):
        failCounter += 1
        
print("Number of mislabeled points out of a total %d points : %d" % (len(test_X), failCounter))

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [181]:
# Med Lair datasæt

In [182]:
### nearest hood

In [183]:
X_train, X_test, y_train, y_test = train_test_split(x.toarray(), df_train["rating"], test_size=0.5, random_state=0)

MemoryError: Unable to allocate 454. GiB for an array with shape (130530, 466973) and data type float64

In [184]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
x = v.fit_transform(df_train["content"])

NameError: name 'df_train' is not defined

In [185]:
from sklearn.neighbors import NearestCentroid

clf = NearestCentroid()
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'how to easily understand the difference between the day of christ and the day of lord in the end times'

In [186]:
### Naive bias

In [187]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

ValueError: could not convert string to float: 'how to easily understand the difference between the day of christ and the day of lord in the end times'

In [188]:
### Decision trees

In [189]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'how to easily understand the difference between the day of christ and the day of lord in the end times'

In [190]:
### Support Vector Machine

In [191]:
from sklearn import svm

clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'how to easily understand the difference between the day of christ and the day of lord in the end times'

In [192]:
### test model

In [193]:
predictions = clf.predict(np.array(test_X))

failCounter = 0
for i in range(int(len(df_test["rating"])/2)):
    if (predictions[i] != test_y[i]):
        failCounter += 1
        
print("Number of mislabeled points out of a total %d points : %d" % (len(test_X), failCounter))

NameError: name 'test_X' is not defined

In [194]:
from sklearn.metrics import f1_score

f1_score(y_test, predictions, average='macro')

NameError: name 'predictions' is not defined