### Import Libraries

In [None]:
import os
import string

import numpy as np
import pandas as pd
from newspaper import Article

## Dataset Creation

### Scraping Real News

In [None]:
# Read the news-aggregator CSV into a DataFrame and preview it.
# NOTE(review): "/content/..." looks like a Colab checkout location — adjust
# the path when running in another environment.
path = "/content/UCI-News-Aggregator-Classifier/data/uci-news-aggregator.csv"
df = pd.read_csv(path)
# Last expression of the cell: displays the first rows.
df.head()


In [None]:
# Bar chart of how many rows fall into each CATEGORY value.
df["CATEGORY"].value_counts().plot.bar()

In [None]:
# Bar chart of the 20 most frequent HOSTNAME values.
df["HOSTNAME"].value_counts().head(20).plot.bar()

In [None]:
# Output layout: ./articles/real receives the scraped articles and
# ./articles/fake receives the generated ones.
root = "./articles"
fake = os.path.join(root, "fake")
real = os.path.join(root, "real")

# makedirs(exist_ok=True) is idempotent, so the cell can be re-run safely, and
# the loop variable no longer shadows the built-in `dir`.
for directory in (root, real, fake):
  os.makedirs(directory, exist_ok=True)


In [None]:
# Draw a balanced subset: the same number of rows from every CATEGORY.
# GroupBy.sample keeps CATEGORY as a regular column (groupby.apply treats the
# grouping column differently across pandas versions) and the fixed seed makes
# the selection reproducible on a fresh kernel.
SAMPLES_PER_CATEGORY = 250
SAMPLE_SEED = 123
df2 = df.groupby('CATEGORY').sample(n=SAMPLES_PER_CATEGORY,
                                    random_state=SAMPLE_SEED)

# Sanity check: every category should now show the same count.
df2["CATEGORY"].value_counts().plot(kind='bar')

In [None]:
# Download every sampled URL and store "<title> \n <body>" in
# <real>/Article_<index>.txt. Articles that fail are reported and skipped, so
# the file indices may have gaps.
URL_LIST = df2["URL"].tolist()
TITLE_LIST = df2["TITLE"].tolist()
for id_url, article_url in enumerate(URL_LIST):
  article = Article(article_url)
  try:
    # Download and parse article
    article.download()
    article.parse()
    text = article.text

    # Save to file (explicit encoding so non-ASCII text is written the same
    # way on every platform)
    filename = os.path.join(real, "Article_{}.txt".format(id_url))
    article_title = TITLE_LIST[id_url]
    with open(filename, "w", encoding="utf-8") as text_file:
      text_file.write(" %s \n %s" % (article_title, text))

  except Exception as error:
    # `Exception` instead of a bare except: the loop stays best-effort but
    # can still be interrupted with Ctrl-C / kernel interrupt, and the cause
    # of the failure is shown.
    print("Could not download the article at: {} ({})".format(article_url, error))


### Generating Fake News

In [None]:
# Load the GPT-2 checkpoint used to generate the fake articles.
import gpt_2_simple as gpt2
# Name of the GPT-2 checkpoint to use; also the sub-directory name under models/.
GPT_MODEL_NAME='774M'

# Download the checkpoint only when it is not already present locally.
if not os.path.exists('models/'+GPT_MODEL_NAME):
    gpt2.download_gpt2(model_name=GPT_MODEL_NAME)

# Start a TensorFlow session and load the checkpoint into it; `sess` is
# reused by the generation cell.
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, model_name=GPT_MODEL_NAME)


In [None]:
# Generate one GPT-2 article per real headline and store it in
# <fake>/Article_<index>.txt using the same "<title> \n <body>" layout as the
# real articles.
for id_title, title in enumerate(TITLE_LIST):
  article = gpt2.generate(sess,
                          model_name=GPT_MODEL_NAME,
                          prefix=title,
                          length=500,
                          temperature=0.8,
                          top_p=0.9,
                          nsamples=1,
                          batch_size=1,
                          return_as_list=True
                          )[0]

  # Use this loop's own index and the generated text. The previous version
  # used `id_url` and `text`, which are leftovers from the scraping loop, so
  # every iteration overwrote one file with a real article body.
  filename = os.path.join(fake, "Article_{}.txt".format(id_title))
  with open(filename, "w", encoding="utf-8") as text_file:
    text_file.write(" %s \n %s" % (title, article))


### Combined Dataset

In [None]:
def _read_articles(directory, label):
  """Read every file in `directory` and pair each text with `label`.

  Returns a `(texts, labels)` tuple of equal-length lists. Files that cannot
  be read are reported and skipped.
  """
  texts = []
  labels = []
  # os.listdir returns bare names in arbitrary order: sort them for a stable
  # dataset order and join them with the directory before opening.
  for name in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, name)
    try:
      with open(file_path, "r", encoding="utf-8") as article_file:
        article = article_file.read()
    except (OSError, UnicodeDecodeError):
      print("Error reading: {}".format(file_path))
      continue
    texts.append(article)
    labels.append(label)
  return texts, labels


# Label convention: 0 = real (scraped) article, 1 = fake (generated) article.
real_texts, real_labels = _read_articles(real, 0)
fake_texts, fake_labels = _read_articles(fake, 1)

X = real_texts + fake_texts
Y = real_labels + fake_labels


## Feature Extraction

In [None]:
# Load the function-word list: one entry per line of the file.
FUNCTION_WORD_FILE = '../static/function_words.txt'
with open(FUNCTION_WORD_FILE,'r') as fwf:
  # Strip whitespace, drop blank lines (an empty string matches everywhere in
  # str.count) and remove duplicates; sorting keeps the order deterministic.
  func_words = sorted({line.strip() for line in fwf if line.strip()})

def calculate_function_words(text, function_words=None):
  """Return the fraction of words in `text` that are function words.

  Args:
    text: document to analyse.
    function_words: optional iterable of function words; defaults to the
      notebook-level `func_words` list.

  Returns:
    Number of function-word tokens divided by the number of tokens, or 0
    when `text` contains no words.
  """
  if function_words is None:
    function_words = func_words
  vocabulary = {word.lower() for word in function_words if word}

  # Compare whole, case-folded tokens with surrounding punctuation removed.
  # Substring counting (str.count) would also count "the" inside "other".
  # NOTE(review): multi-word entries in the list can never match a single
  # token — confirm the word list has none.
  tokens = [token.strip(string.punctuation).lower() for token in text.split()]
  tokens = [token for token in tokens if token]

  if not tokens:
    return 0

  function_word_counter = sum(1 for token in tokens if token in vocabulary)
  return function_word_counter / len(tokens)


In [None]:
def calculate_punctuation(text):
  """Return the number of punctuation characters per word in `text`.

  Punctuation is every character in `string.punctuation`; words are the
  whitespace-separated tokens of `text`. Returns 0 for text without words.
  """
  total_length = len(text.split())
  if total_length == 0:
    return 0

  punctuation_counter = sum(1 for character in text
                            if character in string.punctuation)
  return punctuation_counter / total_length


In [None]:
def calculate_ari(text):
  """Return the Automated Readability Index (ARI) of `text`.

  ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43,
  where characters are the letters and digits of the text, words are the
  whitespace-separated tokens and sentences are the non-empty segments
  between '.', '!' and '?'. Returns 0 when there are no words or sentences.
  """
  # Count characters, not tokens: len(text.split()) is the word count.
  chars = sum(1 for character in text if character.isalnum())
  words = len(text.split())

  # Treat '!' and '?' as sentence terminators too and ignore empty segments
  # (e.g. the one after a trailing full stop).
  normalized = text.replace('!', '.').replace('?', '.')
  sentences = sum(1 for segment in normalized.split('.') if segment.strip())

  if words == 0 or sentences == 0:
    return 0

  return 4.71 * (chars / words) + 0.5 * (words / sentences) - 21.43


In [None]:
# One row per document, columns in fixed order:
# [function-word ratio, punctuation ratio, ARI].
X_Features = [
  [
    calculate_function_words(document),
    calculate_punctuation(document),
    calculate_ari(document),
  ]
  for document in X
]


### Helper function for Evaluation

In [None]:
from sklearn.metrics import confusion_matrix

def evaluate_model(actual, predicted):
  """Compute binary classification metrics for labels 0 (real) and 1 (fake).

  Args:
    actual: ground-truth labels.
    predicted: labels produced by the model, same length as `actual`.

  Returns:
    dict with 'accuracy' (a percentage, 0-100) and 'precision', 'recall',
    'f1' (fractions, 0-1). A metric whose denominator is zero is reported
    as 0.
  """
  # Pin the label set so the matrix is always 2x2; without it the 4-way
  # unpack below fails when only one class occurs in the evaluated data.
  confusion = confusion_matrix(actual, predicted, labels=[0, 1])
  tn, fp, fn, tp = confusion.ravel()

  total = tp + fp + tn + fn

  accuracy = 100 * (tp + tn) / total if total != 0 else 0
  precision = tp / (tp + fp) if tp + fp != 0 else 0
  recall = tp / (tp + fn) if tp + fn != 0 else 0

  if precision == 0 or recall == 0:
    f1 = 0
  else:
    f1 = 2 * precision * recall / (precision + recall)

  evaluation = { 'accuracy': accuracy,
                 'precision': precision,
                 'recall': recall,
                 'f1': f1}

  return evaluation


## Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and every score below) is reproducible on a fresh
# kernel; stratify keeps the real/fake ratio equal in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(X_Features, Y,
                                                    random_state=123,
                                                    stratify=Y)


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on the training split and score on the held-out split.
model = LogisticRegression()
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded like the MLP cells so repeated runs give the same forest.
model = RandomForestClassifier(n_estimators = 100, random_state = 123)
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
# Three hidden layers (50, 25, 10 units) trained on the feature vectors.
model = MLPClassifier(hidden_layer_sizes = (50, 25, 10),
                      max_iter = 100,
                      activation = 'relu',
                      solver = 'adam',
                      random_state = 123)
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


### Support Vector Machine

In [None]:
from sklearn import svm
# Linear-kernel support vector classifier.
model = svm.SVC(kernel='linear')
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF (and every text-based cell after this one) needs the raw article
# strings, but X_train / X_test currently hold the numeric feature vectors
# built from X_Features. Re-split the raw texts `X` and rebind the four split
# names so that from here on they refer to text documents and their labels.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    random_state=123,
                                                    stratify=Y)

# Fit the vocabulary and IDF weights on the training texts only, then apply
# the same transformation to the test texts.
tf_idf = TfidfVectorizer()
X_train_TFIDF = tf_idf.fit_transform(X_train)
X_test_TFIDF = tf_idf.transform(X_test)


In [None]:
from sklearn.neural_network import MLPClassifier
# Three hidden layers (300, 200, 100 units) trained on the TF-IDF matrix.
model = MLPClassifier(hidden_layer_sizes = (300, 200, 100),
                      max_iter = 100,
                      activation = 'relu',
                      solver = 'adam',
                      random_state = 123)
model.fit(X_train_TFIDF, Y_train)
Y_predicted = model.predict(X_test_TFIDF)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


In [None]:
import nltk
nltk.download('punkt')

# Build the Word2Vec training corpus: a flat list of tokenised sentences.
# NOTE(review): the tokenisers need each element of X_train to be a raw text
# string — confirm that is what X_train holds at this point.
corpus = []
for x in X_train:
  # Split into sentences
  sentences_tokens = nltk.sent_tokenize(x)
  # Split each sentence into words
  word_tokens = [nltk.word_tokenize(sent) for sent in sentences_tokens]
  # Add to corpus in place; `corpus = corpus + word_tokens` copied the whole
  # list on every iteration.
  corpus.extend(word_tokens)


In [None]:
# Train 30-dimensional word embeddings on the tokenised training corpus;
# min_count=1 keeps every token that occurs at least once.
# NOTE(review): the `vector_size` keyword presumably requires gensim >= 4 —
# confirm the installed version.
from gensim.models import Word2Vec
model = Word2Vec(corpus, min_count=1, vector_size = 30)


In [None]:
# Represent each training document by the mean of its word vectors.
# Out-of-vocabulary words contribute a zero vector but still count in the
# denominator.
# NOTE(review): words are looked up after a plain split(' ') while the corpus
# was tokenised with nltk — tokens with attached punctuation will be treated
# as out-of-vocabulary; confirm this is intended.
X_train_vector_mean = []
# Zero vector for out-of-vocab words (kept as a notebook-level name).
oov = np.zeros(model.wv.vector_size)
for x in X_train:
  # Start from an all-zero accumulator of the embedding size
  vector = np.zeros(model.wv.vector_size)

  words = x.split(' ')
  for word in words:
    # `word in model.wv` / `model.wv[word]` is the lookup API available in
    # both gensim 3 and 4 (`model.wv.vocab` and `model[word]` were removed).
    if word in model.wv:
      vector = vector + model.wv[word]

  # Calculate the mean
  mean_vector = vector / len(words)
  X_train_vector_mean.append(mean_vector)


In [None]:
def _appended_word_vectors(texts, w2v, max_words):
  """Concatenate the vectors of the first `max_words` words of each text.

  Every returned row has length `max_words * w2v.wv.vector_size`.
  Out-of-vocabulary words and the positions after the end of a short text
  are left as zeros.
  """
  dim = w2v.wv.vector_size
  rows = []
  for text in texts:
    row = np.zeros(max_words * dim)
    # Slicing caps the number of words (the previous max(...) never did).
    for position, word in enumerate(text.split(' ')[:max_words]):
      if word in w2v.wv:
        row[position * dim:(position + 1) * dim] = w2v.wv[word]
    rows.append(row)
  return rows


max_words = 40
# Build the test matrix as well: the classifier cell predicts on it.
X_train_vector_appended = _appended_word_vectors(X_train, model, max_words)
X_test_vector_appended = _appended_word_vectors(X_test, model, max_words)


In [None]:
from sklearn.neural_network import MLPClassifier
# Four hidden layers (1000, 700, 500, 200 units) trained on the concatenated
# word-vector features.
model = MLPClassifier(hidden_layer_sizes = (1000, 700, 500, 200),
                      max_iter = 100,
                      activation = 'relu',
                      solver = 'adam',
                      random_state = 123)
model.fit(X_train_vector_appended, Y_train)
Y_predicted = model.predict(X_test_vector_appended)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))


In [None]:
import torch
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel

# Tokenizer and model must come from the same checkpoint; the model name was
# missing a hyphen ('bert-base uncased').
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)

# Inference only: switch off dropout.
model.eval()


In [None]:
def _bert_last_layer_features(texts, max_tokens=512):
  """Return one fixed-size vector per text from BERT's final layer.

  Each text is truncated to `max_tokens` word pieces (including [CLS] and
  [SEP]) because BERT cannot encode longer sequences, and the per-token
  outputs are averaged so every document yields a vector of the same length.
  """
  features = []
  for text in texts:
    # Tokenise, truncate, then add CLS and SEP
    tokenized_text = ["[CLS]"] + tokenizer.tokenize(text)[:max_tokens - 2] + ["[SEP]"]
    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
      outputs = model(tokens_tensor)
      # outputs[0] is presumably (1, tokens, hidden); average over tokens.
      feature_vector = outputs[0].mean(dim=1).squeeze(0).numpy()
    features.append(feature_vector)
  return features


X_train_BERT = _bert_last_layer_features(X_train)
X_test_BERT = _bert_last_layer_features(X_test)


In [None]:
def _bert_summed_layers_features(texts, max_tokens=512):
  """Return one fixed-size vector per text from the sum of all hidden states.

  Each text is truncated to `max_tokens` word pieces (including [CLS] and
  [SEP]) because BERT cannot encode longer sequences, and the per-token
  values are averaged so every document yields a vector of the same length.
  """
  features = []
  for text in texts:
    # Tokenise, truncate, then add CLS and SEP
    tokenized_text = ["[CLS]"] + tokenizer.tokenize(text)[:max_tokens - 2] + ["[SEP]"]
    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
      outputs = model(tokens_tensor)
      hidden_states = outputs[2]
      # Sum the hidden states element-wise, then average over tokens.
      summed = torch.stack(hidden_states).sum(0)
      feature_vector = summed.mean(dim=1).squeeze(0).numpy()
    features.append(feature_vector)
  return features


X_train_BERT = _bert_summed_layers_features(X_train)
X_test_BERT = _bert_summed_layers_features(X_test)


In [None]:
def _bert_concat_last4_features(texts, max_tokens=512):
  """Return one fixed-size vector per text from the last four hidden states.

  Each text is truncated to `max_tokens` word pieces (including [CLS] and
  [SEP]) because BERT cannot encode longer sequences. The last four hidden
  states are concatenated along the feature axis and averaged over tokens so
  every document yields a vector of the same length.
  """
  features = []
  for text in texts:
    # Tokenise, truncate, then add CLS and SEP
    tokenized_text = ["[CLS]"] + tokenizer.tokenize(text)[:max_tokens - 2] + ["[SEP]"]
    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
      outputs = model(tokens_tensor)
      hidden_states = outputs[2]
      concatenated = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)
      feature_vector = concatenated.mean(dim=1).squeeze(0).numpy()
    features.append(feature_vector)
  return features


# The test matrix is needed by the classifier cell that follows.
X_train_BERT = _bert_concat_last4_features(X_train)
X_test_BERT = _bert_concat_last4_features(X_test)


In [None]:
from sklearn.neural_network import MLPClassifier
# Four hidden layers (1000, 700, 500, 200 units) trained on the BERT features.
model = MLPClassifier(hidden_layer_sizes = (1000, 700, 500, 200),
                      max_iter = 100,
                      activation = 'relu',
                      solver = 'adam',
                      random_state = 123)
model.fit(X_train_BERT, Y_train)
Y_predicted = model.predict(X_test_BERT)
# Score the predictions just computed (`Y_pred` was never defined).
print(evaluate_model(Y_test, Y_predicted))
