### Thesis ADS: Rosa Lucassen

# Loading the data

In [1]:
import pickle
import pandas as pd
import numpy as np
from numpy import array

import spacy
import stop_words
nlp = spacy.load("en_core_web_sm")
nlp.vocab["covid"].is_stop = True
nlp.vocab["corona"].is_stop = True
nlp.vocab["covid19"].is_stop = True
nlp.vocab["covid-19"].is_stop = True
nlp.vocab["coronavirus"].is_stop = True

import re
import emoji
from collections import Counter
from scipy.stats import chi2_contingency
from urllib.parse import urlparse
from nltk.stem.porter import PorterStemmer

import gensim
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import logging
import pyLDAvis.gensim
import json

import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
stemmer = PorterStemmer()

# Function: Cleaning data

In [2]:
def remove_single_char(text):
    """Replace isolated single letters, with their surrounding whitespace, by one space.

    A run of consecutive single letters such as ``" a b "`` is removed as a
    whole. The earlier pattern consumed the whitespace that followed each
    match, so every second letter in such a run was left behind.

    A single letter at the very start or end of ``text`` has whitespace on one
    side only and is kept.
    """
    # One or more "<whitespace><letter>" groups, closed by trailing whitespace.
    single_char_pattern = r'(?:\s+[a-zA-Z])+\s+'
    return re.sub(pattern=single_char_pattern, repl=" ", string=text)

def remove_numbers(text):
    """Replace digit runs with a single space.

    A run is replaced only when it starts at a word boundary and is not
    directly next to another digit or a hyphen, so ``covid-19`` is untouched.
    """
    return re.sub(r'\b(?<![0-9-])(\d+)(?![0-9-])', " ", text)


def remove_urls(text):
    """Replace every ``http(s)://`` or ``www.`` link in ``text`` with a single space."""
    return re.sub(r'https?://\S+|www\.\S+', " ", text)


def remove_emojis(text):
    """Delete emoji and other pictographic code points from ``text``.

    Two of the ranges are much wider than emoji alone (U+24C2 to U+1F251 and
    U+10000 to U+10FFFF), so other characters inside them, CJK text for
    example, are removed as well.
    """
    unwanted = "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # very broad: includes CJK
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    ))
    return re.sub("[" + unwanted + "]+", '', text, flags=re.UNICODE)

def remove_nonalpha(text):
    """Replace each of the symbols ``+ $ @ # ? ~`` with a space."""
    return re.sub("[+$@#?~]", " ", text)
    
def remove_rt(text):
    """Replace retweet prefixes of the form ``rt @user: `` with a single space.

    Changes from the earlier pattern:
    * raw string, so ``\\w`` is no longer an invalid escape sequence;
    * ``\\b`` keeps the pattern from firing inside a word ("start @x: ");
    * case-insensitive, so ``RT @user: `` is also removed when the text has
      not been lower-cased first.
    """
    rt_pattern = r'\brt @\w+: '
    return re.sub(pattern=rt_pattern, repl=" ", string=text, flags=re.IGNORECASE)

def remove_extra_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
    return re.sub(r'\s+', " ", text)

def strip_list_noempty(mylist):
    """Return ``mylist`` with strippable items stripped and empty strings removed.

    Items without a ``strip`` method are passed through unchanged.
    """
    kept = []
    for entry in mylist:
        if hasattr(entry, 'strip'):
            entry = entry.strip()
        if entry != '':
            kept.append(entry)
    return kept

# Function: BiGrams

In [3]:
from gensim.models import Phrases
def make_n_grams(docs, min_count=1):
    """Append the bigrams detected by gensim ``Phrases`` to each document.

    Args:
        docs: list of token lists. The inner lists are extended in place.
        min_count: forwarded to ``Phrases``. The default of 1 keeps the
            behaviour this notebook has always had; raise it (e.g. to 20) to
            keep only frequently occurring bigrams.

    Returns:
        The same ``docs`` list, with the bigram tokens appended.
    """
    bigram = Phrases(docs, min_count=min_count)
    for doc in docs:
        # Collect first, then extend, so the document is not modified while
        # Phrases is still reading it. Any token containing '_' is treated as
        # a bigram, including tokens that already had one before joining.
        joined = [token for token in bigram[doc] if '_' in token]
        doc.extend(joined)
    return docs

# Function: Lemmatizing texts

In [None]:
def _prepare_topic_model_tokens(corpus):
    """Clean, lemmatise and bigram a collection of tweets.

    Shared implementation of ``pre_processer_real`` and ``pre_processer_fake``,
    which previously carried two identical copies of this code.

    Returns a 4-tuple:
        counts          -- ``Counter`` of every token over all tweets
        most_common     -- ``counts.most_common()``
        flat_tokens     -- all tokens of all tweets in one flat list
        tokenized_texts -- one token list per tweet (bigrams appended)
    """
    cleaned = []
    for text in corpus:
        text = remove_urls(text.lower())
        text = remove_single_char(text)
        text = remove_numbers(text)
        text = remove_rt(text)
        text = remove_emojis(text)
        text = remove_nonalpha(text)
        # Drop whatever non-ASCII characters are still present.
        text = text.encode("ascii", "ignore").decode()
        cleaned.append(remove_extra_spaces(text))
    # Tweets that end up empty after cleaning are discarded here.
    cleaned = strip_list_noempty(cleaned)

    docs = nlp.pipe(cleaned, disable=["ner", "parser"])
    tokenized_texts = [
        [token.lemma_ for token in doc
         if not token.is_punct and not token.is_stop]
        for doc in docs
    ]
    tokenized_texts = make_n_grams(tokenized_texts)

    flat_tokens = [token for tokens in tokenized_texts for token in tokens]
    # Raw counts. The per-10,000 normalisation that used to be sketched here
    # was commented out, which left a loop that did nothing.
    counts = Counter(flat_tokens)
    return counts, counts.most_common(), flat_tokens, tokenized_texts


def pre_processer_real(corpus):
    """Pre-process the real-news tweets for topic modelling.

    Returns ``(counts, most_common, flat_tokens, tokenized_texts)``.
    """
    return _prepare_topic_model_tokens(corpus)


def pre_processer_fake(corpus):
    """Pre-process the fake-news tweets for topic modelling.

    Returns ``(counts, most_common, flat_tokens, tokenized_texts)``.
    """
    return _prepare_topic_model_tokens(corpus)

# Function: distinctive words

In [None]:
def distinctive_words(target_corpus, reference_corpus):
    """Score every word by how distinctive it is for the target corpus.

    Both arguments are flat iterables of tokens (flatten per-document token
    lists before calling). For each word in the joint vocabulary a 2x2 table
    (word vs. all other words, target vs. reference) is tested with
    ``chi2_contingency(..., lambda_='log-likelihood')``.

    Returns:
        DataFrame with columns ``word``, ``llr`` and ``p_value``. ``llr`` is
        negated when the word is relatively more frequent in the reference
        corpus, so positive values mark words typical of the target. An empty
        vocabulary yields an empty frame with the same three columns.

    NOTE: ``chi2_contingency`` applies Yates' continuity correction to 2x2
    tables by default (``correction=True``).
    """
    counts_c1 = Counter(target_corpus)
    counts_c2 = Counter(reference_corpus)
    freq_c1_total = sum(counts_c1.values())
    freq_c2_total = sum(counts_c2.values())

    results = []
    for word in counts_c1.keys() | counts_c2.keys():
        freq_c1 = counts_c1[word]
        freq_c2 = counts_c2[word]
        freq_c1_other = freq_c1_total - freq_c1
        freq_c2_other = freq_c2_total - freq_c2
        llr, p_value, _, _ = chi2_contingency(
            [[freq_c1, freq_c2],
             [freq_c1_other, freq_c2_other]],
            lambda_='log-likelihood',
        )
        # Same comparison as freq_c2 / freq_c2_other > freq_c1 / freq_c1_other,
        # cross-multiplied so a zero "other" count cannot raise
        # ZeroDivisionError (a corpus made of a single word type).
        if freq_c2 * freq_c1_other > freq_c1 * freq_c2_other:
            llr = -llr
        results.append({'word': word, 'llr': llr, 'p_value': p_value})
    return pd.DataFrame(results, columns=['word', 'llr', 'p_value'])

# Function: Topic modelling LDA Gensim

In [None]:
# Fits a Mallet LDA model and tabulates the top words of every topic.
def topic_modelling_results(tokenized_text, n_topics, n_iterations,
                            path_to_mallet='/Users/rosalucassen/Documents/UU/DataMining/mallet-2.0.8/bin/mallet',
                            topn=10):
    """Train an ``LdaMallet`` model and list the top words of each topic.

    Args:
        tokenized_text: list of token lists, one per document.
        n_topics: number of topics to fit.
        n_iterations: number of Mallet sampling iterations.
        path_to_mallet: location of the Mallet executable. Defaults to the
            path this notebook was developed with.
        topn: how many words to list per topic (``show_topic`` default of 10).

    Returns:
        ``(results_DF, lda, dictionary)``: a frame with one row per topic
        (columns ``Topic`` and ``Words``), the fitted model, and the gensim
        ``Dictionary`` built from ``tokenized_text``.
    """
    dictionary = Dictionary(tokenized_text)  # get the vocabulary
    corpus = [dictionary.doc2bow(text) for text in tokenized_text]

    lda = LdaMallet(path_to_mallet,
                    corpus=corpus,
                    id2word=dictionary,
                    num_topics=n_topics,
                    iterations=n_iterations)

    results = [
        {'Topic': str(topic),
         'Words': ' '.join(word for word, _ in lda.show_topic(topic, topn=topn))}
        for topic in range(n_topics)
    ]
    results_DF = pd.DataFrame(results)
    # Global pandas option: stop truncating long cell text, so the full word
    # list of every topic is visible when the frame is displayed.
    pd.set_option('display.max_colwidth', None)
    return results_DF, lda, dictionary

# Function: Compute Coherence Scores

In [None]:
def compute_coherence_values(texts, start, step, limit, topn,
                             path_to_mallet='/Users/rosalucassen/Documents/UU/DataMining/mallet-2.0.8/bin/mallet',
                             alpha=0.05, iterations=100):
    """Fit one Mallet LDA model per topic count and score each with c_v coherence.

    Args:
        texts: list of token lists, one per document.
        start, step, limit: topic counts tried are ``range(start, limit, step)``.
        topn: number of top words per topic used by the coherence measure.
        path_to_mallet: location of the Mallet executable. Defaults to the
            path this notebook was developed with.
        alpha: ``alpha`` passed to ``LdaMallet`` (previously fixed at 0.05).
        iterations: Mallet sampling iterations (previously fixed at 100).

    Returns:
        ``(model_list, coherence_values)``, aligned with the topic counts tried.
    """
    dictionary = Dictionary(texts)  # get the vocabulary
    # Drop tokens seen in fewer than 5 documents or in more than half of them.
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]

    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(
            path_to_mallet,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            alpha=alpha,
            iterations=iterations,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v', topn=topn)
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

## Exploring the data

In [4]:
# The four Constraint spreadsheets are all read with the openpyxl engine, in
# the order test, train, validation, labelled test.
d_test, d_train, d_val, d_test_labeled = (
    pd.read_excel(path, engine='openpyxl')
    for path in (
        "/Users/rosalucassen/Desktop/Thesis/Code/Data/Constraint_English_Test2.xlsx",
        "/Users/rosalucassen/Desktop/Thesis/Code/Data/Constraint_English_Train.xlsx",
        "/Users/rosalucassen/Desktop/Thesis/Code/Data/Constraint_English_Val.xlsx",
        "/Users/rosalucassen/Desktop/Thesis/Code/Data/english_test_with_labels.xlsx",
    )
)

#### Training data

In [None]:
d_train.info()

In [None]:
d_train.head()

In [None]:
d_train["label"].unique()

In [None]:
grouped_train = d_train.groupby('label').count() 
grouped_train.head()

#### Validation data

In [None]:
d_val.info()

In [None]:
d_val.head()

In [None]:
d_val["label"].unique()

In [None]:
grouped_val = d_val.groupby('label').count() 
grouped_val.head()

#### Test data

In [None]:
d_test.info()

In [None]:
d_test.head()

In [None]:
d_test_labeled.info()

In [None]:
d_test_labeled.head()

In [None]:
d_test_labeled["label"].unique()

In [None]:
grouped_test = d_test_labeled.groupby('label').count() 
grouped_test.head()

#### Combine data

In [5]:
data_train = pd.concat([d_train, d_test_labeled, d_val])
data_train.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [None]:
data_train.info()

In [None]:
grouped_train = data_train.groupby('label').count() 
grouped_train.head()

## Results: Pre-processing

In [None]:
corpus_train_fake = data_train[data_train.label == "fake"]["tweet"]
corpus_train_real = data_train[data_train.label == "real"]["tweet"]
word_count_fake, output_fake, flat_fake, tokenized_texts_fake = pre_processer_fake(corpus_train_fake)
print(output_fake[:20])
word_count_real, output_real, flat_real, tokenized_texts_real = pre_processer_real(corpus_train_real)
print(output_real[:20])

In [None]:
import nltk
from nltk import FreqDist
fdist_filtered = FreqDist(word_count_fake)
fdist_filtered.plot(70,title='Frequency distribution fake news')

In [None]:
# Frequency distribution of the 70 most common tokens in the real-news tweets.
# The title used to read "fake news", copied from the cell above.
fdist_filtered = FreqDist(word_count_real)
fdist_filtered.plot(70, title='Frequency distribution real news')

# Results: Optimal number of topics

In [None]:
all_scores = []
for n in range(5, 15, 1):
    model_list, coherence_values = compute_coherence_values(texts=tokenized_texts_fake, start=1, step=1, limit=12, topn=n)
    all_scores.append(coherence_values)

In [None]:
all_scores = np.array(all_scores)
avg_scores = np.average(all_scores, axis=0)

In [None]:
print(avg_scores)

In [None]:
# Plot the averaged c_v coherence score against the number of topics.
import matplotlib.pyplot as plt

# Must match the start/limit/step used when the scores were computed.
limit = 12
start = 1
step = 1
x = range(start, limit, step)
# Label the line itself: plt.legend(("coherence_values")) received a plain
# string (the parentheses do not make a tuple), so the legend showed only "c".
plt.plot(x, avg_scores, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
all_scores = []
for n in range(5, 20, 5):
    model_list, coherence_values = compute_coherence_values(texts=tokenized_texts_real, start=1, step=1, limit=30, topn=n)
    all_scores.append(coherence_values)

In [None]:
all_scores = np.array(all_scores)
avg_scores = np.average(all_scores, axis=0)

In [None]:
print(avg_scores)

In [None]:
# Plot the averaged c_v coherence score against the number of topics.
import matplotlib.pyplot as plt

# Must match the start/limit/step used when the scores were computed.
limit = 30
start = 1
step = 1
x = range(start, limit, step)
# Label the line itself: plt.legend(("coherence_values")) received a plain
# string (the parentheses do not make a tuple), so the legend showed only "c".
plt.plot(x, avg_scores, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

## Results: Topic modelling LDA Gensim

#### Fake news

In [None]:
topics, lda_fake, dictionary = topic_modelling_results(tokenized_texts_fake, 4, n_iterations = 1000)
topics

#### Real news

In [None]:
topics_real, lda_real, dictionary = topic_modelling_results(tokenized_texts_real, 25, n_iterations = 1000)
topics_real

## Document topic distribution

###### Loop over documents to get topic distribution: Fake

In [None]:
transformed_docs = lda_fake.load_document_topics()

In [None]:
for i, document in enumerate(transformed_docs):
    print('Topic distributions for document {}'.format(i))
    for topic in document:
        print(topic)

In [None]:
# One row per fake-news document, one column per topic weight.
transformed_docs = lda_fake.load_document_topics()
topic_distributions_fake = pd.DataFrame(
    [[weight for _, weight in doc] for doc in transformed_docs],
    # Take the column count from the model: the hard-coded range(8) did not
    # match the 4-topic model fitted above and made the constructor fail.
    columns=['topic_{}'.format(i) for i in range(lda_fake.num_topics)])
topic_distributions_fake.tail()
topic_distributions_fake.mean().sort_values(ascending=False)

###### Loop over documents to get topic distribution: Real

In [None]:
transformed_docs = lda_real.load_document_topics()
for i, document in enumerate(transformed_docs):
    print('Topic distributions for document {}'.format(i))
    for topic in document:
        print(topic)

In [None]:
# One row per real-news document, one column per topic weight.
transformed_docs = lda_real.load_document_topics()
topic_distributions_real = pd.DataFrame(
    [[weight for _, weight in doc] for doc in transformed_docs],
    # Take the column count from the model: the hard-coded range(20) did not
    # match the 25-topic model fitted above and made the constructor fail.
    columns=['topic_{}'.format(i) for i in range(lda_real.num_topics)])
topic_distributions_real.tail()
#topic_distributions.mean().sort_values(ascending=False)

## Find optimal number of topics using Gensim method 2

This method takes a very long time, but it also gives alpha and beta values.

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one ``LdaMulticore`` model and return its c_v coherence.

    NOTE: this redefines the ``compute_coherence_values`` declared earlier in
    the notebook with a different signature.

    Args:
        corpus: bag-of-words corpus.
        dictionary: gensim ``Dictionary`` matching ``corpus``.
        k: number of topics.
        a: ``alpha`` prior (number, ``'symmetric'`` or ``'asymmetric'``).
        b: ``eta`` prior (number or ``'symmetric'``).
        texts: tokenised documents the coherence is computed on. They must be
            the documents ``corpus`` and ``dictionary`` were built from.
            Defaults to the global ``tokenized_texts_real``, which is what the
            grid-search cell in this notebook builds its corpus from. The
            function used to read ``tokenized_texts_fake`` unconditionally,
            scoring a real-news model against fake-news texts.

    Returns:
        The c_v coherence score as a float.
    """
    if texts is None:
        texts = tokenized_texts_real
    lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary,
                                           num_topics=k, alpha=a, eta=b)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()
    

In [None]:
import itertools

import numpy as np
import tqdm

grid = {}

dictionary = Dictionary(tokenized_texts_real)  # get the vocabulary
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts_real]
PATH_TO_MALLET = '/Users/rosalucassen/Documents/UU/DataMining/mallet-2.0.8/bin/mallet'

# Topics range
min_topics = 5
max_topics = 40
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
num_of_docs = len(corpus)
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: one LDA model is trained per combination.
# Same order as the nested loops it replaces: topics outermost, beta innermost.
parameter_grid = list(itertools.product(topics_range, alpha, beta))
# The bar total is derived from the grid; the hard-coded 270 did not match
# the number of combinations actually evaluated.
with tqdm.tqdm(total=len(parameter_grid)) as pbar:
    for k, a, b in parameter_grid:
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=corpus, dictionary=dictionary, k=k, a=a, b=b)
        # Save the model results
        model_results['Topics'].append(k)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)
        pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results_real.csv', index=False)

# Pre-processing emotion detection data

In [6]:
# Flag pandemic and reporting vocabulary as stop words on the shared spaCy
# vocabulary. Each spelling and casing is listed separately because the flag
# is set on the exact string looked up.
for _extra_stop_word in (
    "covid", "corona", "covid19", "covid-19", "coronavirus",
    "COVID-19", "case", "Coronavirus", "Corona", "virus", "Virus",
    "COVID", "COVID-19.",
    "UPDATE", "update", "Update",
    "Case", "Cases", "cases",
    "New", "new",
    "Report", "REPORT", "report",
    "article", "Article",
    "claim",
    "test", "tested", "tests", "Test", "Tested", "Tests",
):
    nlp.vocab[_extra_stop_word].is_stop = True

In [10]:
def _clean_and_drop_stop_words(corpus):
    """Clean each tweet and remove stop words, keeping the original casing.

    Shared implementation of ``pre_processer_real`` and ``pre_processer_fake``,
    which previously carried two identical copies of this code.

    Lower-casing and ``remove_emojis`` are not applied here (they were
    commented out in the original cell); non-ASCII characters, emoji included,
    are still dropped by the ASCII round-trip.

    The earlier version also ran ``nlp(text)`` on every tweet and then read
    ``Doc.text`` back, which is the input text again, so the parse was paid
    for and discarded. Stop words are looked up directly instead.

    Returns:
        List of cleaned tweets as strings; tweets left empty are omitted.
    """
    filtered_posts = []
    for text in corpus:
        text = remove_urls(text)
        text = remove_single_char(text)
        text = remove_numbers(text)
        text = remove_rt(text)
        text = remove_nonalpha(text)
        text = text.encode("ascii", "ignore").decode()
        # Whitespace split: punctuation stays attached to its word, so
        # "COVID-19." and "COVID-19" are looked up as different strings.
        kept_words = [word for word in text.split()
                      if not nlp.vocab[word].is_stop]
        filtered_posts.append(" ".join(kept_words))
    return strip_list_noempty(filtered_posts)


def pre_processer_real(corpus):
    """Pre-process the real-news tweets for sentiment analysis."""
    return _clean_and_drop_stop_words(corpus)


def pre_processer_fake(corpus):
    """Pre-process the fake-news tweets for sentiment analysis."""
    return _clean_and_drop_stop_words(corpus)

In [None]:
# Scratch cell: collects the text of every item in `my_doc`, then keeps the
# entries whose vocabulary lexeme is not flagged as a stop word.
# NOTE(review): in the visible notebook `my_doc` is only assigned inside
# pre_processer_real / pre_processer_fake, so this cell raises NameError
# unless `my_doc` was defined interactively. Confirm, or delete the cell.
token_list = []
for token in my_doc:
    token_list.append(token.text)

# NOTE(review): STOP_WORDS is imported but not used in this cell.
from spacy.lang.en.stop_words import STOP_WORDS

# Create list of word tokens after removing stopwords
filtered_sentence =[] 

for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_sentence.append(word) 
print(token_list)
print(filtered_sentence)   

In [11]:
corpus_fake = data_train[data_train.label == "fake"]["tweet"]
corpus_real = data_train[data_train.label == "real"]["tweet"]
#corpus = data_train["tweet"]
corpus_fake = pre_processer_fake(corpus_fake)
print(corpus_fake)
corpus_real = pre_processer_real(corpus_real)
print(corpus_real)
#corpus = pre_processer_fake(corpus)





In [12]:
# Number of pre-processed fake tweets whose summed element lengths stay below
# 200. For a string, that sum is simply its character count.
count = sum(1 for tweet in corpus_fake if sum(map(len, tweet)) < 200)
print(count)

4889


In [13]:
# Number of pre-processed real tweets whose summed element lengths stay below
# 200. For a string, that sum is simply its character count.
count = sum(1 for tweet in corpus_real if sum(map(len, tweet)) < 200)
print(count)

5263


## VADER Sentiment Analysis

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

#### Real News

In [None]:
df_real = pd.DataFrame(corpus_real)
df_real.columns = ["Tweet"]
df_real

In [None]:
df_real['scores'] = df_real['Tweet'].apply(lambda tweet: sid.polarity_scores(tweet))
df_real.head()

In [None]:
df_real['compound']  = df_real['scores'].apply(lambda score_dict: score_dict['compound'])
df_real.head()

In [None]:
# Map a VADER compound score to a class label:
# below -0.1 -> "neg", above 0.1 -> "pos", everything else (both bounds
# included) -> 'neu'.
def func(compound):
    return "neg" if compound < -0.1 else "pos" if compound > 0.1 else 'neu'
df_real['Class'] = df_real['compound'].apply(func)
df_real.head()

In [None]:
df_real["compound"].mean()

In [None]:
df_real['compound'].plot(kind = 'hist')

#### Fake News

In [None]:
df_fake = pd.DataFrame(corpus_fake)
df_fake.columns = ["Tweet"]
df_fake

In [None]:
df_fake['scores'] = df_fake['Tweet'].apply(lambda tweet: sid.polarity_scores(tweet))
df_fake.head()

In [None]:
df_fake['compound']  = df_fake['scores'].apply(lambda score_dict: score_dict['compound'])
df_fake.head()

In [None]:
# Map a VADER compound score to a class label:
# below -0.1 -> "neg", above 0.1 -> "pos", everything else (both bounds
# included) -> 'neu'.
def func(compound):
    return "neg" if compound < -0.1 else "pos" if compound > 0.1 else 'neu'
df_fake['Class'] = df_fake['compound'].apply(func)
df_fake.head()

In [None]:
df_fake["compound"].mean()

In [None]:
df_fake['compound'].plot(kind = 'hist')

#### All News

In [None]:
# Builds the all-news frame from `corpus`, one tweet per row.
# NOTE(review): the assignment `corpus = pre_processer_fake(corpus)` in the
# pre-processing cell above is commented out, and the grid-search cell binds
# `corpus` to a bag-of-words list. Confirm `corpus` holds the cleaned tweets
# of all news before running this cell.
df = pd.DataFrame(corpus)
df.columns = ["Tweet"]
df

In [None]:
df['scores'] = df['Tweet'].apply(lambda tweet: sid.polarity_scores(tweet))
df.head()

In [None]:
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df.head()

In [None]:
# Map a VADER compound score to a class label:
# below -0.1 -> "neg", above 0.1 -> "pos", everything else (both bounds
# included) -> 'neu'.
def func(compound):
    return "neg" if compound < -0.1 else "pos" if compound > 0.1 else 'neu'
df['Class'] = df['compound'].apply(func)
df.head()

In [None]:
df["compound"].mean()

In [None]:
df['compound'].plot(kind = 'hist')

## LIWC Sentiment Analysis

#### Fake news

In [None]:
def pre_processer(corpus):
    """Return the cleaned, lower-cased tweets of ``corpus`` as a list of strings.

    Each tweet is lower-cased, then stripped of URLs, isolated single letters,
    standalone numbers, retweet prefixes, emoji, the symbols handled by
    ``remove_nonalpha`` and any remaining non-ASCII characters. Whitespace is
    then collapsed. Tweets that end up empty are left out.
    """
    text_filters = (
        remove_urls,
        remove_single_char,
        remove_numbers,
        remove_rt,
        remove_emojis,
        remove_nonalpha,
    )
    cleaned = []
    for text in corpus:
        text = text.lower()
        for text_filter in text_filters:
            text = text_filter(text)
        text = text.encode("ascii", "ignore").decode()
        cleaned.append(remove_extra_spaces(text))
    return strip_list_noempty(cleaned)

import liwc
parse, category_names = liwc.load_token_parser('liwc.dic')

In [None]:
corpus_fake = data_train[data_train.label == "fake"]["tweet"]
corpus_real = data_train[data_train.label == "real"]["tweet"]
corpus = data_train["tweet"]
corpus_fake = pre_processer(corpus_fake)
print(corpus_fake)
corpus_real = pre_processer(corpus_real)
print(corpus_real)
corpus = pre_processer(corpus)

In [None]:
# LIWC categories are looked up per token. `corpus_fake` holds whole tweets,
# so each tweet is split into word tokens before it is handed to `parse`;
# previously the entire tweet string was passed as if it were one token.
feature_counts = Counter()
for tweet in corpus_fake:
    for token in re.findall(r'\w+', tweet):
        feature_counts.update(parse(token))
print(feature_counts)

#### Real news

In [None]:
# LIWC categories are looked up per token. `corpus_real` holds whole tweets,
# so each tweet is split into word tokens before it is handed to `parse`;
# previously the entire tweet string was passed as if it were one token.
feature_counts = Counter()
for tweet in corpus_real:
    for token in re.findall(r'\w+', tweet):
        feature_counts.update(parse(token))
print(feature_counts)

#### All news

In [None]:
# LIWC categories are looked up per token. `corpus` holds whole tweets, so
# each tweet is split into word tokens before it is handed to `parse`;
# previously the entire tweet string was passed as if it were one token.
feature_counts = Counter()
for tweet in corpus:
    for token in re.findall(r'\w+', tweet):
        feature_counts.update(parse(token))
print(feature_counts)

## Results: Most distinctive words
This gives an extreme output, why?

In [None]:
results_df = distinctive_words(flat_real, flat_fake)
results_df.sort_values('llr', ascending=False).head(5)