# <u><center>Comparing Texts Core
- Authored by: Eric N. Valdez
- Date: 03/28/2024

# <u>Comparing Texts:
- In this assignment you will prepare and conduct EDA on a text dataset using SpaCy, NLTK, and WordCount
- The dataet contains labeled real and fake news articles from around the 2017 US Presidential Elections. The dataser was originally from [Kaggle](https://www.kaggle.com/datasets/subhajournal/fake-and-real-news-data) and is licensed under the [GNU AFFERO GERNERLA PUBLIC LICENSE](https://www.gnu.org/licenses/agpl-3.0.html)
  
-  You can also download the data from [here](https://drive.google.com/file/d/16Dqjymk1tdZWPxexs7lM7z1RTk405PAL/view).
  
- If you haven't already, be sure to install SpaCy and download the English language model 

In [1]:
# !pip install spacy
# from spacy.cli import download
# download('en_core_web_sm')

# `Imports:`

In [2]:
import re
import matplotlib.pyplot as plt
import spacy
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import numpy as np
import nltk


from wordcloud import WordCloud
from wordcloud import STOPWORDS
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk import ngrams
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# `Custom Functions & Preprocessing:`

In [3]:
def batch_preprocess_texts(
    texts,
    nlp=None,
    remove_stopwords=True,
    remove_punct=True,
    use_lemmas=False,
    disable=["ner"],
    batch_size=50,
    n_process=-1,
):
    """Efficiently preprocess a collection of texts using nlp.pipe()
    Args:
        texts (collection of strings): collection of texts to process (e.g. df['text'])
        nlp (spacy pipe), optional): Spacy nlp pipe. Defaults to None; if None, it creates a default 'en_core_web_sm' pipe.
        remove_stopwords (bool, optional): Controls stopword removal. Defaults to True.
        remove_punct (bool, optional): Controls punctuation removal. Defaults to True.
        use_lemmas (bool, optional): lemmatize tokens. Defaults to False.
        disable (list of strings, optional): named pipeline elements to disable. Defaults to ["ner"]: Used with nlp.pipe(disable=disable)
        batch_size (int, optional): Number of texts to process in a batch. Defaults to 50.
        n_process (int, optional): Number of CPU processors to use. Defaults to -1 (meaning all CPU cores).
    Returns:
        list of tokens
    """
    # from tqdm.notebook import tqdm
    from tqdm import tqdm
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    processed_texts = []
    for doc in tqdm(nlp.pipe(texts, disable=disable, batch_size=batch_size, n_process=n_process)):
        tokens = []
        for token in doc:
            # Check if should remove stopwords and if token is stopword
            if (remove_stopwords == True) and (token.is_stop == True):
                # Continue the loop with the next token
                continue
            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_punct == True):
                continue
            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_space == True):
                continue
            
            ## Determine final form of output list of tokens/lemmas
            if use_lemmas:
                tokens.append(token.lemma_.lower())
            else:
                tokens.append(token.text.lower())
        processed_texts.append(tokens)
    return processed_texts


In [None]:
def make_custom_nlp(
    disable=["ner"],
    contractions=["don't", "can't", "couldn't", "you'd", "I'll"],
    stopwords_to_add=[],
    stopwords_to_remove=[],
    spacy_model = "en_core_web_sm"
):
    """Returns a custom spacy nlp pipeline.
    
    Args:
        disable (list, optional): Names of pipe components to disable. Defaults to ["ner"].
        contractions (list, optional): List of contractions to add as special cases. Defaults to ["don't", "can't", "couldn't", "you'd", "I'll"].
        stopwords_to_add(list, optional): List of words to set as stopwords (word.is_stop=True)
        stopwords_to_remove(list, optional): List of words to remove from stopwords (word.is_stop=False)
        spacy_model(string, optional): String to select a spacy language model. (Defaults to "en_core_web_sm".)
                            Additional Options:  "en_core_web_md", "en_core_web_lg"; 
                            (Must first download the model by name in the terminal:
                            e.g.  "python -m spacy download en_core_web_lg" )
            
    Returns:
        nlp pipeline: spacy pipeline with special cases and updated nlp.Default.stopwords
    """
    # Load the English NLP model
    nlp = spacy.load(spacy_model, disable=disable)
    
    # Adding Special Cases 
    # Loop through the contractions list and add special cases
    for contraction in contractions:
        special_case = [{"ORTH": contraction}]
        nlp.tokenizer.add_special_case(contraction, special_case)
    
    # Adding stopwords
    for word in stopwords_to_add:
        # Set the is_stop attribute for the word in the vocab dict to true.
        nlp.vocab[
            word
        ].is_stop = True  # this determines spacy's treatmean of the word as a stop word
        # Add the word to the list of stopwords (for easily tracking stopwords)
        nlp.Defaults.stop_words.add(word)
    
    # Removing Stopwords
    for word in stopwords_to_remove:
        
        # Ensure the words are not recognized as stopwords
        nlp.vocab[word].is_stop = False
        nlp.Defaults.stop_words.discard(word)
        
    return nlp

In [4]:
def get_ngram_measures_finder(tokens, ngrams=2, measure='raw_freq', top_n=None, min_freq = 1,
                             words_colname='Words'):
    import nltk
    if ngrams == 4:
        MeasuresClass = nltk.collocations.QuadgramAssocMeasures
        FinderClass = nltk.collocations.QuadgramCollocationFinder
        
    elif ngrams == 3: 
        MeasuresClass = nltk.collocations.TrigramAssocMeasures
        FinderClass = nltk.collocations.TrigramCollocationFinder
    else:
        MeasuresClass = nltk.collocations.BigramAssocMeasures
        FinderClass = nltk.collocations.BigramCollocationFinder

    measures = MeasuresClass()
    
   
    finder = FinderClass.from_words(tokens)
    finder.apply_freq_filter(min_freq)
    if measure=='pmi':
        scored_ngrams = finder.score_ngrams(measures.pmi)
    else:
        measure='raw_freq'
        scored_ngrams = finder.score_ngrams(measures.raw_freq)

    df_ngrams = pd.DataFrame(scored_ngrams, columns=[words_colname, measure.replace("_",' ').title()])
    if top_n is not None:
        return df_ngrams.head(top_n)
    else:
        return df_ngrams


In [5]:
# Increase column width
pd.set_option('display.max_colwidth', 250)

# `Load Data:`

In [6]:
fpath = 'Data/Fake_Real_News_Data.csv'
df=pd.read_csv(fpath)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,"Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during a day-long tour of Washington, casting doubt on the need for the North Atlantic Treaty Organization and expressing skepticism about a muscular U.S. mi...",REAL
1,1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative reform is surprising police in some cities, as they find themselves included as part of the solution.\n\nSearching for a ""framework ... [to] heal,"" Baltimo...",REAL
2,2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,"Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on October 27, 2016 Subscribe \nArrogance is defined as “an insulting way of thinking or behaving that comes from believing that you are better, smarter, or m...",FAKE
3,3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,"Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International Golf Club in West Palm Beach, Fla. The billionaire mogul touted legalizing gambling with state Rep. Steve Crisafulli, speaker of the Florida House, an...",REAL
4,4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under consideration to lead the department.",REAL


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


## **1. Clean the Data**
- Remove any unnecessary columns and check for/ remove duplicates.

In [8]:
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,title,text,label
0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,"Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during a day-long tour of Washington, casting doubt on the need for the North Atlantic Treaty Organization and expressing skepticism about a muscular U.S. mi...",REAL
1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative reform is surprising police in some cities, as they find themselves included as part of the solution.\n\nSearching for a ""framework ... [to] heal,"" Baltimo...",REAL
2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,"Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on October 27, 2016 Subscribe \nArrogance is defined as “an insulting way of thinking or behaving that comes from believing that you are better, smarter, or m...",FAKE
3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,"Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International Golf Club in West Palm Beach, Fla. The billionaire mogul touted legalizing gambling with state Rep. Steve Crisafulli, speaker of the Florida House, an...",REAL
4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under consideration to lead the department.",REAL


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   6335 non-null   object
 1   text    6335 non-null   object
 2   label   6335 non-null   object
dtypes: object(3)
memory usage: 148.6+ KB


In [10]:
df.duplicated().sum()

29

In [11]:
df = df.drop_duplicates()
df.duplicated().sum()

0

## **2. Prepare the Data** - Create 3 new columns:
- Tokenized texts: `just split the texts, don't remove stop words or punctuation`- Lemmatized texts: `remove stopwords, and punctuation, and lemmatize the words`
    - `IMPORTANT!` When you load in the SpaCy NLP object, <u>remember</u> to disable the parser and named object recognizer using the following: `spacy.load('en_core_web_sm, disable = ['parser', 'ner')`
- Joined lemmatized data.a
    -Join each lemmatized document into a single string.g.

<u> Tolkenize Texts:

In [23]:
df['tokens'] = df['text'].map(lambda doc: doc.lower().split())
df.head()

Unnamed: 0,title,text,label,tokens
0,A whirlwind day in D.C. showcases Trump’s unorthodox views and shifting tone,"Donald Trump endorsed an unabashedly noninterventionist approach to world affairs Monday during a day-long tour of Washington, casting doubt on the need for the North Atlantic Treaty Organization and expressing skepticism about a muscular U.S. mi...",REAL,"[donald, trump, endorsed, an, unabashedly, noninterventionist, approach, to, world, affairs, monday, during, a, day-long, tour, of, washington,, casting, doubt, on, the, need, for, the, north, atlantic, treaty, organization, and, expressing, skep..."
1,"In Baltimore's call for federal police probe, a new search for answers (+video)","While some Justice Department investigations are adversarial, a new model of collaborative reform is surprising police in some cities, as they find themselves included as part of the solution.\n\nSearching for a ""framework ... [to] heal,"" Baltimo...",REAL,"[while, some, justice, department, investigations, are, adversarial,, a, new, model, of, collaborative, reform, is, surprising, police, in, some, cities,, as, they, find, themselves, included, as, part, of, the, solution., searching, for, a, ""fra..."
2,Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It,"Trump Proudly Declares: Most Of The People I’ve Insulted Deserved It By Andrew Bradford on October 27, 2016 Subscribe \nArrogance is defined as “an insulting way of thinking or behaving that comes from believing that you are better, smarter, or m...",FAKE,"[trump, proudly, declares:, most, of, the, people, i’ve, insulted, deserved, it, by, andrew, bradford, on, october, 27,, 2016, subscribe, arrogance, is, defined, as, “an, insulting, way, of, thinking, or, behaving, that, comes, from, believing, t..."
3,Inside the Trump-Bush melodrama: Decades of tension and discomfort,"Donald Trump spent a day in January 2014 hobnobbing with politicians at the Trump International Golf Club in West Palm Beach, Fla. The billionaire mogul touted legalizing gambling with state Rep. Steve Crisafulli, speaker of the Florida House, an...",REAL,"[donald, trump, spent, a, day, in, january, 2014, hobnobbing, with, politicians, at, the, trump, international, golf, club, in, west, palm, beach,, fla., the, billionaire, mogul, touted, legalizing, gambling, with, state, rep., steve, crisafulli,..."
4,Shutdown clash to return in force by December,"Notable names include Ray Washburne (Commerce), a Dallas-based investor, is reported to be under consideration to lead the department.",REAL,"[notable, names, include, ray, washburne, (commerce),, a, dallas-based, investor,, is, reported, to, be, under, consideration, to, lead, the, department.]"


<u>Lemmatized Texts:

In [24]:
nlp_lite = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp_lite

<spacy.lang.en.English at 0x1e876f3e650>

In [None]:
df['lemmatized_tokens'] = df['text'].map(lambda doc: batch_preprocess_texts(doc, disable=['parser', 'ner'], nlp=nlp_lite
                                                                        ,use_lemmas=True))
df.head()

8810it [00:43, 200.30it/s]
6812it [00:40, 168.31it/s]
2339it [00:34, 67.82it/s] 
11388it [01:00, 188.70it/s]
134it [00:32,  4.18it/s]
1109it [00:40, 27.34it/s] 
1102it [00:44, 25.01it/s] 
8400it [00:26, 314.13it/s] 
567it [00:16, 33.59it/s] 
3283it [00:19, 168.89it/s] 
769it [00:16, 46.31it/s] 
8498it [00:25, 336.09it/s] 
3456it [00:19, 174.74it/s] 
10786it [00:25, 422.64it/s] 
6537it [00:20, 311.56it/s] 
2240it [00:17, 128.30it/s] 
9669it [00:24, 402.47it/s] 
2016it [00:16, 121.35it/s] 
6733it [00:21, 310.28it/s] 
5423it [00:20, 270.10it/s] 
403it [00:15, 25.85it/s]
4702it [00:20, 232.12it/s] 
4249it [00:20, 210.00it/s] 
7762it [00:21, 356.99it/s] 
4699it [00:19, 236.41it/s] 
3442it [00:19, 179.58it/s] 
2649it [00:17, 150.48it/s] 
2332it [00:17, 130.15it/s] 
7822it [00:22, 353.28it/s] 
7482it [00:22, 333.63it/s] 
40351it [00:51, 777.46it/s] 
3841it [00:19, 199.80it/s] 
7201it [00:21, 328.80it/s] 
4464it [00:19, 233.36it/s] 
5778it [00:21, 270.56it/s] 
4728it [00:20, 228.00it/s] 
3520i

## **3. Analyze class balance and document lengths:**
- What is the class balance? How many real and fake articles are there?
- What is the average word count for real news articles? What about fake ones?
- `Hint`, you can map the len() function to the tolkenized text to create a new column, then find the average of that column.

## **4. Compare the word frequencies:**
- Create and plot the frequency distribution plots for the 20 most common words in real and fake news articles `(2 total plots)`
    - Use the lemmatized text.
- Create word clouds for each of the article types, real and fake `(2 total word clouds)`
    - Use the joined lemmatized text

In [None]:
dist = FreqDist(tokens)

### <u>Plot 1

In [None]:
# Plotting the Fake News
dist.plot(20, title='Fake News Data')

### <u>WordCloud 1

In [None]:
# Create an instance of a WordCloud and use the generate method
cloud = WordCloud(random_state = 123).generate(txt)
plt.imshow(cloud);
plt.axis('off');

<u>Stopwords:

In [None]:
from wordcloud import STOPWORDS
STOPWORDS

In [None]:
# define custom stopwords
custom_stopwords = ["said",'Trump','s', *STOPWORDS]

In [None]:
# Create an instance of a WordCloud and use the generate method
cloud = WordCloud(random_state=123, stopwords=custom_stopwords).generate(txt)
plt.imshow(cloud);
plt.axis('off');

<u>Colormap

In [None]:
# Use a different color map
cloud = WordCloud(
    random_state=123,
    stopwords = custom_stopwords,
    colormap="plasma"
).generate(txt)
plt.imshow(cloud)
plt.axis("off");

In [None]:
# Create an instance of a WordCloud and use the generate method
cloud = WordCloud(
    random_state=123,
    stopwords = custom_stopwords,
    background_color="white",
    colormap="plasma",
).generate(txt)
plt.figure(figsize = (10, 5))
plt.imshow(cloud)
plt.axis("off");

In [None]:
# Create an instance of a WordCloud and use the generate method
cloud = WordCloud(
    random_state=123,
    stopwords = custom_stopwords,
    background_color="white",
    width = 500,
    height = 500,
    max_words=200,
    colormap="plasma",
    min_word_length=2,
).generate(txt)
plt.figure(figsize = (10, 10))
plt.imshow(cloud)
plt.axis("off");

In [None]:
# Add a title
cloud = WordCloud(random_state=123, 
                  stopwords=stops, 
                  colormap = 'coolwarm',
                  min_word_length = 3,
                  width = 800,
                  height = 400
).generate(txt)
plt.figure(figsize = (10, 5))
plt.title('False News')
plt.imshow(cloud);
plt.axis('off');

### <u>Plot 2

In [None]:
# Plotting the Fake News
dist.plot(20, title='Real News Data')

### <u>WordCloud 2

<u>Stopwords:

<u>Colormap

In [None]:
# Determine the length of each tweet
df['length'] = df['text'].map(len)
df.head(10)