# MSADS509 Final Project Data Cleaning, Tokenizing, and Normalizing

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
from string import punctuation

from collections import Counter, defaultdict

import glob
from collections import Counter

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /Users/UE/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load data from desktop MSADS509_News_Project_Dataset folder

In [2]:
file_list = glob.glob('/Users/UE/Desktop/MSADS509_News_Project_Dataset/news_*.csv')

# Initialize an empty list to store DataFrames
df = []

# Iterate over each file, read it into a DataFrame, and append it to the list
for file in file_list:
    df.append(pd.read_csv(file))

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(df, ignore_index=True)
combined_df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,CNN — Russia is trying to develop a nuclear sp...
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,CNN — The Georgia election subversion case aga...
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,Washington CNN — The Norfolk Southern train de...
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,CNN — The House Ethics Committee investigating...
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,CNN — Judge Arthur Engoron hit Donald Trump wi...
...,...,...,...
348,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
349,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
350,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
351,foxnews,https://www.foxnews.com/politics/trumps-nato-c...,close Video The media doesn’t allow the public...


In [3]:
# Remove duplicates

df = combined_df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,CNN — Russia is trying to develop a nuclear sp...
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,CNN — The Georgia election subversion case aga...
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,Washington CNN — The Norfolk Southern train de...
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,CNN — The House Ethics Committee investigating...
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,CNN — Judge Arthur Engoron hit Donald Trump wi...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/senate-foreig...,close Video Fate of foreign aid bill remains u...
236,foxnews,https://www.foxnews.com/politics/white-house-s...,close Video President Biden on release of clas...
237,foxnews,https://www.foxnews.com/politics/house-gop-tes...,close Video Jean-Pierre defends Biden’s mental...
238,foxnews,https://www.foxnews.com/politics/biden-garners...,close Video It's very awkward to watch Biden t...


# Data Cleaning, Tokenizing, and Normalizing

In [4]:
# Function to remove prefixes

def remove_prefix(row):
    if row['source'] == 'cnn' and row['content'].startswith('CNN — '):
        return row['content'][6:]  
    elif row['source'] == 'cnn' and row['content'].startswith('(CNN) — '):
        return row['content'][8:]  
    elif row['source'] == 'cnn' and row['content'].startswith('Washington CNN — '):
        return row['content'][17:]  
    elif row['source'] == 'cnn' and row['content'].startswith('New York CNN — '):
        return row['content'][15:]
    elif row['source'] == 'cnn' and row['content'].startswith('Plainview, New York CNN — '):
        return row['content'][26:]
    elif row['source'] == 'foxnews' and row['content'].startswith('close Video '):
        return row['content'][12:]  
    if row['source'] == 'foxnews' and row['content'].startswith('What\'s Happening?'):
        return row['content'][17:]  
    else:
        return row['content']

df['content'] = df.apply(remove_prefix, axis=1)

df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,Russia is trying to develop a nuclear space we...
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,The Georgia election subversion case against D...
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,The Norfolk Southern train derailment that sen...
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,The House Ethics Committee investigating Rep. ...
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,Judge Arthur Engoron hit Donald Trump with his...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/senate-foreig...,Fate of foreign aid bill remains uncertain aft...
236,foxnews,https://www.foxnews.com/politics/white-house-s...,President Biden on release of classified docs ...
237,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden’s mental fitness and...
238,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...


## Removing Unwanted Sentences

In [5]:
def remove_first_sentence(row):
    # Split the content into sentences based on '.', '?', and '!'
    sentences = re.split(r'(?<=[.!?]) +', row['content'])

    updated_content = row['content']
    
    if len(sentences) > 1:  
        first_sentence = sentences[0] 
      
        if 'Welcome to Fox News' in first_sentence:
            updated_content = ' '.join(sentences[2:])          
        elif 'A version of this story appeared' in first_sentence:
            updated_content = ' '.join(sentences[2:])
          
    return updated_content

def remove_first_sentence2(row):
    # Split the content into sentences based on '.', '?', and '!'
    sentences = re.split(r'(?<=[.!?]) +', row['content'])
    
    updated_content = row['content']
    
    if len(sentences) > 1: 
        first_sentence = sentences[0]  
            
    # Remove sentences containing the phrase
    updated_sentences = [sentence for sentence in sentences if 'FOX NEWS APP' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'Foxnews.com' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'Getty Images' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'CLICK HERE TO GET THE FOX NEWS APP' not in sentence]
    
    # Join the updated sentences back into content
    updated_content = ' '.join(updated_sentences)
    
    return updated_content


df['content'] = df.apply(remove_first_sentence, axis=1)
df['content'] = df.apply(remove_first_sentence2, axis=1)
df['content'] = df.apply(remove_prefix, axis=1)

df


Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,Russia is trying to develop a nuclear space we...
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,The Georgia election subversion case against D...
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,The Norfolk Southern train derailment that sen...
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,The House Ethics Committee investigating Rep. ...
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,Judge Arthur Engoron hit Donald Trump with his...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/senate-foreig...,Fate of foreign aid bill remains uncertain aft...
236,foxnews,https://www.foxnews.com/politics/white-house-s...,President Biden on release of classified docs ...
237,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden’s mental fitness and...
238,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...


In [6]:
# Remove image info

# Define a regular expression pattern to match content inside parentheses
pattern = r'\s*\([^)]*\)'

# Replace content inside parentheses with an empty string
df['content'] = df['content'].str.replace(pattern, '', regex=True)

In [7]:
def remove_last_sentence(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        last_sentence = sentences[-1]  # Get the last sentence
        
        if ('This story has been updated with additional information.' in last_sentence or
            'contributed to this' in last_sentence or
            'will be updated' in last_sentence or
            'have been updated' in last_sentence or
            'APP Fox News' in last_sentence or
            'Fox News' in last_sentence or
            'FoxNews.com' in last_sentence or
            '@Fox.com' in last_sentence or
            'Fox News Digital' in last_sentence or
            'Fox News Channel and FOX Business' in last_sentence or

            'This story has been updated with additional reaction' in last_sentence or
            'This report has been updated with additional information' in last_sentence or
            'who covers politics' in last_sentence or
            'follow him on' in last_sentence or
            'Follow him on' in last_sentence or
            '@fox.com' in last_sentence or
            '@Fox.com' in last_sentence or
            'FoxNews.com' in last_sentence or
            'Fox News Digital' in last_sentence or
            'contributed to this' in last_sentence or
            'Fox News Politics newsletter' in last_sentence or
            'Fox News Digital' in last_sentence or
            'email' in last_sentence):
                
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last one
            return updated_content
    
    return row['content']

df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)

df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,Russia is trying to develop a nuclear space we...
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,The Georgia election subversion case against D...
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,The Norfolk Southern train derailment that sen...
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,The House Ethics Committee investigating Rep. ...
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,Judge Arthur Engoron hit Donald Trump with his...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/senate-foreig...,Fate of foreign aid bill remains uncertain aft...
236,foxnews,https://www.foxnews.com/politics/white-house-s...,President Biden on release of classified docs ...
237,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden’s mental fitness and...
238,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...


## Standardizing Entity Names

In [8]:
# Combine specified word pairs

df['content'] = df['content'].str.replace(r'\bHunter\s+Biden\b', 'HunterBiden', regex=True)
df['content'] = df['content'].str.replace(r'\bHUNTER\s+Biden\b', 'HunterBiden', regex=True)
df['content'] = df['content'].str.replace(r'\bSouth\s+Carolina\b', 'SouthCarolina', regex=True)
df['content'] = df['content'].str.replace(r'\bSupreme\s+Court\b', 'SupremeCourt', regex=True)
df['content'] = df['content'].str.replace(r'\bsupreme\s+court\b', 'SupremeCourt', regex=True)
df['content'] = df['content'].str.replace(r'\bCourt\s+House\b', 'CourtHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bcourt\s+house\b', 'CourtHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bHouse\s+Representative\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bhouse\s+representative\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bHouse\s+Rep\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bhouse\s+rep\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bvoters\b', 'voter', regex=True)
df['content'] = df['content'].str.replace(r'\bvotes\b', 'vote', regex=True)
df['content'] = df['content'].str.replace(r'\bdemocratic(?:s)?\b', 'Democrat', case=False, regex=True)
df['content'] = df['content'].str.replace(r'\bDemocrats\b', 'Democrat', regex=True)
df['content'] = df['content'].str.replace(r'\brepublicans\b', 'Republican', regex=True)
df['content'] = df['content'].str.replace(r'\bRepublicans\b', 'Republican', regex=True)
df['content'] = df['content'].str.replace(r'\bwhite\s+house\b', 'WhiteHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bWhite\s+house\b', 'WhiteHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bNew\s+York\b', 'NewYork', regex=True)

In [9]:
# Define variations of Biden's name
biden_variations = df['content'].str.findall(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s|s)?\b|'  
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b'                  
)
# Flatten the list of variations
biden_variations = [item for sublist in biden_variations for item in sublist]

# Count occurrences of each variation
biden_variation_counts = Counter(biden_variations)

# Replace variations of Biden's name with 'Biden' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s|s)?\b|'  
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b'    
    , 'Biden', regex=True)

print("Occurrences of different variations of Biden's name:")
for variation, count in biden_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Biden's name:
President Joe Biden: 116
Biden: 861
Biden’s: 156
President Biden: 159
Bidens: 23
Joe Biden: 137
Joe Biden’s: 13
BIDEN: 105
President Joe Biden: 1


In [10]:
# Count occurrences of 'Biden' after replacement
biden_count_after = df['content'].str.count('Biden').sum()

print("Occurrences of Biden after replacement:", biden_count_after)

Occurrences of Biden after replacement: 1709


In [11]:
# Find all variations of Trump's name in the content column
trump_variations = df['content'].str.findall(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'            
)

# Flatten the list of variations
trump_variations = [item for sublist in trump_variations for item in sublist]

# Count occurrences of each variation
trump_variation_counts = Counter(trump_variations)

# Replace variations of Trump's name with 'Trump' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'               
    , 'Trump', regex=True) 

print("Occurrences of different variations of Trump's name:")
for variation, count in trump_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Trump's name:
Donald Trump: 90
President Donald Trump: 78
Trump’s: 380
Trump: 1259
Former President Donald Trump: 32
Donald Trump’s: 15
President Trump: 81
President Trump: 1
TRUMP: 35
TRUMP’S: 2
Donald Trump: 1


In [12]:
# Count occurrences of 'Trump' after replacement
trump_count_after = df['content'].str.count('Trump').sum()

print("Occurrences of Trump after replacement:", trump_count_after)

Occurrences of Trump after replacement: 1977


## Data Preprocessing Pipeline

In [13]:

punctuation = set(punctuation) # speeds up comparison
sw = stopwords.words("english")
extra_sw = ['cnn', 'fox', 'news', 'said', '–', '-', '--', '—','told', 'would', '…read', 'get', 'could', 
            'also', "it’s", 'think', 'time', 'even', 'former', 'party', 'i', '“i', 'she’s', 'says', 
            'images', 'getty', 'im', 'this', 'we', 'it', 'digital', 'the', 'that', 'story', 'doesn']
sw.extend(extra_sw)
whitespace_pattern = re.compile(r"\s+")

def remove_stop(tokens) :
    
    return [t for t in tokens if t.lower() not in sw]

def remove_punctuation(text, punct_set=punctuation) : 
    
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    
    return re.split(whitespace_pattern, text)

def prepare(text, pipeline) : 
    
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]

In [14]:
# Tokenize and preprocess each row
df['tokens'] = df['content'].apply(lambda x: prepare(x, pipeline=pipeline))

df.head()

Unnamed: 0,source,url,content,tokens
0,cnn,https://www.cnn.com/2024/02/16/politics/russia...,Russia is trying to develop a nuclear space we...,"[russia, trying, develop, nuclear, space, weap..."
1,cnn,https://www.cnn.com/2024/02/15/politics/takeaw...,The Georgia election subversion case against T...,"[georgia, election, subversion, case, trump, 1..."
2,cnn,https://www.cnn.com/2024/02/16/politics/biden-...,The Norfolk Southern train derailment that sen...,"[norfolk, southern, train, derailment, sent, t..."
3,cnn,https://www.cnn.com/2024/02/16/politics/gaetz-...,The House Ethics Committee investigating Rep. ...,"[house, ethics, committee, investigating, rep,..."
4,cnn,https://www.cnn.com/2024/02/16/politics/takeaw...,Judge Arthur Engoron hit Trump with his bigges...,"[judge, arthur, engoron, hit, trump, biggest, ..."


# Basic Descriptive Statistics

In [15]:
def descriptive_stats(tokens, num_tokens = 50, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))  
    lexical_diversity = num_unique_tokens / num_tokens
    num_characters = sum(len(s) for s in tokens)
    
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")        
        print (f"\nThe ten most common words are:\n")
        print(Counter(tokens).most_common(10))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

In [16]:
# calls to descriptive_stats here

print("CNN News Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'cnn']['tokens']for token in tokens])

print('\n')
print("FoxNews Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'foxnews']['tokens']for token in tokens])

CNN News Stats

There are 76925 tokens in the data.
There are 11339 unique tokens in the data.
There are 507287 characters in the data.
The lexical diversity is 0.147 in the data.

The ten most common words are:

[('trump', 1527), ('biden', 741), ('republican', 560), ('house', 459), ('us', 411), ('president', 402), ('election', 396), ('democrat', 360), ('case', 284), ('campaign', 254)]


FoxNews Stats

There are 40181 tokens in the data.
There are 7106 unique tokens in the data.
There are 261674 characters in the data.
The lexical diversity is 0.177 in the data.

The ten most common words are:

[('biden', 679), ('house', 403), ('trump', 330), ('republican', 245), ('president', 224), ('democrat', 193), ('us', 187), ('senate', 167), ('security', 155), ('special', 153)]


[40181, 7106, 0.17684975485926185, 261674]

In [17]:
# save df for next step

df.to_csv('/Users/UE/Desktop/MSADS509_News_Project_Dataset/cleaned.csv', index=False)