In [1]:
import glob
import os
import pandas as pd 
import re
from pandas import Series, DataFrame
from textblob import TextBlob, Word
import nltk
import re
nltk.download('stopwords') 
from nltk.corpus import stopwords
from nltk.util import ngrams
import string
from nltk.probability import FreqDist
import gensim
from gensim import models # For TF-IDF, LDA
import swifter # Makes applying to datframe as fast as vectorizing
import numpy as np
import collections
import csv

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# LDA Visualization
import pyLDAvis
import pyLDAvis.gensim 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/schandrasekharan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# ------------------- FUNCTIONS ---------------------------------------------------------

# Pre-procesing function

stop_words = stopwords.words('english')

stop_words.extend(['chars', 'char']) # Add from blacklist

stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'])

# stop_words.extend(['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
# 

stop_words.extend(['get', 'say', 'gmt', 'dont', 'make', 'want', 'also', 
                   'take', 'since', 'tell', 'like', 'could', 'would', 
                   'should', 'jsfjsdgetelementsbytagnames0p', 'functiondsidvar']) # Adding from LDA topics 

def preprocess_text(doc):
    
    """
    Pre-processing using TextBlob: 
    tokenizing, converting to lower-case, and lemmatization based on POS tagging, 
    removing stop-words, and retaining tokens greater than length 2
    
    We also include n_grams (n = 1,2,3,4) in the final output
    
    Argument(s): 'doc' - a string of words or sentences.
    
    Output: 'reuslt' - a list of pre-processed tokens of each sentence in 'doc'
    
    """
    
    blob = TextBlob(doc).lower() 
    
    result = []
    tag_dict = {"J": 'a', # Adjective
                "N": 'n', # Noun
                "V": 'v', # Verb
                "R": 'r'} #  Adverb
    
    # For all other types of parts of speech (including those not classified at all) 
    # the tag_dict object maps to 'None'
    # the method w.lemmatize() defaults to 'Noun' as POS for those classified as 'None'
    
    
    bigrams = blob.ngrams(n = 2)
    trigrams = blob.ngrams(n = 3)
    fourgrams = blob.ngrams(n = 4)

    for sent in blob.sentences:

        words_and_tags = [(w, tag_dict.get(pos[0])) for w, pos in sent.tags]
        lemmatized_list = [w.lemmatize(tag) for w, tag in words_and_tags]

        for token in lemmatized_list:
            if token not in stop_words and len(token.lower()) > 2:
                result.append(token.lower())

                
    return result + ['_'.join(i) for i in bigrams] + ['_'.join(i) for i in trigrams] + ['_'.join(i) for i in fourgrams]

In [3]:
doc = "Natural Language Processing (NLP) is an area of growing attention due to increasing number of applications like chatbots, machine translation etc. In some ways, the entire revolution of intelligent machines in based on the ability to understand and interact with humans. I have been exploring NLP for some time now.  My journey started with NLTK library in Python, which was the recommended library to get started at that time. NLTK is a perfect library for education and research, it becomes very heavy and tedious for completing even the simple tasks."

# preprocess_text(doc)


In [4]:
# Function for word frequencies

def get_frequency(processed_text_list): 

    """
    Using a built-in NLTK function that generates tuples
    We get the frequency distribution of all words/n-grams in a tokenized list
    We also sort these frequencies in descending order in a dictionary object.
    
    Argument(s): 'processed_text_list' - A list of pre-processed tokens
    
    Output: sorted_counts - A dictionary of tokens and their respective counts in descending order
    """

    word_frequency = FreqDist(word for word in processed_text_list)

    sorted_counts = sorted(word_frequency.items(), key = lambda x: x[1], reverse = True)

    return dict(sorted_counts)


In [5]:
# get_frequency(preprocess_text(doc))

In [6]:
def merge_vocab_dictionary(vocab_column, name = 'test_file'):
    
    """
    Takes any number of vocabulary frequency dictionaries 
    (here, all from 1 column) and merges them while summing 
    the respective frequencies and saves this merged dictionary
    to a text file
    
    
    Argument(s): vocab_column - A column of dictionary objects
                 name (string object) - the name to be given to the text file
    Output(s): a list object containing all the frequency dictionaries
               a saved text file containing all the dictionary elements
               a saved CSV file containing all the dictionary elements
    
    """
    
    merged_dict = {}
    for dictionary in vocab_column:
        for key, value in dictionary.items():  # d.items() in Python 3+
#             merged_dict[key].append(value)
            merged_dict.setdefault(key, []).append(value)


    for key, value in merged_dict.items():
        merged_dict[key] = sum(value)
    
    
#     name = 'Apr_01_2019_window_1' 
    save_name_txt = name + '.txt'
    save_name_csv = name + '.csv'
    
    with open(save_name_txt, 'w+', encoding = "utf-8") as file1:
        for key, value in merged_dict.items():
            file1.write("%s\n" % f"{key}: {value}")
            

    with open(save_name_csv, 'w+') as file2:
        file2.write("%s,%s\n"%('Token', 'Frequency'))
        for key, value in merged_dict.items():
            file2.write("%s,%s\n"%(key, value)) # ----> No headers
       
#     with open(save_name_csv, mode = 'w', newline = '') as file2:
#         writer = csv.DictWriter(save_name_csv, fieldnames = ['Token', 'Frequency'])
#         writer.writeheader()
#         for key, value in merged_dict.items():
#             writer.writerow({key: value}) # ---> Throws an error.
     
    return merged_dict
    

In [8]:
# test_thing = merge_vocab_dictionary(data_text['short_total_frequencies'])

In [9]:
# test_thing

In [25]:
def blacklist(processed_tokens_list):
    
    """
    Outputs the tokens that have been filtered out using the gensim dictionary
    
    Argument(s): 'processed_tokens_list' - a list of pre-processed tokens
    Output: 'blacklist' - a list of blacklisted tokens from a given list
    """
    
    blacklist = []
    for token in processed_tokens_list:
        if token not in short_text_dictionary.token2id.keys():
            blacklist.append(token)
        
    return blacklist


def retained(processed_tokens_list):
    
    """
    Outputs the tokens that have been retained after filtration using the gensim dictionary
    
    Argument(s): 'processed_tokens_list' - a list of pre-processed tokens
    Output: 'retained' - a list of retained tokens from a given list
    """
    
    retained = []
    for token in processed_tokens_list:
        if token in short_text_dictionary.token2id.keys():
            retained.append(token)
        
    return retained

In [None]:
# directory = "C:/Users/Shreya/Desktop/Threat_detective/all un news/"

# for files in glob.glob(directory + '*.csv'):

#     print(files[53:]) #23

In [None]:
directory = "/Users/schandrasekharan/Desktop/Shreya_Personal/"
print(len(directory))

In [None]:
import os
cwd = os.getcwd()
cwd

In [None]:
save_directory = '/Users/schandrasekharan/Desktop/Shreya_Personal/'
os.chdir(save_directory)
cwd = os.getcwd()
cwd

In [11]:
# # ------------------- PRE-PROCESS ARTICLES ---------------------------------------------------------

print("STEP 1")

data_all = pd.read_csv("/Users/schandrasekharan/Desktop/Shreya_Personal/AllContent2019-04-01.csv")

data_text = data_all.copy()

data_text['old_index'] = data_text.index

print("Total number of short and long articles is: ", len(data_text))

data_text = data_text.dropna(subset = ["content", "full-content"]) # Not all articles have any content available
print("Total number of short and long articles after dropping blank ones: ", len(data_text))

data_text = data_text.drop_duplicates(subset=["title", "description"], keep = 'last') # ---> Read below

"""
NOTE: The argument 'keep' changes the type of articles retained

keep = 'last' ---> Ensures that the latest article is retained 
       (there might be changes in the content over time - as indicated by higher 
       retention of aritcle when 'url' is included in the subset.)
       However, this leads to more AP sources
       
keep = 'first' ---> Retains original source more often.

# We have many repeating articles
# Dropping duplicates based on article title and description
# Including "source_id" as a duplicate subset leads to fewer drops --> same article, different sources
# Keeping least recent article (based on 'publishedAt')

"""

print("Total number of unique short and long articles is: ", len(data_text))

short_remove_special_characters = re.compile(r'([^\w\s-]|_)+')
data_text[['content', 'full-content']].replace(to_replace= short_remove_special_characters, value=' ', regex=True, inplace=True)

print("STEP 1 COMPLETE")


# For LONG (full) articles 

print("STEP 2")

# Remove hyperlinks from content
long_link_remove = re.compile(r'http\S+')                             
long_remove_let_your_friends_know = re.compile(r'Let friends in your social network know what you are reading about .*? Please read the rules before joining the discussion.')
long_remove_last_for_more_coverage_1 = re.compile(r'___ For more .*? This material may not be published, broadcast, rewritten or redistributed.')
long_remove_last_for_more_coverage_2 = re.compile(r'___ For more .*? by Automated Insights,  using data from STATS LLC, ')
long_remove_last_for_more_coverage_3 = re.compile(r'For more AP.*? by Automated Insights,  using data from STATS LLC, ')

data_text['full-content'].replace(to_replace= [long_link_remove, long_remove_let_your_friends_know, long_remove_last_for_more_coverage_1, long_remove_last_for_more_coverage_2, long_remove_last_for_more_coverage_3], value='', regex=True, inplace=True)


print("STEP 2 COMPLETE")
# data_text.head()


# We will apply preprocessing to the whole dataframe 

# First Short

print("STEP 3")

data_text['short_processed_text'] = data_text['content'].swifter.apply(preprocess_text)
# Then Long
data_text['long_processed_text'] = data_text['full-content'].swifter.apply(preprocess_text)

print("STEP 3 COMPLETE")

STEP 1
Total number of short and long articles is:  4872
Total number of short and long articles after dropping blank ones:  3356
Total number of unique short and long articles is:  1498


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


STEP 1 COMPLETE
STEP 2
STEP 2 COMPLETE
STEP 3


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…


STEP 3 COMPLETE


In [12]:

print("STEP 4")

data_text['short_all_frequencies'] = data_text['short_processed_text'].swifter.apply(get_frequency)
data_text['long_all_frequencies'] = data_text['long_processed_text'].swifter.apply(get_frequency)
    
print("STEP 4 COMPLETE")
    

STEP 4


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…


STEP 4 COMPLETE


In [13]:
print("STEP 5")

short_full_vocab = merge_vocab_dictionary(data_text['short_all_frequencies'], name = 'short_Apr_01')
long_full_vocab = merge_vocab_dictionary(data_text['long_all_frequencies'], name = 'long_Apr_01')

print("STEP 5 COMPLETE")

STEP 5
STEP 5 COMPLETE


In [None]:
"""
UP TO THIS POINT IT IS GENERIC
WE CAN DO THIS FOR EACH FULL_NEWS FILE AND MERGE 
"""

In [15]:
# Gensim's in-built dictionary

short_text_dictionary = gensim.corpora.Dictionary(data_text.short_processed_text)
long_text_dictionary = gensim.corpora.Dictionary(data_text.long_processed_text)

"""
gensim has its own high and low pass filters as shown below.
"""

print("STEP 6 ")

# Include words in dictionary that appear greater than 5 times - Low pass
# but less than 0.4 proportion of the frequency of all the words in all of the articles - High pass
print("Total length of short content dictionary before filtering is: ", len(short_text_dictionary))
short_text_dictionary.filter_extremes(no_below = 5, no_above=0.4) 
print("Total length of short content dictionary after filtering is: ", len(short_text_dictionary))
print("Total length of long content dictionary before filtering is: ", len(long_text_dictionary))
long_text_dictionary.filter_extremes(no_below = 5, no_above=0.4) 
print("Total length of long content dictionary after filtering is: ", len(long_text_dictionary))

print("STEP 6 COMPLETE")

STEP 6 
Total length of short content dictionary before filtering is:  160184
Total length of short content dictionary after filtering is:  3138
Total length of long content dictionary before filtering is:  1589386
Total length of long content dictionary after filtering is:  44609
STEP 6 COMPLETE


In [22]:
data_text['short_blacklist_vocab'] = data_text['short_processed_text'].swifter.apply(blacklist)
data_text['long_blacklist_vocab'] = data_text['long_processed_text'].swifter.apply(blacklist)
data_text['short_retained_vocab'] = data_text['short_processed_text'].swifter.apply(retained)
data_text['long_retained_vocab'] = data_text['long_processed_text'].swifter.apply(retained)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




In [24]:
data_text['short_blacklist_frequencies'] = data_text['short_blacklist_vocab'].swifter.apply(get_frequency)
data_text['long_blacklist_frequencies'] = data_text['long_blacklist_vocab'].swifter.apply(get_frequency)
data_text['short_retained_frequencies'] = data_text['short_retained_vocab'].swifter.apply(get_frequency)
data_text['long_retained_frequencies'] = data_text['long_retained_vocab'].swifter.apply(get_frequency)
short_blacklist_vocab = merge_vocab_dictionary(data_text['short_blacklist_frequencies'], name = 'short_Apr_01_blacklist')
long_blacklist_vocab = merge_vocab_dictionary(data_text['long_blacklist_frequencies'], name = 'long_Apr_01_blacklist')
short_retained_vocab = merge_vocab_dictionary(data_text['short_retained_frequencies'], name = 'short_Apr_01_retained')
long_retained_vocab = merge_vocab_dictionary(data_text['long_retained_frequencies'], name = 'long_Apr_01_retained')

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1498, style=ProgressStyle(description_widt…




In [None]:
# TF-IDF depending on the time and filtering window

# long_main_corpus = [long_text_dictionary.doc2bow(doc) for doc in long_processed_docs]


# # TF-IDF on the bag of words corpus

# long_tfidf = models.TfidfModel(long_main_corpus)
# long_tfidf_main_corpus = tfidf[long_main_corpus]

In [None]:
##     --------------------------------------- SAVING PRE-PROCESSED TEXT TO FILE --------------------



# print("STEP 17")

print("STEP 18")

data_url = data_text['url']
data_source = data_text['source_name']
data_publish_time = data_text['publishedAt']
data_title = data_text['title']
data_description = data_text['description']
data_short_content = data_text['content']
data_full_content = data_text['full-content']
data_short_all_tokens = data_textx['short_processed_text']
data_long_all_tokens = data_text['long_processed_text']
data_short_all_frequencies = data_text['short_all_frequencies']
data_long_all_frequencies = data_text['short_all_frequencies']
data_short_blacklist_frequencies = data_text['short_blacklist_frequencies']
data_long_blacklist_frequencies = data_text['short_blacklist_frequencies']
data_short_retained_frequencies = data_text['short_all_frequencies']
data_long_retained_frequencies = data_text['short_retained_frequencies']


# """
# Use the script below to make changes to the CSV file and save as a different CSV file 
# """

# #     save_directory = "/home/shreyac/cleaned_news/"
# save_directory = '/Users/schandrasekharan/Desktop/Shreya_Personal'

# news_url = []
# news_source = []
# news_publish_time = []
# news_title = []
# news_description = []
# news_short_content = []
# news_full_content = []
# short_processed_tokens = []
# long_processed_tokens = []


# for i in range(0, len(data_text_index)):

#     news_url.append(data_url[i])
#     news_source.append(data_source[i])
#     news_publish_time.append(data_publish_time[i])
#     news_title.append(data_title[i])
#     news_description.append(data_description[i])
#     news_short_content.append(data_short_content[i])
#     news_full_content.append(data_full_content[i])
#     short_processed_tokens.append(data_short_tokens[i])
#     long_processed_tokens.append(data_long_tokens[i])


# news_file_df = DataFrame({'url': news_url,
#                 'source': news_source,
#                 'published_at': news_publish_time,
#                 'title': news_title,
#                 'description': news_description,
#                 'short_content': news_short_content,
#                 'full_content': news_full_content,
#                 'short_processed_tokens': short_processed_tokens,
#                 'long_processed_tokens': long_processed_tokens})

# news_file_df = news_file_df[['url', 'source', 'published_at', 'title',
#                                  'description', 'short_content', 'full_content',
#                              'short_processed_tokens', 'long_processed_tokens']]

# save_path = save_directory + name + '.csv'

# news_file_df.to_csv(save_path, index = None, header=True, encoding='utf-8')


# print("STEP 19")
# print(" ")
# print("Loop complete")
# print("Next")
# print(" ")
# print(" ")


