In [1]:
# %pip install pandarallel
# %pip install bs4
# %pip install ipywidgets
# %pip install nltk

## NLP Class Final Project Read Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
import bs4
import nltk.corpus  
from nltk.text import Text
from pandarallel import pandarallel
import multiprocessing
import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
widgets.IntSlider()

pd.set_option('max_colwidth', 200)

In [2]:
from IPython.display import clear_output
import nltk
nltk.download('popular', halt_on_error=False)
nltk.download('all', halt_on_error=False)
clear_output(wait=False)

### Get the workers initialized for multiprocessing

In [3]:
num_cpu = multiprocessing.cpu_count()

workers = num_cpu - 2

pandarallel.initialize(nb_workers = workers, progress_bar = True, use_memory_fs=False)

INFO: Pandarallel will run on 62 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
%%time

df_news_final_project = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
df_news_final_project.shape

CPU times: user 7.67 s, sys: 9.06 s, total: 16.7 s
Wall time: 20.4 s


(200332, 5)

In [5]:
df_news_final_project.head(3)

Unnamed: 0,url,date,language,title,text
0,http://en.people.cn/n3/2021/0318/c90000-9830122.html,2021-03-18,en,Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online,\n\nArtificial intelligence improves parking efficiency in Chinese cities - People's Daily Online\n\nHome\nChina Politics\nForeign Affairs\nOpinions\nVideo: We Are China\nBusiness\nMilitary\nWorld...
1,http://newsparliament.com/2020/02/27/children-with-autism-saw-their-learning-and-social-skills-boosted-after-playing-with-this-ai-robot/,2020-02-27,en,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament,"\nChildren With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament\n \n\nSkip to content\n\t\t\tThursday, February 27, 2020\t\t\n\nLatest:\n\n\n..."
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek","\n\nForget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek\nHome\nAbout us\nBack issues / E-book / PDF\nEMP Handbook\nSubscribe\..."


## Cleaning the Data

### Cleaning non textual columns

In [6]:
#Discard the URLs for the analysis
df = df_news_final_project.drop('url', axis=1)

#Get the dates in order, have columns for dates, years, months, quarters, days and day of the week
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['dayofwk'] = df['date'].dt.dayofweek
df['day'] = df['date'].dt.day
df['quarter'] = pd.PeriodIndex(df['date'], freq = "Q")

#Ensuring there will only be English articles
df = df[df['language']=='en']

#get the order of columns straightened out
cols = ['date','month','year','quarter','day','dayofwk','language','title','text']
df = df[cols]

df.head(3)

Unnamed: 0,date,month,year,quarter,day,dayofwk,language,title,text
0,2021-03-18,3,2021,2021Q1,18,3,en,Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online,\n\nArtificial intelligence improves parking efficiency in Chinese cities - People's Daily Online\n\nHome\nChina Politics\nForeign Affairs\nOpinions\nVideo: We Are China\nBusiness\nMilitary\nWorld...
1,2020-02-27,2,2020,2020Q1,27,3,en,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament,"\nChildren With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament\n \n\nSkip to content\n\t\t\tThursday, February 27, 2020\t\t\n\nLatest:\n\n\n..."
2,2021-03-26,3,2021,2021Q1,26,4,en,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek","\n\nForget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek\nHome\nAbout us\nBack issues / E-book / PDF\nEMP Handbook\nSubscribe\..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200332 entries, 0 to 200331
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      200332 non-null  datetime64[ns]
 1   month     200332 non-null  int64         
 2   year      200332 non-null  int64         
 3   quarter   200332 non-null  period[Q-DEC] 
 4   day       200332 non-null  int64         
 5   dayofwk   200332 non-null  int64         
 6   language  200332 non-null  object        
 7   title     200332 non-null  object        
 8   text      200332 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(3), period[Q-DEC](1)
memory usage: 15.3+ MB


### Clean the text and title columns

In [8]:
def text_clean(text):
    """Cleans text from news articles.

      Args:
        text: The text to be cleaned.

      Returns:
        The cleaned text.
      """
    #Remove the symbols, new line characters, tab characters
    clean_text = re.sub(r'[.,]',' ',text)
    clean_text = re.sub(r'\n+|\t+|\||^\s+',' ', clean_text)
    #Remove the HTML remnant tags if any
    clean_text = bs4.BeautifulSoup(clean_text, "html.parser").get_text()
    #Remove non-ASCII characters
    clean_text = clean_text.encode("ascii","ignore").decode("ascii")
    #Remove punctuation
    clean_text = re.sub(r'[^a-zA-Z0-9 . , @ - _]',' ',clean_text)
    #Remove stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    clean_text = " ".join([word for word in clean_text.split() if word not in stopwords])
    #Remove single character words non printable characters
    clean_text = " ".join([word for word in clean_text.split() if len(word) >1])
    clean_text = " ".join([word for word in clean_text.split() if word.isprintable()])
    #Replace multiple spaces to single space
    clean_text = re.sub(r'\s+',' ',clean_text)
    
    return clean_text

In [9]:
%%time
df['title_clean'] = df['title'].parallel_apply(lambda x: text_clean(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3232), Label(value='0 / 3232'))), …

CPU times: user 1.11 s, sys: 4.45 s, total: 5.57 s
Wall time: 5.9 s


In [10]:
%%time
df['text_clean'] = df['text'].parallel_apply(lambda x: text_clean(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3232), Label(value='0 / 3232'))), …

CPU times: user 9.85 s, sys: 14.9 s, total: 24.8 s
Wall time: 24.3 s


In [12]:
df[['title','title_clean','text','text_clean']].head(3)

Unnamed: 0,title,title_clean,text,text_clean
0,Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online,Artificial intelligence improves parking efficiency Chinese cities People Daily Online,\n\nArtificial intelligence improves parking efficiency in Chinese cities - People's Daily Online\n\nHome\nChina Politics\nForeign Affairs\nOpinions\nVideo: We Are China\nBusiness\nMilitary\nWorld...,Artificial intelligence improves parking efficiency Chinese cities People Daily Online Home China Politics Foreign Affairs Opinions Video We Are China Business Military World Society Culture Trave...
1,Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament,Children With Autism Saw Their Learning Social Skills Boosted After Playing With This AI Robot News Parliament,"\nChildren With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Robot – News Parliament\n \n\nSkip to content\n\t\t\tThursday, February 27, 2020\t\t\n\nLatest:\n\n\n...","Children With Autism Saw Their Learning Social Skills Boosted After Playing With This AI Robot News Parliament Skip content Thursday, February 27, 2020 Latest Mansplaining conferences How get fore..."
2,"Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek","Forget ML, AI Industry 4.0 obsolescence focus 26 February 2021 Test Rework Solutions Dataweek","\n\nForget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & Rework Solutions - Dataweek\nHome\nAbout us\nBack issues / E-book / PDF\nEMP Handbook\nSubscribe\...","Forget ML, AI Industry 4.0 obsolescence focus 26 February 2021 Test Rework Solutions Dataweek Home About us Back issues book PDF EMP Handbook Subscribe Advertise Categories Editor Choice Multimedi..."


### Tokenize the text from cleaned title and text columns 

In [13]:
%%time
df['title_tokens'] = df['title_clean'].parallel_apply(lambda x: nltk.tokenize.word_tokenize(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3232), Label(value='0 / 3232'))), …

CPU times: user 1.63 s, sys: 5.53 s, total: 7.16 s
Wall time: 7.44 s


In [14]:
%%time
df['text_tokens'] = df['text_clean'].parallel_apply(lambda x: nltk.tokenize.word_tokenize(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3232), Label(value='0 / 3232'))), …

CPU times: user 38.6 s, sys: 32.8 s, total: 1min 11s
Wall time: 1min 28s


In [15]:
df.columns

Index(['date', 'month', 'year', 'quarter', 'day', 'dayofwk', 'language',
       'title', 'text', 'title_clean', 'text_clean', 'title_tokens',
       'text_tokens'],
      dtype='object')

In [16]:
%%time
df.drop('language',axis =1).to_parquet('data_cleaned.parquet')

CPU times: user 40.5 s, sys: 8.96 s, total: 49.5 s
Wall time: 1min 9s


In [17]:
!jupyter nbconvert --to html 'NLP_Final_1.ipynb'

[NbConvertApp] Converting notebook NLP_Final_1.ipynb to html
[NbConvertApp] Writing 651344 bytes to NLP_Final_1.html


In [18]:
import datetime
import time
import pytz
datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Sun, 21 May 2023 18:15:52'