<center>
<span style="font-size: 36px;">NEWS SCRAPING & KEYWORDS</span>
</center>

Goal of this project : 
1. Scrape the news articles from webpage, organize data by normalization.
2. Analyze the article, grasp the insight by using external tool. (Tableau)

<center>
<span style="font-size: 26px;">STEP 1 : SCRAPING NEWS ARTICLES (CREATING DATABASE)</span>
</center>

In [181]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor

In [4]:
#response check url
url1 = "https://www.japantimes.co.jp/morearticles/world/?pgno=1"
requests.get(url1)

<Response [200]>

In [208]:
#News sections to scrape
section_list = ['japan', 'world', 'asia-pacific']

In [30]:
#get 'article class' under class 'jt-more-articles'
def get_html(session, section, page):
    url1 = f"https://www.japantimes.co.jp/morearticles/{section}/?pgno={page}"
    response = session.get(url1)
    htmltext = response.text
    orghtml = BeautifulSoup(htmltext, "html.parser")
    articles = orghtml.select(".jt-more-articles > .article")
    return articles


#Remove 'and','By','\n' and blanks from .article-byline to not remove by or and within author's name
def clean_names(input):
    cleaned_string = re.sub(r'\bBy\b', '', input, flags=re.IGNORECASE) #Remove 'By'
    cleaned_string = re.sub(r'\band\b', ',', cleaned_string, flags=re.IGNORECASE) #Replace 'and' with ','
    cleaned_string = cleaned_string.replace('\n','')
    cleaned_string = cleaned_string.strip()
    return cleaned_string

#get author,date,division,title,summary for each articles and clean it 
def html_seperation(article,newslist):
    title_element = article.select_one(".article-title > a").text
    division_element = article.select_one(".article-section").text.replace('\n','').replace('/',',')
    summary_element = article.select_one(".article-body > a").text
    date_element = article.select_one(".publish-date").text
    author_element = clean_names(article.select_one(".article-byline").text)
    newslist.append([date_element,division_element,author_element,title_element,summary_element])

#Change list into table and export to .csv file
def news_scrape(section, pagelim):
    news_data = []
    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(get_html, session, section, i) for i in range(1, pagelim)]
            for future in futures:
                articles = future.result()
                for article in articles:
                    news_data.append(html_seperation(article))
    all_df = pd.DataFrame(news_data, columns=['date', 'division', 'author', 'title', 'summary'])
    print(f"{section}... Success")
    return all_df


In [None]:
# Scrape news articles

section_dataframes = {}

for section in section_list:
    section_df = news_scrape(section, 1101)
    section_dataframes[section] = section_df

In [2]:
section_dataframes['japan']

Unnamed: 0,date,division,author,title,summary
0,"May 26, 2024","JAPAN, Politics",Jesse Johnson,Kishida meets China's Li and South Korea's Yoo...,The Japanese leader discussed improving ties w...
1,"May 26, 2024","JAPAN, Politics",,Constitutional Democratic Party executives can...,The CDP submitted a bill to the Lower House th...
2,"May 26, 2024","JAPAN, ANALYSIS",Gabriel Dominguez,Evolving drone and missile threats prompting T...,The low cost of mass-producing tools for moder...
3,"May 26, 2024",JAPAN,Andrew MCKIRDY,Japan wrestles with legacy of graft-stained Ga...,"Cost overruns, corruption and COVID-19 all tar..."
4,"May 25, 2024",JAPAN,,Saury fading from Japan's dining tables amid p...,Catches of saury are mired in a prolonged slum...
...,...,...,...,...,...
13195,"Jan 31, 2020",JAPAN,,Number of foreign workers in Japan totals reco...,Chinese accounted for a quarter of the total w...
13196,"Jan 31, 2020",JAPAN,,Record 31.19 million foreign nationals entered...,The number of foreign nationals who entered Ja...
13197,"Jan 31, 2020","JAPAN, Crime & Legal",,Actress Erika Sawajiri pleads guilty to posses...,Actress Erika Sawajiri pleaded guilty Friday t...
13198,"Jan 31, 2020","JAPAN, Politics",Eric Johnston,Could growing tourism troubles unseat Kyoto's ...,Kyoto voters head to the polls Sunday to cast ...


<center>
<span style="font-size: 26px;">STEP 2 : COUNTING WORDS FOR EACH SECTION</span>
</center>

In [1]:
import nltk
from nltk.corpus import stopwords
from collections import defaultdict

In [201]:
#add summary and title into dictionary file
def make_section_dictionary(sec):
    #file
    original_df = section_dataframes[sec]
    #combine last 2 columns, 'title' & 'summary', into one list
    combined_list = (original_df['title'] + ' ' + original_df['summary']).tolist()
    combined_string = " ".join(map(str,combined_list))
    #clean combined list
    combined_string_cleaned = combined_string.replace("'s", "").replace("\xa0"," ").replace("\'", "").replace("\n", "")
    return combined_string_cleaned

#For each words in result_word, leave only alphabets, then count the number of each words
def word_count(counter, result_word):
    if result_word:
        result_word = re.sub('[^A-Za-z]+', '', result_word)
        if result_word == None or result_word == '' or len(result_word) == 1: #remove Null values, or 1 character
            return
        else:
            counter[result_word] += 1   #count frequency of each words

#List of words that are used frequently, but not meaningful
def stop_words_gen():
    global stop_words
    stop_words = stopwords.words('english')
    custom =['one','two','three','since','say','said','says','also','could','would','may','many','like',
             'still','get','even','back','make','week','made','come','comes','talks','must','used','see',
             'th','set','around','take','second','among','go','big','good','long','way','monday','tuesday',
             'wednesday','thursday','Friday','Saturday','Sunday','day','years','year','year-old','yearold',
             'first','final','last']
    stop_words.extend(custom)
    stop_words.extend([word.capitalize() for word in stop_words])

#clean each words and count them 
def clean_section_dic(text):
    result = []
    counter = defaultdict(int)
    tokens = nltk.word_tokenize(text)   
    for token in tokens:
        if token not in stop_words:
            result.append(token)

    for word in result:
        word_count(counter, word)
    sorted_counter = dict(sorted(counter.items(), key=lambda item: item[1], reverse=True))
    df = pd.DataFrame(list(sorted_counter.items()), columns=['words', 'nums'])
    df = df[df['nums']>100] #remove words that were used less than 100 times
    return df

In [236]:
#Give each section a unique id (number)
section_nums = pd.DataFrame(enumerate(section_list,start=101),columns=['section','name'])
section_nums = section_nums[['name','section']]
section_nums

Unnamed: 0,name,section
0,japan,101
1,world,102
2,asia-pacific,103


In [237]:
#concatenate all dataframes
stop_words_gen()
data_frame = []
for i, section in enumerate(section_list, start=101):
    sec_sentence = make_section_dictionary(section) # Concatenate 'title'&'summary' column from scraped articles
    sec_word = clean_section_dic(sec_sentence)  # Clean the column and drop words used less than 100 times
    sec_word.insert(0, 'section', i)  # Add section number column    
    data_frame.append(sec_word) # Add counted tables to list
final_df = pd.concat(data_frame, ignore_index=True)
final_df

Unnamed: 0,section,words,nums
0,101,Japan,8382
1,101,Tokyo,2733
2,101,COVID,1877
3,101,Japanese,1694
4,101,new,1687
...,...,...,...
1390,103,whether,102
1391,103,came,101
1392,103,worst,101
1393,103,battle,101


In [238]:
#convert to .csv file to use in tableau
section_nums.to_csv('/Users/ronny/Code/Project/sections_num.csv', index=False)
final_df.to_csv('/Users/ronny/Code/Project/finaltotalnumcount.csv', index=False)

<center>
<span style="font-size: 26px;">STEP 3: Author analysis</span>
</center>

In [214]:
import datetime

In [215]:
def clean_author_word(words, col):  # clean 'words', remove irrelevent words
    if words:
        words = re.sub('[^A-Za-z]+','', words)
        if words == None or words in stop_words or words == '' or len(words) == 1:
            return()
        else:
            col.append(words)

def merge_title_summary(input_table,output_table):  # merge column 'title' and 'summary' to split into strings
    title = input_table['title'][i] if pd.notna(input_table['title'][i]) else ''
    summary = input_table['summary'][i] if pd.notna(input_table['summary'][i]) else ''
    t = title.split()
    s = summary.split()
    merged_t_s = t + s
    unclean_merge = []
    for word in merged_t_s:
        clean_author_word(word,unclean_merge)
    output_table.append(unclean_merge)
    return output_table

def seperate_authors(input):    # Seperate authors, for single article with multiple authors
    input['author'] = input['author'].str.split(',')
    sepdf = input.explode('author')
    sepdf['author'] = sepdf['author'].str.strip()
    return sepdf

def active_authors(input):  # remove authors who wrote less than 30 articles in total
    author_article_count = input['author'].value_counts()
    active_author = author_article_count[author_article_count >= 30].index  #author who wrote more than 30 articles
    fin = input[input['author'].isin(active_author)].reset_index(drop=True)
    fin['division'] = fin['division'].str.split(',')    #make division column as string to count division later
    return fin

In [240]:
original_csv = pd.DataFrame(section_dataframes['japan'])
dropauthor = original_csv[original_csv['author'].str.strip() != ''].reset_index(drop=True)
clean_merged = []

for i in range(len(dropauthor)):
    merge_title_summary(dropauthor,clean_merged)

dropauthor['merged'] = clean_merged     #add column 'merged' and add merged title and summary
author_merged = dropauthor.drop(columns=['title', 'summary'])   #drop column 'title' and 'summary'

sepauthor = seperate_authors(author_merged)    #seperate authors into different rows

#active author (authors who wrote more than 30 articles)
activeauthor = active_authors(sepauthor)

#change string into date 
for i in range(len(activeauthor['division'])):
    activeauthor.loc[i ,'date'] = pd.to_datetime(activeauthor['date'][i]).strftime('%Y %b')   #to avoid chained assignment (in future case)
    for j in range(len(activeauthor['division'][i])):
        activeauthor['division'][i][j] = activeauthor['division'][i][j].strip().capitalize()
activeauthor

Unnamed: 0,date,division,author,merged
0,"May 26, 2024","[JAPAN, Politics]",Jesse Johnson,"[Kishida, meets, Chinas, Li, South, Koreas, Yo..."
1,"May 26, 2024","[JAPAN, ANALYSIS]",Gabriel Dominguez,"[Evolving, drone, missile, threats, prompting,..."
2,"May 24, 2024",[JAPAN],Yukana Inoue,"[Emergency, probe, Japan, Airlines, carried, f..."
3,"May 24, 2024","[JAPAN, Politics, FOCUS]",Jesse Johnson,"[JapanSouth, KoreaChina, trilateral, summit, c..."
4,"May 23, 2024","[JAPAN, Science & Health]",Yukana Inoue,"[Health, ministry, panel, urges, consolidation..."
...,...,...,...,...
3458,"Feb 2, 2020","[JAPAN, Crime & Legal]",Magdalena Osumi,"[Justice, chief, Masako, Mori, defends, Japans..."
3459,"Feb 1, 2020","[JAPAN, Media, MEDIA MIX]",Philip Brasor,"[Shinjiro, Koizumis, paternity, leave, raises,..."
3460,"Feb 1, 2020","[JAPAN, Media, BIG IN JAPAN]",Mark Schreiber,"[Dispatches, front, line, Japans, retail, sect..."
3461,"Jan 31, 2020",[JAPAN],Satoshi Sugiyama,"[Japan, step, coronavirus, action, Abe, takes,..."


In [228]:
def total_author_info(input,output):
    # Iterate over each row in the DataFrame
    for index, row in input.iterrows():
        author = row['author']
        divisions = row['division']
        date = row['date']
        words = row['merged']

        if author not in output:
            output[author] = {
                'author_article_division': defaultdict(int),
                'author_used_words': defaultdict(int),
                'author_article_date': defaultdict(int)
            }

        # Update division count
        for division in divisions:
            output[author]['author_article_division'][division] += 1

        # Update word counts
        for word in words:
            output[author]['author_used_words'][word] += 1

        # Update article date count
        output[author]['author_article_date'][date] += 1
        # Convert defaultdict to dict for final output and sort dates in reverse chronological order
    
    for author in output:
        output[author]['author_article_division'] = dict(output[author]['author_article_division'])

        output[author]['author_used_words'] = dict(
            sorted(output[author]['author_used_words'].items(),
            key=lambda x: x[1], # Define the order as the count of words, which is x[1]
            reverse=True)[:20]  # Select top 20 words
        )
        output[author]['author_article_date'] = dict(
            sorted(output[author]['author_article_date'].items(),
            key=lambda x: x[0]) # Organize by date
    )

# Empty dictionary for storing author data
authors_data = {}
total_author_info(activeauthor,authors_data)


In [229]:
def authors_info_table(input,author_category):  # Create dictionaries to save counted numbers
    author_to_id = {author: id for id, author in enumerate(input.keys(), start=1)}
    author_article_count = {author: sum(data['author_article_date'].values()) for author, data in input.items()}
    rows_author = [{'Author ID': author_id,'Author Category' : author_category ,'Author': author, 'Article Count': author_article_count[author]} for author, author_id in author_to_id.items()]

    # Create DataFrames for each aspect
    rows_date = []
    rows_division = []
    rows_words = []

    # Add Author information into each DataFrames
    for author, data in input.items():
        author_id = author_to_id[author]    # Consider Author as author id
        for date, date_count in data['author_article_date'].items():    # Count number of articles written in date 
            rows_date.append({
                'Author ID': author_id,
                'Date': date,
                'Date Count': date_count
            })
        for division, division_count in data['author_article_division'].items():    # Count number of articles written in each division
            rows_division.append({
                'Author ID': author_id,
                'Division': division,
                'Division Count': division_count
            })
        for word, word_count in data['author_used_words'].items():  # Count used words for each authors 
            rows_words.append({
                'Author ID': author_id,
                'Word': word,
                'Word Count': word_count
            })
    return rows_author, rows_date, rows_division, rows_words

author_table, author_date, author_division, author_words = authors_info_table(authors_data,1001) #1001 is category id of 'japan'

In [239]:
# Change into DataFrames
df_author = pd.DataFrame(author_table)
df_date = pd.DataFrame(author_date)
df_division = pd.DataFrame(author_division)
df_words = pd.DataFrame(author_words)
df_author

Unnamed: 0,Author ID,Date,Date Count
0,1,2015 Dec,1
1,1,2016 Apr,9
2,1,2016 Aug,12
3,1,2016 Dec,9
4,1,2016 Feb,2
...,...,...,...
731,22,2022 Mar,2
732,22,2022 May,3
733,22,2023 Apr,2
734,22,2023 Mar,4


<center>
<span style="font-size: 26px;">Convert Table into .csv file for Analysis</span>
</center>

In [234]:
# Create normalized tables for each section
for sec_num,section in enumerate(section_list,start=1001):
    original_csv = pd.DataFrame(section_dataframes[section])
    dropauthor = original_csv[original_csv['author'].str.strip() != ''].reset_index(drop=True)
    clean_merged = []

    for i in range(len(dropauthor)):
        merge_title_summary(dropauthor,clean_merged)

    dropauthor['merged'] = clean_merged     #add column 'merged' and add merged title and summary
    author_merged = dropauthor.drop(columns=['title', 'summary'])   #drop column 'title' and 'summary'

    sepauthor = seperate_authors(author_merged)    #seperate authors into different rows

    #active author (authors who wrote more than 30 articles)
    activeauthor = active_authors(sepauthor)

    #change string into date 
    for i in range(len(activeauthor['division'])):
        activeauthor.loc[i ,'date'] = pd.to_datetime(activeauthor['date'][i]).strftime('%Y %b')   #to avoid chained assignment (in future case)
        for j in range(len(activeauthor['division'][i])):
            activeauthor['division'][i][j] = activeauthor['division'][i][j].strip().capitalize()

    authors_data = {}
    total_author_info(activeauthor,authors_data)
    author_table, author_date, author_division, author_words = authors_info_table(authors_data,sec_num)
    
    df_author = pd.DataFrame(author_table)
    df_date = pd.DataFrame(author_date)
    df_division = pd.DataFrame(author_division)
    df_words = pd.DataFrame(author_words)

    df_author.to_csv(f'/Users/ronny/Code/Project/newauthor/{section}-authorinfo.csv', index=False)
    df_date.to_csv(f'/Users/ronny/Code/Project/newauthor/{section}-date.csv', index=False)
    df_division.to_csv(f'/Users/ronny/Code/Project/newauthor/{section}-division.csv', index=False)
    df_words.to_csv(f'/Users/ronny/Code/Project/newauthor/{section}-words.csv', index=False)