# [Data Collection]

## 1. Merge collected data

In [1]:
import pandas as pd
import glob
from datetime import datetime

In [2]:
# read csv files

file_paths = glob.glob('/Users/uriko/Desktop/UoL/Term 3/data-ko/topviews-*.csv')
data_list = []

In [3]:
for file_path in file_paths:
    # extract date from file name
    file_name = file_path.split('/')[-1]
    date_str = file_name.split('-')[1].replace('_', '-').replace('.csv', '')
    date = datetime.strptime(date_str, '%Y-%m')
    
    df = pd.read_csv(file_path)
    df['date'] = date
    data_list.append(df)

df = pd.concat(data_list, ignore_index=True)

In [4]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,문화방송,20,13,886264,69.6,2022-06-01
1,한국방송공사,62,23,631101,80.1,2022-06-01
2,클레오파트라,10,5,436180,99.8,2022-06-01
3,유튜브,5,3,237653,18.3,2022-06-01
4,한국교육방송공사,9,9,143655,69.9,2022-06-01
...,...,...,...,...,...,...
58882,쌍반점,?,?,3428,,2022-08-01
58883,김민재 (1996년),?,?,3427,,2022-08-01
58884,이종찬 (1936년),?,?,3426,,2022-08-01
58885,김영삼,?,?,3421,,2022-08-01


In [5]:
df['date'].unique()

<DatetimeArray>
['2022-06-01 00:00:00', '2022-12-01 00:00:00', '2020-03-01 00:00:00',
 '2020-02-01 00:00:00', '2022-07-01 00:00:00', '2022-11-01 00:00:00',
 '2022-05-01 00:00:00', '2020-01-01 00:00:00', '2022-04-01 00:00:00',
 '2022-10-01 00:00:00', '2020-11-01 00:00:00', '2020-05-01 00:00:00',
 '2020-04-01 00:00:00', '2020-10-01 00:00:00', '2022-01-01 00:00:00',
 '2022-03-01 00:00:00', '2020-06-01 00:00:00', '2020-12-01 00:00:00',
 '2020-07-01 00:00:00', '2022-02-01 00:00:00', '2021-08-01 00:00:00',
 '2021-09-01 00:00:00', '2019-09-01 00:00:00', '2023-08-01 00:00:00',
 '2023-09-01 00:00:00', '2019-08-01 00:00:00', '2019-11-01 00:00:00',
 '2019-05-01 00:00:00', '2021-01-01 00:00:00', '2023-10-01 00:00:00',
 '2023-04-01 00:00:00', '2023-05-01 00:00:00', '2023-11-01 00:00:00',
 '2019-04-01 00:00:00', '2019-10-01 00:00:00', '2019-06-01 00:00:00',
 '2019-12-01 00:00:00', '2021-02-01 00:00:00', '2023-07-01 00:00:00',
 '2023-12-01 00:00:00', '2023-06-01 00:00:00', '2021-03-01 00:00:00',
 '20

In [6]:
# drop rows having '?' in Edits and Editors columns
df = df[~df['Edits'].str.contains('\?') & ~df['Editors'].str.contains('\?')]

# reset index
df.reset_index(drop=True, inplace=True)

In [7]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,문화방송,20,13,886264,69.6,2022-06-01
1,한국방송공사,62,23,631101,80.1,2022-06-01
2,클레오파트라,10,5,436180,99.8,2022-06-01
3,유튜브,5,3,237653,18.3,2022-06-01
4,한국교육방송공사,9,9,143655,69.9,2022-06-01
...,...,...,...,...,...,...
5920,구강성교,0,0,13468,91.6,2022-08-01
5921,나르시시즘,0,0,13462,89.1,2022-08-01
5922,미남당,54,22,13340,73.7,2022-08-01
5923,주기율표,4,3,13229,28.2,2022-08-01


## 2. Collect text data in summary section from article (api)

In [8]:
import requests

def get_wikipedia_article(title, lang='ko'):
    # Wikipedia API endpoint
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"

    # setting parameters
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,  # crawling text in summary section
        "explaintext": True,  # crawling only text, removing HTML tag (hyperlinks)
        "redirects": True  # if the title of article changes, find redirected page
    }

    # request API
    try:
        response = requests.get(endpoint, params=params)
        response.raise_for_status()  # exception when an HTTP error occurs
        data = response.json()  # errors may occur when decoding JSON
    except requests.RequestException as e:
        print(f"Request error for '{title}': {e}")
        return "request error"
    except ValueError as e:
        print(f"JSON decode error for '{title}': {e}")
        return "JSON decode error"

    # extract text
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return "cannot find document."

# add text into the summary column by matching the title data of the Page column
df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


In [9]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,문화방송,20,13,886264,69.6,2022-06-01,"문화방송(文化放送, 영어: Munhwa Broadcasting Corporation..."
1,한국방송공사,62,23,631101,80.1,2022-06-01,"한국방송공사(韓國放送公社, 영어: Korean Broadcasting System,..."
2,클레오파트라,10,5,436180,99.8,2022-06-01,"클레오파트라 7세 필로파토르(그리스어: Κλεοπάτρα Φιλοπάτωρ, 영어:..."
3,유튜브,5,3,237653,18.3,2022-06-01,유튜브(영어: YouTube)는 2005년 서비스를 시작한 동영상 공유 플랫폼이다....
4,한국교육방송공사,9,9,143655,69.9,2022-06-01,"한국교육방송공사(韓國敎育放送公社, 영어: Korea Educational Broad..."
...,...,...,...,...,...,...,...
5920,구강성교,0,0,13468,91.6,2022-08-01,"구강성교(한국 한자: 口腔性交, 영어: oral sex)란 입, 혀, 식도, 치아,..."
5921,나르시시즘,0,0,13462,89.1,2022-08-01,"나르시시즘(영어: narcissism) 또는 자기애(自己愛, self-love)는 ..."
5922,미남당,54,22,13340,73.7,2022-08-01,《미남당》은 2022년 6월 27일부터 2022년 8월 23일까지 방영된 KBS 2...
5923,주기율표,4,3,13229,28.2,2022-08-01,"주기율표(週期律表, 문화어: 주기률표, 영어: periodic table) 또는 주..."


In [10]:
# check missing data in the summary column

df[df['summary'].isnull() | (df['summary'] == '')]

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary


In [11]:
missing_data = df[df.isnull() | (df == '')]
missing_data

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,,,,,,NaT,
1,,,,,,NaT,
2,,,,,,NaT,
3,,,,,,NaT,
4,,,,,,NaT,
...,...,...,...,...,...,...,...
5920,,,,,,NaT,
5921,,,,,,NaT,
5922,,,,,,NaT,
5923,,,,,,NaT,


In [12]:
df.to_csv('/Users/uriko/Desktop/UoL/Term 3/data-ko/final_ko.csv', index=False)