# [Data Collection]

## 1. Merge collected data

In [1]:
import pandas as pd
import glob
from datetime import datetime

In [2]:
# read csv files

file_paths = glob.glob('/Users/uriko/Desktop/UoL/Term 3/data/topviews-*.csv')
data_list = []

In [3]:
for file_path in file_paths:
    # extract date from file name
    file_name = file_path.split('/')[-1]
    date_str = file_name.split('-')[1].replace('_', '-').replace('.csv', '')
    date = datetime.strptime(date_str, '%Y-%m')
    
    df = pd.read_csv(file_path)
    df['date'] = date
    data_list.append(df)

df = pd.concat(data_list, ignore_index=True)

In [4]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,Cleopatra,4,4,5286746,98.9,2022-06-01
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01
...,...,...,...,...,...,...
59063,Whitney Houston,?,?,276438,,2022-08-01
59064,Partition of India,?,?,276136,,2022-08-01
59065,Rita Ora,?,?,276122,,2022-08-01
59066,Kaduva,?,?,276056,,2022-08-01


In [6]:
df['date'].unique()

<DatetimeArray>
['2022-06-01 00:00:00', '2022-12-01 00:00:00', '2020-03-01 00:00:00',
 '2020-02-01 00:00:00', '2022-07-01 00:00:00', '2022-11-01 00:00:00',
 '2022-05-01 00:00:00', '2020-01-01 00:00:00', '2022-04-01 00:00:00',
 '2022-10-01 00:00:00', '2020-11-01 00:00:00', '2020-05-01 00:00:00',
 '2020-04-01 00:00:00', '2020-10-01 00:00:00', '2022-01-01 00:00:00',
 '2022-03-01 00:00:00', '2020-06-01 00:00:00', '2020-12-01 00:00:00',
 '2020-07-01 00:00:00', '2022-02-01 00:00:00', '2021-08-01 00:00:00',
 '2021-09-01 00:00:00', '2019-09-01 00:00:00', '2023-08-01 00:00:00',
 '2023-09-01 00:00:00', '2019-08-01 00:00:00', '2019-11-01 00:00:00',
 '2019-05-01 00:00:00', '2021-01-01 00:00:00', '2023-10-01 00:00:00',
 '2023-04-01 00:00:00', '2023-05-01 00:00:00', '2023-11-01 00:00:00',
 '2019-04-01 00:00:00', '2019-10-01 00:00:00', '2019-06-01 00:00:00',
 '2019-12-01 00:00:00', '2021-02-01 00:00:00', '2023-07-01 00:00:00',
 '2023-12-01 00:00:00', '2023-06-01 00:00:00', '2021-03-01 00:00:00',
 '20

In [7]:
# drop rows having '?' in Edits and Editors columns
df = df[~df['Edits'].str.contains('\?') & ~df['Editors'].str.contains('\?')]

# reset index
df.reset_index(drop=True, inplace=True)

In [8]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,Cleopatra,4,4,5286746,98.9,2022-06-01
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01
...,...,...,...,...,...,...
5989,CEO,0,0,942147,1.7,2022-08-01
5990,Chloe Lattanzi,28,21,935080,82.2,2022-08-01
5991,Biagio da Cesena,9,8,933621,82.8,2022-08-01
5992,Microsoft Windows,3,2,932748,7.1,2022-08-01


## 2. Collect text data in summary section from article (api)

In [9]:
import requests

def get_wikipedia_article(title, lang='en'):
    # Wikipedia API 엔드포인트
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"

    # 요청 파라미터 설정
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,  # 서론 내용을 가져옴
        "explaintext": True  # HTML 태그를 제거하고 텍스트만 가져옴
    }

    # API 요청
    response = requests.get(endpoint, params=params)
    data = response.json()

    # 요청한 문서의 내용을 추출
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return "문서를 찾을 수 없습니다."

# 예제 사용
# 데이터프레임의 각 행에 대해 Wikipedia 문서의 전체 내용을 가져와 'summary' 컬럼에 추가합니다.
df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


In [10]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,Cleopatra,4,4,5286746,98.9,2022-06-01,Cleopatra VII Thea Philopator (Koinē Greek: Κλ...
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01,Top Gun: Maverick is a 2022 American action dr...
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01,"Roe v. Wade, 410 U.S. 113 (1973), was a landma..."
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01,
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01,Vikram is a 2022 Indian Tamil-language action ...
...,...,...,...,...,...,...,...
5989,CEO,0,0,942147,1.7,2022-08-01,
5990,Chloe Lattanzi,28,21,935080,82.2,2022-08-01,"Chloe Rose Lattanzi (born January 17, 1986) is..."
5991,Biagio da Cesena,9,8,933621,82.8,2022-08-01,"Biagio Martinelli (Cesena 1463 – Rome 1544), b..."
5992,Microsoft Windows,3,2,932748,7.1,2022-08-01,Microsoft Windows is a product line of proprie...


In [11]:
# check missing data in the summary column

df[df['summary'].isnull() | (df['summary'] == '')]

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01,
7,Deaths in 2022,0,0,3598277,48.8,2022-06-01,
13,Obi-Wan Kenobi (TV series),0,0,2964187,64.8,2022-06-01,
14,2022 Russian invasion of Ukraine,0,0,2834869,52.1,2022-06-01,
33,Ms. Marvel (TV series),0,0,1731280,67.2,2022-06-01,
...,...,...,...,...,...,...,...
5901,Deaths in 2022,0,0,4129478,49.8,2022-08-01,
5916,2022 Russian invasion of Ukraine,0,0,1929415,52.2,2022-08-01,
5932,Better Call Saul (season 6),0,0,1463423,58.4,2022-08-01,
5958,2022 monkeypox outbreak,0,0,1152672,81.4,2022-08-01,


In [14]:
# apply redirects api to fill the missing data in the summary column

def get_wikipedia_article(title, lang='en'):
    # Wikipedia API 엔드포인트
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"

    # 요청 파라미터 설정
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,  # 서론 내용을 가져옴
        "explaintext": True,  # HTML 태그를 제거하고 텍스트만 가져옴
        "redirects": True  # 제목이 변경된 경우에도 리디렉션하여 가져옴
    }

    # API 요청
    response = requests.get(endpoint, params=params)
    data = response.json()

    # 요청한 문서의 내용을 추출
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return "문서를 찾을 수 없습니다."

# summary 컬럼이 공백인 행에 대해서만 API를 호출하여 summary 데이터를 채웁니다.
def fill_missing_summary(row):
    if pd.isnull(row['summary']) or row['summary'] == '':
        article_title = row['Page']
        try:
            row['summary'] = get_wikipedia_article(article_title)
        except Exception as e:
            print(f"Failed to fetch summary for '{article_title}': {e}")
    return row

# apply 함수를 사용하여 각 행에 대해 fill_missing_summary 함수를 적용합니다.
df = df.apply(fill_missing_summary, axis=1)


In [15]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,Cleopatra,4,4,5286746,98.9,2022-06-01,Cleopatra VII Thea Philopator (Koinē Greek: Κλ...
1,Top Gun: Maverick,1024,433,4956212,76.4,2022-06-01,Top Gun: Maverick is a 2022 American action dr...
2,Roe v. Wade,305,86,4399634,78.5,2022-06-01,"Roe v. Wade, 410 U.S. 113 (1973), was a landma..."
3,Stranger Things (season 4),0,0,4320822,78.3,2022-06-01,The fourth season of the American science fict...
4,Vikram (2022 film),525,110,4135394,87.2,2022-06-01,Vikram is a 2022 Indian Tamil-language action ...
...,...,...,...,...,...,...,...
5989,CEO,0,0,942147,1.7,2022-08-01,A chief executive officer (CEO) (chief executi...
5990,Chloe Lattanzi,28,21,935080,82.2,2022-08-01,"Chloe Rose Lattanzi (born January 17, 1986) is..."
5991,Biagio da Cesena,9,8,933621,82.8,2022-08-01,"Biagio Martinelli (Cesena 1463 – Rome 1544), b..."
5992,Microsoft Windows,3,2,932748,7.1,2022-08-01,Microsoft Windows is a product line of proprie...


In [16]:
df[df['summary'].isnull() | (df['summary'] == '')]

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary


In [17]:
df.to_csv('/Users/uriko/Desktop/UoL/Term 3/data/final.csv', index=False)

In [20]:
missing_data = df[df.isnull() | (df == '')]
missing_data

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,,,,,,NaT,
1,,,,,,NaT,
2,,,,,,NaT,
3,,,,,,NaT,
4,,,,,,NaT,
...,...,...,...,...,...,...,...
5989,,,,,,NaT,
5990,,,,,,NaT,
5991,,,,,,NaT,
5992,,,,,,NaT,
