# [Data Collection]

## 1. Merge collected data

In [1]:
import pandas as pd
import glob
from datetime import datetime

In [2]:
# read csv files

file_paths = glob.glob('/Users/uriko/Desktop/UoL/Term 3/data-de/topviews-*.csv')
data_list = []

In [3]:
for file_path in file_paths:
    # extract date from file name
    file_name = file_path.split('/')[-1]
    date_str = file_name.split('-')[1].replace('_', '-').replace('.csv', '')
    date = datetime.strptime(date_str, '%Y-%m')
    
    df = pd.read_csv(file_path)
    df['date'] = date
    data_list.append(df)

df = pd.concat(data_list, ignore_index=True)

In [4]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,DEV,0,0,1110664,0.2,2022-06-01
1,Stranger Things,66,30,598906,79.6,2022-06-01
2,Fronleichnam,145,21,516868,73.8,2022-06-01
3,Nekrolog 2022,11,9,457694,37.8,2022-06-01
4,Pfingsten,12,10,407362,78.7,2022-06-01
...,...,...,...,...,...,...
58815,Arch Enemy,?,?,36081,,2022-08-01
58816,Sansibar,?,?,36079,,2022-08-01
58817,Sri Lanka,?,?,36056,,2022-08-01
58818,Conor McGregor,?,?,36055,,2022-08-01


In [5]:
df['date'].unique()

<DatetimeArray>
['2022-06-01 00:00:00', '2022-12-01 00:00:00', '2020-03-01 00:00:00',
 '2020-02-01 00:00:00', '2022-07-01 00:00:00', '2022-11-01 00:00:00',
 '2022-05-01 00:00:00', '2020-01-01 00:00:00', '2022-04-01 00:00:00',
 '2022-10-01 00:00:00', '2020-11-01 00:00:00', '2020-05-01 00:00:00',
 '2020-04-01 00:00:00', '2020-10-01 00:00:00', '2022-01-01 00:00:00',
 '2022-03-01 00:00:00', '2020-06-01 00:00:00', '2020-12-01 00:00:00',
 '2020-07-01 00:00:00', '2022-02-01 00:00:00', '2021-08-01 00:00:00',
 '2021-09-01 00:00:00', '2019-09-01 00:00:00', '2023-08-01 00:00:00',
 '2023-09-01 00:00:00', '2019-08-01 00:00:00', '2019-11-01 00:00:00',
 '2019-05-01 00:00:00', '2021-01-01 00:00:00', '2023-10-01 00:00:00',
 '2023-04-01 00:00:00', '2023-05-01 00:00:00', '2023-11-01 00:00:00',
 '2019-04-01 00:00:00', '2019-10-01 00:00:00', '2019-06-01 00:00:00',
 '2019-12-01 00:00:00', '2021-02-01 00:00:00', '2023-07-01 00:00:00',
 '2023-12-01 00:00:00', '2023-06-01 00:00:00', '2021-03-01 00:00:00',
 '20

In [6]:
# drop rows having '?' in Edits and Editors columns
df = df[~df['Edits'].str.contains('\?') & ~df['Editors'].str.contains('\?')]

# reset index
df.reset_index(drop=True, inplace=True)

In [7]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date
0,DEV,0,0,1110664,0.2,2022-06-01
1,Stranger Things,66,30,598906,79.6,2022-06-01
2,Fronleichnam,145,21,516868,73.8,2022-06-01
3,Nekrolog 2022,11,9,457694,37.8,2022-06-01
4,Pfingsten,12,10,407362,78.7,2022-06-01
...,...,...,...,...,...,...
5957,Tatort: Reifezeugnis,9,8,108670,72.8,2022-08-01
5958,James Tupper,5,3,107564,82.8,2022-08-01
5959,Liste der IPA-Zeichen,1,1,107233,62.7,2022-08-01
5960,Lisa-Maria Kellermayr,176,61,105825,49.5,2022-08-01


## 2. Collect text data in summary section from article (api)

In [9]:
import requests

def get_wikipedia_article(title, lang='de'):
    # Wikipedia API endpoint
    endpoint = f"https://{lang}.wikipedia.org/w/api.php"

    # setting parameters
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,  # crawling text in summary section
        "explaintext": True,  # crawling only text, removing HTML tag (hyperlinks)
        "redirects": True  # crawling only text, removing HTML tag (hyperlinks)
    }

    # request API
    try:
        response = requests.get(endpoint, params=params)
        response.raise_for_status()  # exception when an HTTP error occurs
        data = response.json()  # errors may occur when decoding JSON
    except requests.RequestException as e:
        print(f"Request error for '{title}': {e}")
        return "request error"
    except ValueError as e:
        print(f"JSON decode error for '{title}': {e}")
        return "JSON decode error"

    # extract text
    page = next(iter(data['query']['pages'].values()))
    if 'extract' in page:
        return page['extract']
    else:
        return "cannot find document."

# add text into the summary column by matching the title data of the Page column
df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = df['Page'].apply(lambda title: get_wikipedia_article(title))


In [10]:
df

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,DEV,0,0,1110664,0.2,2022-06-01,DEV steht für:\n\nDeutsche Einheitsverfahren z...
1,Stranger Things,66,30,598906,79.6,2022-06-01,Stranger Things (englisch für ‚merkwürdigere D...
2,Fronleichnam,145,21,516868,73.8,2022-06-01,Das Fronleichnamsfest ist ein Hochfest im Kirc...
3,Nekrolog 2022,11,9,457694,37.8,2022-06-01,Dies ist eine Liste von im Jahr 2022 verstorbe...
4,Pfingsten,12,10,407362,78.7,2022-06-01,Pfingsten (von griech. „fünfzigster Tag“) ist ...
...,...,...,...,...,...,...,...
5957,Tatort: Reifezeugnis,9,8,108670,72.8,2022-08-01,Reifezeugnis ist ein deutscher Fernsehfilm der...
5958,James Tupper,5,3,107564,82.8,2022-08-01,James Howard Tupper (* 4. August 1965 in Dartm...
5959,Liste der IPA-Zeichen,1,1,107233,62.7,2022-08-01,Diese Liste der Zeichen des Internationalen Ph...
5960,Lisa-Maria Kellermayr,176,61,105825,49.5,2022-08-01,Lisa-Maria Kellermayr (* 22. Oktober 1985; † 2...


In [11]:
# check missing data in the summary column

df[df['summary'].isnull() | (df['summary'] == '')]

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
4278,Formelsammlung Trigonometrie,3,3,471817,2.1,2019-03-01,


In [12]:
# add missing data manually 

df.loc[df['Page'] == 'Formelsammlung Trigonometrie', 'summary'] = 'Dieser Artikel ist eine Formelsammlung zum Thema Trigonometrie. Es werden mathematische Symbole verwendet, die im Artikel Liste mathematischer Symbole erläutert werden.'

In [13]:
df[df['summary'].isnull() | (df['summary'] == '')]

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary


In [14]:
missing_data = df[df.isnull() | (df == '')]
missing_data

Unnamed: 0,Page,Edits,Editors,Views,Mobile %,date,summary
0,,,,,,NaT,
1,,,,,,NaT,
2,,,,,,NaT,
3,,,,,,NaT,
4,,,,,,NaT,
...,...,...,...,...,...,...,...
5957,,,,,,NaT,
5958,,,,,,NaT,
5959,,,,,,NaT,
5960,,,,,,NaT,


In [15]:
df.to_csv('/Users/uriko/Desktop/UoL/Term 3/data-de/final_de.csv', index=False)