# Making original large CSV files

This notebook generates **5 CSV files**, one for each country.

The CSV files include the following countries: **Australia, Canada, India, United Kingdom, and the United States**

These files contain the **daily pageviews from February 6, 2023 through December 31, 2024** for each country's **10,000 most-viewed articles** (ranked by total views over that period).

*This code was edited from the code provided in the Final Project folder from Google Drive*

### Australia

In [None]:
# Script to read DBPD files, and get lines that fit a criteria
import pandas as pd
import requests, json, datetime, time, io, csv


# Helper function
def write_to_file(date, entries, saveFile):
    """Append one tab-separated record per entry to ``saveFile``.

    Each record is written on its own line as ``date<TAB>line<TAB>count``,
    i.e. the date is added in front of every entry.

    Args:
        date: Value written as the first column of every record.
        entries: Iterable of ``(line, count)`` pairs.
        saveFile: Path of the file to append to. It is opened in append
            mode with UTF-8 encoding.
    """
    with open(saveFile, 'a', encoding="utf-8") as outf:
        for line, count in entries:
            # Drop any line ending the entry already carries, then add
            # exactly one, so consecutive records never run together on a
            # single line and never produce blank lines.
            record = f"{date}\t{line}\t{count}".rstrip("\r\n")
            outf.write(record + "\n")

# Download one TSV per day, keep the en.wikipedia rows for Australia, then
# save the rows belonging to the 10,000 articles with the most total views.
baseURL = "https://analytics.wikimedia.org/published/datasets/country_project_page"
startDate1 = "2023-02-06"
endDate1 = "2023-12-31"
startDate2 = "2024-01-01"
endDate2 = "2024-12-31"

dateRange1 = pd.date_range(start=startDate1,end=endDate1)
dateRange2 = pd.date_range(start=startDate2,end=endDate2)

saveFile = "filtered_en_wiki_data.csv"  # not used in this cell; kept in case other cells read it
ourEntries = []

for dateRange in [dateRange1, dateRange2]:

    for date in dateRange.to_list():
        date = str(date.date())

        URL = f"{baseURL}/{date}.tsv"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and a failed download is reported and skipped the same way a
        # non-200 response is, instead of discarding every day fetched so far.
        try:
            response = requests.get(URL,
                                    headers={"User-agent":"Wikipedia Bot, student project"},
                                    timeout=60)
        except requests.RequestException as err:
            print(f"Error: {err} for date={date}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for date={date}")
            continue

        data = io.StringIO(response.text)

        for entry in data:

            values = entry.strip("\n").split('\t')

            # Blank or truncated lines have no project field to test;
            # skip them rather than raising IndexError on values[2].
            if len(values) < 3:
                continue

            # Only English Wikipedia rows for this country
            if values[2].strip() == 'en.wikipedia' and values[0].strip() == 'Australia':
                values = [date] + values
                ourEntries.append(values)


# The date is prepended to each kept row, so it is the first column.
columns = ["date", "country", "country_code", "project", "page_id", "article", "qid", "views"]
df = pd.DataFrame(ourEntries, columns=columns)

# Unparseable view counts become 0 so the column can be summed as integers.
df["views"] = pd.to_numeric(df["views"], errors="coerce").fillna(0).astype(int)
df["date"] = pd.to_datetime(df["date"])

# Total views per article over the whole period.
info = df.groupby("article", as_index=False)["views"].sum()

top_articles = info.sort_values("views", ascending=False).head(10000)["article"]

# Keep every daily row that belongs to one of the top articles.
filtered_df = df[df["article"].isin(top_articles)]

filtered_df.to_csv("top_australia.csv", index=False)
filtered_df.head()

Unnamed: 0,date,country,country_code,project,page_id,article,qid,views
0,2023-02-06,Australia,AU,en.wikipedia,19980263,Margot_Robbie,Q1924847,299
1,2023-02-06,Australia,AU,en.wikipedia,28786805,Karl_Stefanovic,Q6372275,90
3,2023-02-06,Australia,AU,en.wikipedia,44059,Harrison_Ford,Q81328,367
4,2023-02-06,Australia,AU,en.wikipedia,46823,George_V,Q269412,125
5,2023-02-06,Australia,AU,en.wikipedia,52382,Watergate_scandal,Q42761,97


### India

In [None]:
# Script to read DBPD files, and get lines that fit a criteria
import pandas as pd
import requests, json, datetime, time, io, csv

# Download one TSV per day, keep the en.wikipedia rows for India, then
# save the rows belonging to the 10,000 articles with the most total views.
baseURL = "https://analytics.wikimedia.org/published/datasets/country_project_page"
startDate1 = "2023-02-06"
endDate1 = "2023-12-31"
startDate2 = "2024-01-01"
endDate2 = "2024-12-31"

dateRange1 = pd.date_range(start=startDate1,end=endDate1)
dateRange2 = pd.date_range(start=startDate2,end=endDate2)

saveFile = "filtered_en_wiki_data.csv"  # not used in this cell; kept in case other cells read it
ourEntries = []

for dateRange in [dateRange1, dateRange2]:

    for date in dateRange.to_list():
        date = str(date.date())

        URL = f"{baseURL}/{date}.tsv"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and a failed download is reported and skipped the same way a
        # non-200 response is, instead of discarding every day fetched so far.
        try:
            response = requests.get(URL,
                                    headers={"User-agent":"Wikipedia Bot, student project"},
                                    timeout=60)
        except requests.RequestException as err:
            print(f"Error: {err} for date={date}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for date={date}")
            continue

        data = io.StringIO(response.text)

        for entry in data:

            values = entry.strip("\n").split('\t')

            # Blank or truncated lines have no project field to test;
            # skip them rather than raising IndexError on values[2].
            if len(values) < 3:
                continue

            # Only English Wikipedia rows for this country
            if values[2].strip() == 'en.wikipedia' and values[0].strip() == 'India':
                values = [date] + values
                ourEntries.append(values)


# The date is prepended to each kept row, so it is the first column.
columns = ["date", "country", "country_code", "project", "page_id", "article", "qid", "views"]
df = pd.DataFrame(ourEntries, columns=columns)

# Unparseable view counts become 0 so the column can be summed as integers.
df["views"] = pd.to_numeric(df["views"], errors="coerce").fillna(0).astype(int)
df["date"] = pd.to_datetime(df["date"])

# Total views per article over the whole period.
info = df.groupby("article", as_index=False)["views"].sum()

top_articles = info.sort_values("views", ascending=False).head(10000)["article"]

# Keep every daily row that belongs to one of the top articles.
filtered_df = df[df["article"].isin(top_articles)]

filtered_df.to_csv("top_india.csv", index=False)
filtered_df.head()

Unnamed: 0,date,country,country_code,project,page_id,article,qid,views
4,2023-02-06,India,IN,en.wikipedia,11353578,Priyanshu_Chatterjee,Q7246522,129
9,2023-02-06,India,IN,en.wikipedia,1458025,Satavahana_dynasty,Q5257,566
13,2023-02-06,India,IN,en.wikipedia,15719907,Sarath_Babu,Q3595531,304
15,2023-02-06,India,IN,en.wikipedia,16709162,Tencent,Q860580,261
18,2023-02-06,India,IN,en.wikipedia,17384301,Liver,Q9368,248


### Canada

In [2]:
# Script to read DBPD files, and get lines that fit a criteria
import pandas as pd
import requests, json, datetime, time, io, csv


# Helper function
def write_to_file(date, entries, saveFile):
    """Append one tab-separated record per entry to ``saveFile``.

    Each record is written on its own line as ``date<TAB>line<TAB>count``,
    i.e. the date is added in front of every entry.

    Args:
        date: Value written as the first column of every record.
        entries: Iterable of ``(line, count)`` pairs.
        saveFile: Path of the file to append to. It is opened in append
            mode with UTF-8 encoding.
    """
    with open(saveFile, 'a', encoding="utf-8") as outf:
        for line, count in entries:
            # Drop any line ending the entry already carries, then add
            # exactly one, so consecutive records never run together on a
            # single line and never produce blank lines.
            record = f"{date}\t{line}\t{count}".rstrip("\r\n")
            outf.write(record + "\n")

# Download one TSV per day, keep the en.wikipedia rows for Canada, then
# save the rows belonging to the 10,000 articles with the most total views.
baseURL = "https://analytics.wikimedia.org/published/datasets/country_project_page"
startDate1 = "2023-02-06"
endDate1 = "2023-12-31"
startDate2 = "2024-01-01"
endDate2 = "2024-12-31"

dateRange1 = pd.date_range(start=startDate1,end=endDate1)
dateRange2 = pd.date_range(start=startDate2,end=endDate2)

saveFile = "filtered_en_wiki_data.csv"  # not used in this cell; kept in case other cells read it
ourEntries = []

for dateRange in [dateRange1, dateRange2]:

    for date in dateRange.to_list():
        date = str(date.date())

        URL = f"{baseURL}/{date}.tsv"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and a failed download is reported and skipped the same way a
        # non-200 response is, instead of discarding every day fetched so far.
        try:
            response = requests.get(URL,
                                    headers={"User-agent":"Wikipedia Bot, student project"},
                                    timeout=60)
        except requests.RequestException as err:
            print(f"Error: {err} for date={date}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for date={date}")
            continue

        data = io.StringIO(response.text)

        for entry in data:

            values = entry.strip("\n").split('\t')

            # Blank or truncated lines have no project field to test;
            # skip them rather than raising IndexError on values[2].
            if len(values) < 3:
                continue

            # Only English Wikipedia rows for this country
            if values[2].strip() == 'en.wikipedia' and values[0].strip() == 'Canada':
                values = [date] + values
                ourEntries.append(values)


# The date is prepended to each kept row, so it is the first column.
columns = ["date", "country", "country_code", "project", "page_id", "article", "qid", "views"]
df = pd.DataFrame(ourEntries, columns=columns)

# Unparseable view counts become 0 so the column can be summed as integers.
df["views"] = pd.to_numeric(df["views"], errors="coerce").fillna(0).astype(int)
df["date"] = pd.to_datetime(df["date"])

# Total views per article over the whole period.
info = df.groupby("article", as_index=False)["views"].sum()

top_articles = info.sort_values("views", ascending=False).head(10000)["article"]

# Keep every daily row that belongs to one of the top articles.
filtered_df = df[df["article"].isin(top_articles)]

filtered_df.to_csv("top_canada.csv", index=False)
filtered_df.head()

Unnamed: 0,date,country,country_code,project,page_id,article,qid,views
3,2023-02-06,Canada,CA,en.wikipedia,16422,Joni_Mitchell,Q205721,174
4,2023-02-06,Canada,CA,en.wikipedia,170388,Ash_Wednesday,Q123542,102
7,2023-02-06,Canada,CA,en.wikipedia,18717177,Shrek,Q483815,138
8,2023-02-06,Canada,CA,en.wikipedia,1923870,The_Pirate_Bay,Q22663,434
9,2023-02-06,Canada,CA,en.wikipedia,2099821,Dallas_Green_(musician),Q1094052,151


### United States

In [3]:
# Script to read DBPD files, and get lines that fit a criteria
import pandas as pd
import requests, json, datetime, time, io, csv


# Helper function
def write_to_file(date, entries, saveFile):
    """Append one tab-separated record per entry to ``saveFile``.

    Each record is written on its own line as ``date<TAB>line<TAB>count``,
    i.e. the date is added in front of every entry.

    Args:
        date: Value written as the first column of every record.
        entries: Iterable of ``(line, count)`` pairs.
        saveFile: Path of the file to append to. It is opened in append
            mode with UTF-8 encoding.
    """
    with open(saveFile, 'a', encoding="utf-8") as outf:
        for line, count in entries:
            # Drop any line ending the entry already carries, then add
            # exactly one, so consecutive records never run together on a
            # single line and never produce blank lines.
            record = f"{date}\t{line}\t{count}".rstrip("\r\n")
            outf.write(record + "\n")

# Download one TSV per day, keep the en.wikipedia rows for the United States,
# then save the rows belonging to the 10,000 articles with the most total views.
baseURL = "https://analytics.wikimedia.org/published/datasets/country_project_page"
startDate1 = "2023-02-06"
endDate1 = "2023-12-31"
startDate2 = "2024-01-01"
endDate2 = "2024-12-31"

dateRange1 = pd.date_range(start=startDate1,end=endDate1)
dateRange2 = pd.date_range(start=startDate2,end=endDate2)

saveFile = "filtered_en_wiki_data.csv"  # not used in this cell; kept in case other cells read it
ourEntries = []

for dateRange in [dateRange1, dateRange2]:

    for date in dateRange.to_list():
        date = str(date.date())

        URL = f"{baseURL}/{date}.tsv"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and a failed download is reported and skipped the same way a
        # non-200 response is, instead of discarding every day fetched so far.
        try:
            response = requests.get(URL,
                                    headers={"User-agent":"Wikipedia Bot, student project"},
                                    timeout=60)
        except requests.RequestException as err:
            print(f"Error: {err} for date={date}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for date={date}")
            continue

        data = io.StringIO(response.text)

        for entry in data:

            values = entry.strip("\n").split('\t')

            # Blank or truncated lines have no project field to test;
            # skip them rather than raising IndexError on values[2].
            if len(values) < 3:
                continue

            # Only English Wikipedia rows for this country
            if values[2].strip() == 'en.wikipedia' and values[0].strip() == 'United States':
                values = [date] + values
                ourEntries.append(values)


# The date is prepended to each kept row, so it is the first column.
columns = ["date", "country", "country_code", "project", "page_id", "article", "qid", "views"]
df = pd.DataFrame(ourEntries, columns=columns)

# Unparseable view counts become 0 so the column can be summed as integers.
df["views"] = pd.to_numeric(df["views"], errors="coerce").fillna(0).astype(int)
df["date"] = pd.to_datetime(df["date"])

# Total views per article over the whole period.
info = df.groupby("article", as_index=False)["views"].sum()

top_articles = info.sort_values("views", ascending=False).head(10000)["article"]

# Keep every daily row that belongs to one of the top articles.
filtered_df = df[df["article"].isin(top_articles)]

filtered_df.to_csv("top_united_states.csv", index=False)
filtered_df.head()

Unnamed: 0,date,country,country_code,project,page_id,article,qid,views
7,2023-02-06,United States,US,en.wikipedia,11061916,Calvin_Johnson,Q857634,1449
11,2023-02-06,United States,US,en.wikipedia,1224736,John_Drew_Barrymore,Q962932,1263
12,2023-02-06,United States,US,en.wikipedia,127062,Staten_Island,Q18432,1313
16,2023-02-06,United States,US,en.wikipedia,13815,Heracles,Q122248,1655
24,2023-02-06,United States,US,en.wikipedia,159362,John_F._Kennedy_Jr.,Q316064,3003


### United Kingdom

In [4]:
# Script to read DBPD files, and get lines that fit a criteria
import pandas as pd
import requests, json, datetime, time, io, csv


# Helper function
def write_to_file(date, entries, saveFile):
    """Append one tab-separated record per entry to ``saveFile``.

    Each record is written on its own line as ``date<TAB>line<TAB>count``,
    i.e. the date is added in front of every entry.

    Args:
        date: Value written as the first column of every record.
        entries: Iterable of ``(line, count)`` pairs.
        saveFile: Path of the file to append to. It is opened in append
            mode with UTF-8 encoding.
    """
    with open(saveFile, 'a', encoding="utf-8") as outf:
        for line, count in entries:
            # Drop any line ending the entry already carries, then add
            # exactly one, so consecutive records never run together on a
            # single line and never produce blank lines.
            record = f"{date}\t{line}\t{count}".rstrip("\r\n")
            outf.write(record + "\n")

# Download one TSV per day, keep the en.wikipedia rows for the United Kingdom,
# then save the rows belonging to the 10,000 articles with the most total views.
baseURL = "https://analytics.wikimedia.org/published/datasets/country_project_page"
startDate1 = "2023-02-06"
endDate1 = "2023-12-31"
startDate2 = "2024-01-01"
endDate2 = "2024-12-31"

dateRange1 = pd.date_range(start=startDate1,end=endDate1)
dateRange2 = pd.date_range(start=startDate2,end=endDate2)

saveFile = "filtered_en_wiki_data.csv"  # not used in this cell; kept in case other cells read it
ourEntries = []

for dateRange in [dateRange1, dateRange2]:

    for date in dateRange.to_list():
        date = str(date.date())

        URL = f"{baseURL}/{date}.tsv"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and a failed download is reported and skipped the same way a
        # non-200 response is, instead of discarding every day fetched so far.
        try:
            response = requests.get(URL,
                                    headers={"User-agent":"Wikipedia Bot, student project"},
                                    timeout=60)
        except requests.RequestException as err:
            print(f"Error: {err} for date={date}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for date={date}")
            continue

        data = io.StringIO(response.text)

        for entry in data:

            values = entry.strip("\n").split('\t')

            # Blank or truncated lines have no project field to test;
            # skip them rather than raising IndexError on values[2].
            if len(values) < 3:
                continue

            # Only English Wikipedia rows for this country
            if values[2].strip() == 'en.wikipedia' and values[0].strip() == 'United Kingdom':
                values = [date] + values
                ourEntries.append(values)


# The date is prepended to each kept row, so it is the first column.
columns = ["date", "country", "country_code", "project", "page_id", "article", "qid", "views"]
df = pd.DataFrame(ourEntries, columns=columns)

# Unparseable view counts become 0 so the column can be summed as integers.
df["views"] = pd.to_numeric(df["views"], errors="coerce").fillna(0).astype(int)
df["date"] = pd.to_datetime(df["date"])

# Total views per article over the whole period.
info = df.groupby("article", as_index=False)["views"].sum()

top_articles = info.sort_values("views", ascending=False).head(10000)["article"]

# Keep every daily row that belongs to one of the top articles.
filtered_df = df[df["article"].isin(top_articles)]

filtered_df.to_csv("top_uk.csv", index=False)
filtered_df.head()

Unnamed: 0,date,country,country_code,project,page_id,article,qid,views
0,2023-02-06,United Kingdom,GB,en.wikipedia,102994,Bill_Murray,Q29250,310
1,2023-02-06,United Kingdom,GB,en.wikipedia,10577056,Ashes_to_Ashes_(British_TV_series),Q725195,197
3,2023-02-06,United Kingdom,GB,en.wikipedia,11500785,Red_Dead_Redemption,Q548203,159
7,2023-02-06,United Kingdom,GB,en.wikipedia,12577850,Victor_Moses,Q295637,170
12,2023-02-06,United Kingdom,GB,en.wikipedia,149612,Sertraline,Q407617,482
