In [1]:
import pandas as pd
import json

# Directory holding the daily report CSVs. The trailing slash matters:
# create_dateframe builds file paths as f"{PATH}{filename}".
PATH = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/"

# Lookup table read by country_name_convert: a country name found as a key is
# replaced by the mapped value. "utf-8-sig" drops a leading byte-order mark if
# the file starts with one.
with open("COVID-19-master/csse_covid_19_data/country_convert.json", "r", encoding="utf-8-sig") as json_file :
    json_data = json.load(json_file)

def country_name_convert(row):
    """Return the replacement name for the row's ``Country_Region`` value.

    The value is looked up in the module-level ``json_data`` mapping. A value
    without an entry is returned unchanged.
    """
    name = row["Country_Region"]
    return json_data.get(name, name)

def create_dateframe(filename):
    """Load one daily report and return confirmed cases summed per country.

    Parameters
    ----------
    filename : str
        Name of a report file inside the module-level ``PATH`` directory,
        e.g. ``"04-01-2020.csv"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Country_Region`` with a single ``int64`` column named
        after the file stem, with leading zeros stripped and ``-`` replaced by
        ``/`` (``"04-01-2020.csv"`` -> ``"4/01/2020"``).

    Raises
    ------
    KeyError
        If the file has no ``Confirmed`` column, or neither a
        ``Country_Region`` nor a ``Country/Region`` column.
    """
    doc = pd.read_csv(f"{PATH}{filename}", encoding="utf-8-sig")

    # Some reports label the country column "Country/Region". Normalise the
    # header explicitly rather than via a bare ``except``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if "Country_Region" not in doc.columns:
        doc = doc.rename(columns={"Country/Region": "Country_Region"})

    # Selecting the two columns raises KeyError when either one is missing.
    doc = doc[["Country_Region", "Confirmed"]].dropna(subset=["Confirmed"])

    # assign() builds a new frame, so no column is written into a frame that
    # was derived from another one.
    doc = doc.assign(Country_Region=doc.apply(country_name_convert, axis=1))
    doc = doc.astype({"Confirmed": "int64"})

    # Several rows can share a country name; add them up per country.
    doc = doc.groupby("Country_Region").sum()

    date_column = filename.split(".")[0].lstrip("0").replace("-", "/")
    doc.columns = [date_column]
    return doc

In [2]:
# Load two individual daily reports to inspect what create_dateframe returns.
doc1 = create_dateframe("01-22-2020.csv")
doc2 = create_dateframe("04-01-2020.csv")

In [3]:
# Index range, column name and dtype of the 04-01-2020 frame.
doc2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190 entries, Afghanistan to Zimbabwe
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   4/01/2020  190 non-null    int64
dtypes: int64(1)
memory usage: 3.0+ KB


In [4]:
# First rows of the 04-01-2020 frame.
doc2.head()

Unnamed: 0_level_0,4/01/2020
Country_Region,Unnamed: 1_level_1
Afghanistan,192
Albania,259
Algeria,847
Andorra,390
Angola,8


In [5]:
# Index range, column name and dtype of the 01-22-2020 frame.
doc1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19 entries, Antarctica to Winter Olympics 2022
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1/22/2020  19 non-null     int64
dtypes: int64(1)
memory usage: 304.0+ bytes


In [6]:
# First rows of the 01-22-2020 frame.
doc1.head()

Unnamed: 0_level_0,1/22/2020
Country_Region,Unnamed: 1_level_1
Antarctica,0
China,548
Japan,2
Kiribati,0
"Korea, North",0


In [7]:
# Outer-join the two frames on their country index: a country present in only
# one report keeps its row and gets NaN in the other report's column.
doc = doc1.merge(doc2, how="outer", left_index=True, right_index=True)
doc.head()

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,,192
Albania,,259
Algeria,,847
Andorra,,390
Angola,,8


In [8]:
# Treat "country absent from that report" as zero confirmed cases. A column
# that held NaN stays float after filling (shown as 0.0 in the output).
doc = doc.fillna(0)
doc

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.0,192
Albania,0.0,259
Algeria,0.0,847
Andorra,0.0,390
Angola,0.0,8
...,...,...
Vietnam,0.0,218
West Bank and Gaza,0.0,134
Winter Olympics 2022,0.0,0
Zambia,0.0,36


In [9]:
import os

PATH = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/"

# Keep only the entries whose last dot-separated part is "csv".
# os.listdir returns names in arbitrary order, so the list is not yet
# chronological at this point.
file_list = os.listdir(PATH)
csv_list = [file for file in file_list if file.split(".")[-1] == "csv"]

print(csv_list)

['01-01-2021.csv', '01-01-2022.csv', '01-01-2023.csv', '01-02-2021.csv', '01-02-2022.csv', '01-02-2023.csv', '01-03-2021.csv', '01-03-2022.csv', '01-03-2023.csv', '01-04-2021.csv', '01-04-2022.csv', '01-04-2023.csv', '01-05-2021.csv', '01-05-2022.csv', '01-05-2023.csv', '01-06-2021.csv', '01-06-2022.csv', '01-06-2023.csv', '01-07-2021.csv', '01-07-2022.csv', '01-07-2023.csv', '01-08-2021.csv', '01-08-2022.csv', '01-08-2023.csv', '01-09-2021.csv', '01-09-2022.csv', '01-09-2023.csv', '01-10-2021.csv', '01-10-2022.csv', '01-10-2023.csv', '01-11-2021.csv', '01-11-2022.csv', '01-11-2023.csv', '01-12-2021.csv', '01-12-2022.csv', '01-12-2023.csv', '01-13-2021.csv', '01-13-2022.csv', '01-13-2023.csv', '01-14-2021.csv', '01-14-2022.csv', '01-14-2023.csv', '01-15-2021.csv', '01-15-2022.csv', '01-15-2023.csv', '01-16-2021.csv', '01-16-2022.csv', '01-16-2023.csv', '01-17-2021.csv', '01-17-2022.csv', '01-17-2023.csv', '01-18-2021.csv', '01-18-2022.csv', '01-18-2023.csv', '01-19-2021.csv', '01-19-20

In [10]:
from datetime import datetime

# We need something that can implement a piece of behaviour!
# behaviour => function
# def iam() :
# We want to use it simply, in the form of an "expression" => lambda expression => anonymous function



# Sort by the date parsed from each file name (month-day-year), not by the raw
# string, so the reports end up in chronological order.
csv_list.sort(key = lambda x : datetime.strptime(x, "%m-%d-%Y.csv"))
csv_list

['01-22-2020.csv',
 '01-23-2020.csv',
 '01-24-2020.csv',
 '01-25-2020.csv',
 '01-26-2020.csv',
 '01-27-2020.csv',
 '01-28-2020.csv',
 '01-29-2020.csv',
 '01-30-2020.csv',
 '01-31-2020.csv',
 '02-01-2020.csv',
 '02-02-2020.csv',
 '02-03-2020.csv',
 '02-04-2020.csv',
 '02-05-2020.csv',
 '02-06-2020.csv',
 '02-07-2020.csv',
 '02-08-2020.csv',
 '02-09-2020.csv',
 '02-10-2020.csv',
 '02-11-2020.csv',
 '02-12-2020.csv',
 '02-13-2020.csv',
 '02-14-2020.csv',
 '02-15-2020.csv',
 '02-16-2020.csv',
 '02-17-2020.csv',
 '02-18-2020.csv',
 '02-19-2020.csv',
 '02-20-2020.csv',
 '02-21-2020.csv',
 '02-22-2020.csv',
 '02-23-2020.csv',
 '02-24-2020.csv',
 '02-25-2020.csv',
 '02-26-2020.csv',
 '02-27-2020.csv',
 '02-28-2020.csv',
 '02-29-2020.csv',
 '03-01-2020.csv',
 '03-02-2020.csv',
 '03-03-2020.csv',
 '03-04-2020.csv',
 '03-05-2020.csv',
 '03-06-2020.csv',
 '03-07-2020.csv',
 '03-08-2020.csv',
 '03-09-2020.csv',
 '03-10-2020.csv',
 '03-11-2020.csv',
 '03-12-2020.csv',
 '03-13-2020.csv',
 '03-14-2020

In [13]:
import os
import pandas as pd
from datetime import datetime

def generate_dateframe_by_path(PATH):
    """Combine every daily report in a directory into one country x date table.

    Parameters
    ----------
    PATH : str
        Directory to scan for ``MM-DD-YYYY.csv`` report files.
        NOTE: ``create_dateframe`` receives only the file name and resolves it
        against the module-level ``PATH``, so this argument must name the same
        directory as that global.

    Returns
    -------
    pandas.DataFrame
        One row per country (index sorted), one column per report in
        chronological order. A country missing from a report gets 0.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` file, or a ``.csv`` file whose
        name does not match ``MM-DD-YYYY.csv``.
    """
    # Order by the parsed date; sorting the raw names would group by month
    # across years.
    csv_list = sorted(
        (name for name in os.listdir(PATH) if name.split(".")[-1] == "csv"),
        key=lambda name: datetime.strptime(name, "%m-%d-%Y.csv"),
    )
    if not csv_list:
        raise ValueError(f"no CSV daily reports found in {PATH!r}")

    frames = [create_dateframe(name) for name in csv_list]

    # One column-wise concat aligns all frames on the union of their indexes,
    # instead of re-merging (and re-copying) the growing table once per file.
    # sort_index() reproduces the sorted row order of an outer merge.
    final_doc = pd.concat(frames, axis=1).sort_index()
    return final_doc.fillna(0)

In [14]:
# Build the combined table from every daily report in the directory.
PATH = "COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/"
doc = generate_dateframe_by_path(PATH)

doc

Unnamed: 0_level_0,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,1/28/2020,1/29/2020,1/30/2020,1/31/2020,...,2/28/2023,3/01/2023,3/02/2023,3/03/2023,3/04/2023,3/05/2023,3/06/2023,3/07/2023,3/08/2023,3/09/2023
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,209322.0,209340.0,209358.0,209362.0,209369.0,209390.0,209406.0,209436.0,209451.0,209451.0
Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,334391.0,334408.0,334408.0,334427.0,334427.0,334427.0,334427.0,334427.0,334443.0,334457.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,271441.0,271448.0,271463.0,271469.0,271469.0,271477.0,271477.0,271490.0,271494.0,271496.0
Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,47866.0,47875.0,47875.0,47875.0,47875.0,47875.0,47875.0,47875.0,47890.0,47890.0
Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,105255.0,105277.0,105277.0,105277.0,105277.0,105277.0,105277.0,105277.0,105288.0,105288.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,703228.0,703228.0,703228.0,703228.0,703228.0,703228.0,703228.0,703228.0,703228.0,703228.0
Winter Olympics 2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0,535.0
Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11945.0,11945.0,11945.0,11945.0,11945.0,11945.0,11945.0,11945.0,11945.0,11945.0
Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,343012.0,343012.0,343079.0,343079.0,343079.0,343135.0,343135.0,343135.0,343135.0,343135.0


In [16]:
# Persist the combined table; the country index is written as the first column.
doc.to_csv("COVID-19-master/students_default.csv")