In [11]:
import os
from urllib.request import urlretrieve

import pandas as pd

# Source CSV. The URL embeds a revision hash, so the file it points to is not
# expected to change between runs.
italy_covid_url = 'https://gist.githubusercontent.com/aakashns/f6a004fa20c84fec53262f9a8bfee775/raw/f309558b1cf5103424cef58e2ecb8704dcd4d74c/italy-covid-daywise.csv'
# Local file name, defined once so the download and the read cannot drift apart.
italy_covid_csv = 'italy-covid-daywise.csv'

# Download only when the local copy is missing, so re-running the notebook does
# not need the network again. Delete the file to force a fresh download.
if not os.path.exists(italy_covid_csv):
    urlretrieve(italy_covid_url, italy_covid_csv)

# pd.read_csv - Read data from a CSV file into a Pandas DataFrame object
covid_df = pd.read_csv(italy_covid_csv)
print(type(covid_df))
print(covid_df)

<class 'pandas.core.frame.DataFrame'>
           date  new_cases  new_deaths  new_tests
0    2019-12-31        0.0         0.0        NaN
1    2020-01-01        0.0         0.0        NaN
2    2020-01-02        0.0         0.0        NaN
3    2020-01-03        0.0         0.0        NaN
4    2020-01-04        0.0         0.0        NaN
..          ...        ...         ...        ...
243  2020-08-30     1444.0         1.0    53541.0
244  2020-08-31     1365.0         4.0    42583.0
245  2020-09-01      996.0         6.0    54395.0
246  2020-09-02      975.0         8.0        NaN
247  2020-09-03     1326.0         6.0        NaN

[248 rows x 4 columns]


In [12]:
# .info() - View basic information about rows, columns & data types.
# In the summary printed below, `date` is stored with dtype object and
# `new_tests` has only 135 non-null values out of 248 rows.
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float64
 2   new_deaths  248 non-null    float64
 3   new_tests   135 non-null    float64
dtypes: float64(3), object(1)
memory usage: 7.9+ KB


In [13]:
# .describe() - View summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns; the `date` column does not appear in the result.
# Note the negative minimums for new_cases and new_deaths in the table below --
# presumably corrections to earlier reports; verify before aggregating.
covid_df.describe()

Unnamed: 0,new_cases,new_deaths,new_tests
count,248.0,248.0,135.0
mean,1094.818548,143.133065,31699.674074
std,1554.508002,227.105538,11622.209757
min,-148.0,-31.0,7841.0
25%,123.0,3.0,25259.0
50%,342.0,17.0,29545.0
75%,1371.75,175.25,37711.0
max,6557.0,971.0,95273.0


In [14]:
# .columns - Get the column names (returned as a pandas Index, not a plain list)
covid_df.columns

Index(['date', 'new_cases', 'new_deaths', 'new_tests'], dtype='object')

In [17]:
# .shape - Get the dimensions as a (number of rows, number of columns) tuple
covid_df.shape

(248, 4)

In [29]:
# New Chapter

# Retrieving data from a data frame

'''
The first thing you might want to do is retrieve data from this data frame, e.g., the counts of a specific day or 
the list of values in a particular column. To do this, it might help to understand the internal representation of 
data in a data frame. Conceptually, you can think of a dataframe as a dictionary of lists: keys are column names,
and values are lists/arrays containing data for the respective columns.

Representing data in the below format has a few benefits:

    All values in a column typically have the same type of value, so it's more efficient to store them in a single array.
    Retrieving the values for a particular row simply requires extracting the elements at a given index from each column 
    array.
    The representation is more compact (column names are recorded only once) compared to other formats that use a 
    dictionary for each row of data (see the example below).

'''
# A DataFrame's layout is similar to this dictionary of lists: one key per
# column, one list of values per key. It holds the last five rows of the data.
covid_data_dict = {
    'date':       ['2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03'],
    'new_cases':  [1444, 1365, 996, 975, 1326],
    'new_deaths': [1, 4, 6, 8, 6],
    'new_tests': [53541, 42583, 54395, None, None]
}
# Plain dict/list access: the first date, then the whole list of case counts.
print(covid_data_dict['date'][0])
print(covid_data_dict['new_cases'])
# The same [] notation on the DataFrame returns the column as a Series.
# The dict holds ints while the DataFrame column is float64 (975 vs 975.0).
print(covid_df['new_cases'])
print(type(covid_df['new_cases'])) # <class 'pandas.core.series.Series'>
print(type(covid_data_dict['new_cases'])) # <class 'list'>
# NOTE(review): the next line repeats the print a few lines above, so the
# Series appears twice in the output.
print(covid_df['new_cases'])
# Indexing the Series with a row label returns a single value.
print(covid_df['new_cases'][246]) # 975.0

2020-08-30
[1444, 1365, 996, 975, 1326]
0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64
<class 'pandas.core.series.Series'>
<class 'list'>
0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64
975.0


In [32]:
# Pandas also provides the .at indexer to retrieve the single value at a specific
# row label & column name directly: covid_df.at[row_label, column_name].

covid_df.at[243, "new_cases"]

1444.0

In [33]:
# Instead of using the indexing notation [], Pandas also allows accessing columns as attributes of the dataframe using
# the . notation. However, this only works for columns whose names do not contain spaces or special characters.
# The result shown below is the same Series that covid_df['new_cases'] returns.

covid_df.new_cases

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64

In [39]:
# We can select several columns at once by passing a list of column names.
casesDeath = covid_df[["new_cases", "new_deaths"]]
# NOTE: do not rely on this selection being a view of covid_df. pandas does not
# guarantee that edits made through it reach the original frame, so take an
# explicit .copy() whenever an independent frame is needed.

# .copy() - Create a full, independent copy of the data frame
covid_df_copy = covid_df.copy()
print(covid_df_copy)
# Explicit .copy() so this frame is independent of covid_df, as its name promises.
casesDeathCopy = covid_df[["new_cases", "new_deaths"]].copy()
print(casesDeathCopy)

           date  new_cases  new_deaths  new_tests
0    2019-12-31        0.0         0.0        NaN
1    2020-01-01        0.0         0.0        NaN
2    2020-01-02        0.0         0.0        NaN
3    2020-01-03        0.0         0.0        NaN
4    2020-01-04        0.0         0.0        NaN
..          ...        ...         ...        ...
243  2020-08-30     1444.0         1.0    53541.0
244  2020-08-31     1365.0         4.0    42583.0
245  2020-09-01      996.0         6.0    54395.0
246  2020-09-02      975.0         8.0        NaN
247  2020-09-03     1326.0         6.0        NaN

[248 rows x 4 columns]
     new_cases  new_deaths
0          0.0         0.0
1          0.0         0.0
2          0.0         0.0
3          0.0         0.0
4          0.0         0.0
..         ...         ...
243     1444.0         1.0
244     1365.0         4.0
245      996.0         6.0
246      975.0         8.0
247     1326.0         6.0

[248 rows x 2 columns]


In [40]:
# To access a specific row of data by its index label, Pandas provides the .loc indexer.
# The row comes back as a Series whose entries are labelled by column name.
covid_df.loc[244]

date          2020-08-31
new_cases         1365.0
new_deaths           4.0
new_tests        42583.0
Name: 244, dtype: object

In [41]:
# Confirm that a single row selected with .loc is a pandas Series
type(covid_df.loc[244])

pandas.core.series.Series

In [42]:
# We can use the .head and .tail methods to view the first or last few rows of data.
# Here .head(5) returns the first 5 rows.

covid_df.head(5)


Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,


In [45]:
covid_df.tail()
# With no argument, .tail() returns the last 5 rows

Unnamed: 0,date,new_cases,new_deaths,new_tests
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,
247,2020-09-03,1326.0,6.0,


In [50]:
# DataFrame.loc with a slice of index labels. Unlike ordinary Python slicing,
# the end label is included, so 240:245 returns six rows (240 through 245).
covid_df.loc[240:245]


Unnamed: 0,date,new_cases,new_deaths,new_tests
240,2020-08-27,1366.0,13.0,57640.0
241,2020-08-28,1409.0,5.0,65135.0
242,2020-08-29,1460.0,9.0,64294.0
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0


In [60]:
# .sample()
# We can use the .sample method to retrieve a random sample of rows from the data frame.

covid_df.sample() # with no argument, returns a single randomly chosen row


Unnamed: 0,date,new_cases,new_deaths,new_tests
206,2020-07-24,306.0,10.0,28970.0


In [61]:
covid_df.sample(5)
# Notice that even though the 5 rows are picked at random, each row keeps its original index label - this is a useful
# property of data frames.


Unnamed: 0,date,new_cases,new_deaths,new_tests
99,2020-04-08,3039.0,604.0,
125,2020-05-04,1389.0,174.0,22999.0
194,2020-07-12,188.0,7.0,23061.0
235,2020-08-22,947.0,9.0,46613.0
240,2020-08-27,1366.0,13.0,57640.0


In [65]:
# Here's a summary of the functions & methods we looked at in this section:

print(covid_df['new_cases']) # Retrieving a column as a Series using the column name
print(covid_df.new_cases[243]) # Retrieving a value from a Series using its index label

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
243    1444.0
244    1365.0
245     996.0
246     975.0
247    1326.0
Name: new_cases, Length: 248, dtype: float64
1444.0


In [None]:
# covid_df.at[243, 'new_cases'] - Retrieving a single value from a data frame
# covid_df.copy() - Creating a deep copy of a data frame
# covid_df.loc[243] - Retrieving a row or range of rows of data from the data frame
# head, tail, and sample - Retrieving multiple rows of data from the data frame
# covid_df.new_tests.first_valid_index() - Finding the index of the first non-missing value in a series