In [4]:
import numpy as np
import pandas as pd

# Show which pandas version produced the outputs recorded in this notebook.
pd.__version__

'1.2.4'

In [44]:
"""The Pandas Series Object"""
# A Series pairs a one-dimensional array of values with an index.
data = pd.Series([0.25, 0.5, 0.75, 1.0])

# Show the container type, the Series itself, its index and its raw values.
print(type(data), data, data.index, data.values, sep="\n")

# Single-element lookup with plain brackets.
print("\n>> data[1] =", data[1], end="\n\n")
# Integer slice: the end position 3 is excluded, so two entries come back.
print(data[1:3])

<class 'pandas.core.series.Series'>
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
RangeIndex(start=0, stop=4, step=1)
[0.25 0.5  0.75 1.  ]

>> data[1] = 0.5

1    0.50
2    0.75
dtype: float64


In [52]:
# Explicit indexing: the index is supplied by hand and may mix label types.
index_labels = [2, 'b', 'c', 'd']
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=index_labels)

# Full Series first, then a label slice (the end label 'd' is included).
print(data, data['b':'d'], sep="\n\n", end="\n\n")
# Scalar lookup by label.
print(data['b'])

2    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

b    0.50
c    0.75
d    1.00
dtype: float64

0.5


In [49]:
# A pandas Series can be created from a Python dict (or a list); with a dict,
# the keys become the index labels and the values become the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

population = pd.Series(population_dict)

# Whole Series, then a single label lookup, each followed by a blank line.
print(population, population['California'], sep="\n\n", end="\n\n")
# Label slice: the end label 'New York' is part of the result.
print(population['California':'New York'])

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

38332521

California    38332521
Texas         26448193
New York      19651127
dtype: int64


In [58]:
"""The Pandas DataFrame Object"""
# Area figure per state. The 'Illinois' entry is left commented out, so this
# Series holds four states only.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    # 'Illinois': 149995,
}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
dtype: int64

In [59]:
# Build a DataFrame with one column per Series. Rows are aligned on the index
# labels of both Series; a label missing from one Series shows up as NaN.
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967.0
Florida,19552860,170312.0
Illinois,12882135,
New York,19651127,141297.0
Texas,26448193,695662.0


In [56]:
# Row labels first, column labels second, one per line.
print(states.index, states.columns, sep="\n")

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
Index(['population', 'area'], dtype='object')


In [60]:
# Dictionary-style access: selecting a column by name returns it as a Series.
states['area']

California    423967.0
Florida       170312.0
Illinois           NaN
New York      141297.0
Texas         695662.0
Name: area, dtype: float64

In [64]:
"""Data Selection in Series"""
# A Series can be treated like a one-dimensional array.
# Display the current Series before selecting from it by explicit index.
data

2    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [66]:
# Label slice 'b':'d' keeps the end label; integer slice 1:3 works on
# positions and leaves the end position out.
print(data['b':'d'], data[1:3], sep="\n\n")

b    0.50
c    0.75
d    1.00
dtype: float64

b    0.50
c    0.75
dtype: float64


In [67]:
# masking
# masking: a boolean Series picks the entries strictly between 0.3 and 0.8
in_range = (data > 0.3) & (data < 0.8)
data[in_range]

b    0.50
c    0.75
dtype: float64

In [70]:
# fancy indexing
# fancy indexing: a list of labels returns the matching entries as a Series
data[['b', 'c']]

b    0.50
c    0.75
dtype: float64

In [73]:
"""Indexers: loc, iloc, and ix"""
# NOTE(review): the cells that follow only exercise .loc and .iloc; .ix is never used.

# Integer labels that differ from the positions 0, 1, 2, so label-based and
# position-based lookups give different answers.
data = pd.Series(list('abc'), index=[1, 3, 5])
print(data)

1    a
3    b
5    c
dtype: object


In [84]:
# Plain brackets with an integer look up the label 1, exactly like .loc[1];
# .iloc[1] takes the second position instead.
print(data[1], data.loc[1], data.iloc[1], sep="\n")

a
a
b


In [85]:
# Plain brackets with an integer slice work on positions, like .iloc[1:3];
# .loc[1:3] slices by label and keeps the end label 3.
print(data[1:3], data.loc[1:3], data.iloc[1:3], sep="\n\n")

3    b
5    c
dtype: object

1    a
3    b
dtype: object

3    b
5    c
dtype: object


In [86]:
"""Data Selection in DataFrame"""
# DataFrame as a dictionary
# Both Series carry the same five state labels, so the frame built from them
# has no missing values.
state_labels = ['California', 'Texas', 'New York', 'Florida', 'Illinois']

area = pd.Series([423967, 695662, 141297, 170312, 149995], index=state_labels)
pop = pd.Series([38332521, 26448193, 19651127, 19552860, 12882135], index=state_labels)

# One column per Series, in the order given.
data = pd.DataFrame(dict(area=area, pop=pop))
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [87]:
# Dictionary-style access: look up a column by its name.
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [88]:
# Attribute-style access returns the same column as data['area'].
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [90]:
# Dictionary-style assignment adds a new column: 'pop' divided by 'area', row by row.
data['density'] = data['pop'].div(data['area'])
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [99]:
# Row slice by label: .loc keeps the end label 'New York'.
data.loc['California':'New York']

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [101]:
# Row slice by position: .iloc leaves out the end position, giving the first two rows.
data.iloc[0:2]

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874


In [102]:
# DataFrame as two-dimensional array
# DataFrame as two-dimensional array
# .to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; it returns the same NumPy array of the frame's data.
data.to_numpy()

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [104]:
# Positional selection on both axes: first three rows, first two columns.
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [106]:
 # Label-based selection on both axes: rows up to 'New York' and columns up to
 # 'pop', with both end labels included.
 data.loc[:'New York', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [110]:
 # NumPy-style fancy indexing works with .loc: a boolean row mask
 # (density above 100) combined with a list of column labels.
 data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [111]:
# masking: a boolean Series inside plain brackets filters rows and keeps every column
data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [15]:
"""COVID Data Analysis"""

import pandas as pd

# Read the CSV, discard the 'state' column, index the rows by 'date' and add
# the deaths-to-cases ratio as a new column, all in one chain.
df = (
    pd.read_csv("https://www.sololearn.com/uploads/ca-covid.csv")
    .drop(columns='state')
    .set_index('date')
    .assign(ratio=lambda frame: frame['deaths'] / frame['cases'])
)

# df.loc[df['ratio'].max()] would treat the maximum value as a row label, so
# the matching row(s) are selected with a boolean mask instead.
highest_ratio = df['ratio'].max()
largest = df.loc[df['ratio'] == highest_ratio]
print(largest)

          cases  deaths     ratio
date                             
10.03.20      7       1  0.142857


In [16]:
# Total number of elements in the frame (rows times columns, including 'ratio').
df.size

1026