# Pandas 
## Key learnings over and above what was learnt from PDA and Python 101

In [4]:
import numpy as np
import pandas as pd

## Series Data structure 
- A Series is like a dictionary, but ordered: each value is paired with an index label
- Computation is much faster than with comparable sequences such as lists
- This is because the data is stored underneath in numpy arrays

In [5]:
# Series takes in data in the form of a list, array, dict, etc.

In [6]:
pd.Series(np.array([1,2,3])) # data type is recognized as int

0    1
1    2
2    3
dtype: int64

In [9]:
x = pd.Series(['a', 'b', 'd']) # notice data type for list of strings is object

In [10]:
x

0    a
1    b
2    d
dtype: object

In [13]:
x.index # default integer index

RangeIndex(start=0, stop=3, step=1)

In [14]:
x.values

array(['a', 'b', 'd'], dtype=object)

In [15]:
y  = pd.Series(['a', 'b', None]) # None is kept as None and the values are stored with dtype object

In [16]:
y

0       a
1       b
2    None
dtype: object

In [29]:
type(y.values[2])

NoneType

In [30]:
type(None)

NoneType

In [31]:
z  = pd.Series([1, 3, None]) # Now the None is stored as NaN, which is a float value, so the overall dtype is float64

In [32]:
z

0    1.0
1    3.0
2    NaN
dtype: float64

In [33]:
np.isnan(z.values[2])

True

In [39]:
np.nan == None # NaN is not the same as None: None has its own type (NoneType), whereas NaN is a float.
# None represents the absence of a value; NaN ("not a number") marks a missing or undefined numeric value.

False

In [40]:
type(np.nan)

float

In [41]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [43]:
# If you pass an explicit index when converting a dictionary, only the entries whose keys appear in it are kept;
# index labels that are missing from the dictionary (here 'Hockey') get NaN as their value
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s

Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

# Querying a Series

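## In pandas slicing (use of [ ]) creates views, indexing (using loc and iloc) creates copies
#### Note: whether a view or a copy is returned can depend on the pandas version and the operation (Copy-on-Write changes this behaviour), so call .copy() when an independent object is required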

In [121]:
# Use of loc and iloc
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

## loc and iloc are not methods, but attributes, they use [] and not ()

In [108]:
s.iloc[[0,2,3]]

Archery           Bhutan
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [109]:
s.loc[['Golf', 'Sumo']]

Golf    Scotland
Sumo       Japan
dtype: object

## Slicing returns a view, so a change made to the slice propagates to the original (true across pandas, i.e. even for a DataFrame)

In [122]:
t = s['Archery':'Sumo']
t['Sumo'] = 'Jap'
t

Archery      Bhutan
Golf       Scotland
Sumo            Jap
dtype: object

In [123]:
s

Archery           Bhutan
Golf            Scotland
Sumo                 Jap
Taekwondo    South Korea
dtype: object

## Indexing using loc will create copy

In [125]:
t = s.loc[['Archery','Golf','Sumo']]
t['Sumo'] = 'Japan'
t

Archery      Bhutan
Golf       Scotland
Sumo          Japan
dtype: object

In [126]:
s

Archery           Bhutan
Golf            Scotland
Sumo                 Jap
Taekwondo    South Korea
dtype: object

In [49]:
# Explicitly stating the indexer (loc for labels, iloc for positions) is better
# If the index is made of integers, [] looks up labels rather than positions, so s[0] raises an error here
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)
s

99          Bhutan
100       Scotland
101          Japan
102    South Korea
dtype: object

In [51]:
#s[0] throws an error

## Using vectorized methods to compute series 
- numpy and pandas methods use vectorization to make computations fast
- Avoid using loops, where possible

In [52]:
# Demonstration of adding elements of a series
x = pd.Series(np.random.randn(1000))

In [53]:
x.head()

0    0.633458
1   -0.199338
2   -0.336314
3   -0.094835
4   -0.575042
dtype: float64

#### The %timeit magic function computes the average time taken to evaluate an expression by running it repeatedly
#### %%timeit times all the expressions in a Jupyter cell

In [56]:
%%timeit -n 100
np.sum(x)

50.3 µs ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [59]:
%%timeit -n 100
# Deliberately slow baseline for comparison with np.sum: add up the Series one element at a time in a Python loop
s = 0
for i in range(len(x)):
    s += x.iloc[i]

8.27 ms ± 282 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Series values and Index can have mixed types. No error is thrown, coercion takes place
## As data is added to series, the dtype of series changes

In [65]:
s = pd.Series([1, 2, 3])
s

0    1
1    2
2    3
dtype: int64

In [66]:
s.loc['Animal'] = 'Bears'
s

0             1
1             2
2             3
Animal    Bears
dtype: object

## Index can have duplicate values; it is not required to be unique

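#### Series can be appended. Notice that the append method on a pandas Series does not make an in-place change (it returns a new Series), whereas in Python lists it does. Series.append was removed in pandas 2.0; pd.concat is the replacement and likewise returns a new Series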

In [70]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# Every entry deliberately shares the label 'Cricket' to show that index labels may repeat
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# Series.append was removed in pandas 2.0; pd.concat is its replacement.
# It returns a new Series and leaves both inputs unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [71]:
original_sports

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [73]:
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

# Data Frame Data Structure
- The index attribute labels the rows and the columns attribute labels the columns; both are Index data structures, which implies
their labels need not be unique.
- Rows and columns are easily interchangeable using transpose

In [75]:
# One record per purchase; each record becomes a Series and then a row of the DataFrame
purchase_records = [
    {'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50},
    {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},
    {'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00},
]
purchase_1, purchase_2, purchase_3 = (pd.Series(record) for record in purchase_records)

# The row labels repeat on purpose ('Store 1' appears twice)
store_labels = ['Store 1', 'Store 1', 'Store 2']
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=store_labels)
df.head()

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


### .loc and .iloc support mixed (row and column) selection and create a copy, whereas slicing using [ ] creates a view.
### Creating only a view is fast and memory efficient, but changes made to the view propagate to the original

In [77]:
# Column selection
df.iloc[:,[0,1]]

Unnamed: 0,Cost,Item Purchased
Store 1,22.5,Dog Food
Store 1,2.5,Kitty Litter
Store 2,5.0,Bird Seed


In [80]:
df.loc[:,['Cost', 'Name']]

Unnamed: 0,Cost,Name
Store 1,22.5,Chris
Store 1,2.5,Kevyn
Store 2,5.0,Vinod


In [85]:
# Row selection
df.loc[['Store 1', 'Store 2'],:]

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [86]:
# Filtering
df.loc[df['Cost'] > 5,:]

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris


In [89]:
# Mixed selection
x = df.loc[['Store 1'],['Cost', 'Name']]
x

Unnamed: 0,Cost,Name
Store 1,22.5,Chris
Store 1,2.5,Kevyn


In [90]:
# Col., row, mixed selection all return copies
x[x['Cost'] >5] = 0
x

Unnamed: 0,Cost,Name
Store 1,0.0,0
Store 1,2.5,Kevyn


In [91]:
df

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [92]:
# Dropping a column or row, being aware of view/copy
df.drop('Store 1') # returns a copy with the 'Store 1' rows removed; df itself is left unchanged

Unnamed: 0,Cost,Item Purchased,Name
Store 2,5.0,Bird Seed,Vinod


In [93]:
df

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [95]:
del df['Name']

In [96]:
df # in place drop

Unnamed: 0,Cost,Item Purchased
Store 1,22.5,Dog Food
Store 1,2.5,Kitty Litter
Store 2,5.0,Bird Seed


## Slicing

In [98]:
y = df['Cost']
y

Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [99]:
y+=2
y

Store 1    24.5
Store 1     4.5
Store 2     7.0
Name: Cost, dtype: float64

In [100]:
df

Unnamed: 0,Cost,Item Purchased
Store 1,24.5,Dog Food
Store 1,4.5,Kitty Litter
Store 2,7.0,Bird Seed


## Indexing

In [103]:
temp = df.loc[:,['Cost',]]
temp

Unnamed: 0,Cost
Store 1,24.5
Store 1,4.5
Store 2,7.0


In [104]:
temp -= 2
temp

Unnamed: 0,Cost
Store 1,22.5
Store 1,2.5
Store 2,5.0


In [105]:
df

Unnamed: 0,Cost,Item Purchased
Store 1,24.5,Dog Food
Store 1,4.5,Kitty Litter
Store 2,7.0,Bird Seed


## Data Loading using read_csv

In [130]:
# NOTE(review): the name `dir` shadows the Python builtin dir(); a name such as data_dir would be safer
dir = '/Users/sumad/Documents/DS/Python/UM Spcialization/DS_with_Python/'

In [134]:
df = pd.read_csv(dir + 'olympics.csv')

In [135]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,# Summer,01 !,02 !,03 !,Total,# Winter,01 !,02 !,03 !,Total,# Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [140]:
# Re-read the file, dropping the first line and specifying which column to use as the row index
df = pd.read_csv(dir + 'olympics.csv', index_col = 0,skiprows = 1) # skiprows=1 skips the first line; index_col=0 uses the first column as the index

In [141]:
df.head()

Unnamed: 0,# Summer,01 !,02 !,03 !,Total,# Winter,01 !.1,02 !.1,03 !.1,Total.1,# Games,01 !.2,02 !.2,03 !.2,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [145]:
# Rename the medal columns in place with a single rename call, specifying old -> new names as a dictionary.
# The first two characters of a column name ('01', '02', '03') select the medal; whatever follows
# position 4 (for example a '.1' or '.2' suffix) is carried over unchanged.
medal_by_prefix = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
new_names = {col: medal_by_prefix[col[:2]] + col[4:]
             for col in df.columns
             if col[:2] in medal_by_prefix}
# One rename with the full mapping avoids modifying df repeatedly while looping over its columns
df.rename(columns=new_names, inplace=True)
df.head()

Unnamed: 0,# Summer,Gold,Silver,Bronze,Total,# Winter,Gold.1,Silver.1,Bronze.1,Total.1,# Games,Gold.2,Silver.2,Bronze.2,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


## Filtering using Boolean Masking 
- Boolean masking results in fast pandas operations and is often made part of the workflow for this reason

In [151]:
mask = df['Gold'] > 0 
mask

Afghanistan (AFG)                               False
Algeria (ALG)                                    True
Argentina (ARG)                                  True
Armenia (ARM)                                    True
Australasia (ANZ) [ANZ]                          True
Australia (AUS) [AUS] [Z]                        True
Austria (AUT)                                    True
Azerbaijan (AZE)                                 True
Bahamas (BAH)                                    True
Bahrain (BRN)                                   False
Barbados (BAR) [BAR]                            False
Belarus (BLR)                                    True
Belgium (BEL)                                    True
Bermuda (BER)                                   False
Bohemia (BOH) [BOH] [Z]                         False
Botswana (BOT)                                  False
Brazil (BRA)                                     True
British West Indies (BWI) [BWI]                 False
Bulgaria (BUL) [H]          

In [150]:
type(mask)

pandas.core.series.Series

In [152]:
df1 = df[mask]
print(len(df))
len(df1)

147


100

In [154]:
# Alternatively Using where method
df2 = df.where(df['Gold'] > 0)

In [155]:
len(df2)

147

In [160]:
df2.head() # the where method leaves NaNs in the rows where the condition is not met, so those rows need to be dropped
df3 = df2.dropna(axis= 0)
len(df3)

100

### Boolean masks can be chained using bitwise operators (&, |, ~) to form more complex filtering mechanisms that give fast results