# Introducing pandas
- pandas helps us to explore, clean, and process the data. In pandas, a data table is called a DataFrame
- pandas mainly provdies two data structures, __Series__ and __DataFrame
  - Series is a 1D array-like object that can hold data of any type with index. 
  - Dataframe is a 2D table with rows and columns, e.g. csv(comma-separated values) and RDBMS(relational database management system)
  - __Series is the data structure for a single column of a DataFrame__
  - https://stackoverflow.com/questions/26047209/what-is-the-difference-between-a-pandas-series-and-a-single-column-dataframe
- good for data cleaning before feeding into model


In [1]:
import numpy as np
import pandas as pd
np.__version__, pd.__version__

('1.24.3', '1.5.3')

## The pandas Series object
- list vs. array vs. Series: __array-like__

In [4]:
# float
lst1 = [0.25, 0.5, -0.75, 1] # list
np_array = np.array([0.25, 0.5, -0.75, 1]) # array

ser1 = pd.Series([0.25, 0.5, -0.75, 1]) # list to Series
ser2 = pd.Series(np.array([0.25, 0.5, -0.75, 1])) # array to Series

print(f'lst1\t\t {lst1}\n'
      f'np_array\t= {np_array}\n\n'
      f'ser1 =\n{ser1}\n\n'
      f'type(ser1) : {type(ser1)}\n\n'
      f'ser2 =\n{ser2}')

lst1		 [0.25, 0.5, -0.75, 1]
np_array	= [ 0.25  0.5  -0.75  1.  ]

ser1 =
0    0.25
1    0.50
2   -0.75
3    1.00
dtype: float64

type(ser1) : <class 'pandas.core.series.Series'>

ser2 =
0    0.25
1    0.50
2   -0.75
3    1.00
dtype: float64


## Boolean / Logical & Arithmetic operation

In [7]:
print(f'ser1 =\n{ser1}\n\n'
      f'ser1[ser1>0] =\n{ser1[ser1>0]}\n\n' # filter out the element less than 0
      f'ser1*2 =\n{ser1*2}\n\n'
      f'np.exp(ser1) =\n{np.exp(ser1)}')

ser1 =
0    0.25
1    0.50
2   -0.75
3    1.00
dtype: float64

ser1[ser1>0] =
0    0.25
1    0.50
3    1.00
dtype: float64

ser1*2 =
0    0.5
1    1.0
2   -1.5
3    2.0
dtype: float64

np.exp(ser1) =
0    1.284025
1    1.648721
2    0.472367
3    2.718282
dtype: float64


## Membership Operators

In [10]:
print(f'{ser1}\n\n'
      f'2 in ser1\t: {2 in ser1}\n' # 2 is index, not value
      f'0.5 in ser1\t: {0.5 in ser1}') #1, 2 are indexes, not values

0    0.25
1    0.50
2   -0.75
3    1.00
dtype: float64

2 in ser1	: True
0.5 in ser1	: False


## dtype: object, showing that some element is not numbers

In [11]:
ser1 = pd.Series([0.25, 0.5, 'hello', 1.0])
ser1, print(ser1)

0     0.25
1      0.5
2    hello
3      1.0
dtype: object


(0     0.25
 1      0.5
 2    hello
 3      1.0
 dtype: object,
 None)

## __Series__ as generalized __Numpy Array__ (key like dictionary)

In [14]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
# data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('a','b','c','d'))
data_np = np.array([0.25, 0.5, 0.75, 1.0])
print(f'data =\n{data}\n\n'
     f'data_np\t= {data_np}\n'
     f"data['b']\t= {data['b']}\n" # like dictionary where given a key, retrieve a value
     f'data_np[2]\t={data_np[2]}\n')

data =
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

data_np	= [0.25 0.5  0.75 1.  ]
data['b']	= 0.5
data_np[2]	=0.75



In [15]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = [2,5,3,7])
data # index by numbers(as strings)

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

## dict. to Series(key as index)

In [17]:
data = pd.Series({'A': 'apple', 'B': 'banana', 'C': 'cherry'})
data, data['A':'B'] # retrieving the value of 'A' and 'B' using slicing

(A     apple
 B    banana
 C    cherry
 dtype: object,
 A     apple
 B    banana
 dtype: object)

## __Series__ as specialized dictionary

In [20]:
population_dict = {'California': 38332521, 'Texas' : 26448193, 
                    'New York' : 19651127, 'Florida' : 19552860,
                    'Illinois' : 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [23]:
# Given a key, retrieve a value
population['California'], population_dict['Texas']

(38332521, 26448193)

## slice: only for Series, not dict.

In [26]:
print(f'{population}\n\n'
      f"{population['California' : 'New York']}") # slice from cali to ny

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

California    38332521
Texas         26448193
New York      19651127
dtype: int64


## Series Attributes

In [46]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
print(data.values)
print(data.index)
print(data.shape)
print(data.ndim)
print(data.dtypes)
print(data.size)
print(data.nbytes)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)
(4,)
1
float64
4
32


## Index vs. Slice

In [47]:
print(f'data[1]\t=\n{data}\n\n'
      f'data[1]\t= {data[1]}\n\n'
      f'data[1:3] =\n{data[1:3]}')

data[1]	=
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

data[1]	= 0.5

data[1:3] =
1    0.50
2    0.75
dtype: float64


## Axis Indexes with Duplicate Labels

In [48]:
obj = pd.Series(range(5), index=['a','a','b','b','c']) # index can be repetitive

# get all objects with index 'a'
print(f"{obj}\n\n{obj['a']}")

a    0
a    1
b    2
b    3
c    4
dtype: int64

a    0
a    1
dtype: int64


## check if index is unique
- index.is_unique

In [49]:
obj.index.is_unique # boolean showing that this obj has repetitive elements

False

## Series name is column name in DataFrame

In [50]:
data = pd.Series([0.25, 0.5, 0.75, 1.0, None, np.nan])
data.name = 'myseries'
data, pd.DataFrame(data), pd.DataFrame(data).columns

(0    0.25
 1    0.50
 2    0.75
 3    1.00
 4     NaN
 5     NaN
 Name: myseries, dtype: float64,
    myseries
 0      0.25
 1      0.50
 2      0.75
 3      1.00
 4       NaN
 5       NaN,
 Index(['myseries'], dtype='object'))

## Index as immutable array

In [53]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [54]:
print(f'{ind[1]}\n\n'
      f'{ind[::2]}')

3

Int64Index([2, 5, 11], dtype='int64')


In [56]:
ind[1] = 0 ## index does not support mutable operations

TypeError: Index does not support mutable operations

## Multi index

In [55]:
ind = pd.Index([('TP', 1), ('TP', 3), ('TY', 1), ('TY', 6), ('KS', 2)])
ind

MultiIndex([('TP', 1),
            ('TP', 3),
            ('TY', 1),
            ('TY', 6),
            ('KS', 2)],
           )

In [57]:
ind.size, ind.shape, ind.ndim, ind.dtype

(5, (5,), 1, dtype('O'))

## loc: __label__ indexing
## iloc: __positional__ indexing (integer-location, in which values can only be retrieved with integers)

## loc

In [59]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])

print(f'{data}\n\n'
      f'{data.loc[3]}\n\n'
      f'{data[1:3]}\n\n' # position index does not include 3
      f'{data.loc[1:3]}') # including 3 

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

1.0

1    0.50
2    0.75
dtype: float64

1    0.50
2    0.75
3    1.00
dtype: float64


## iloc

In [60]:
print(f'{data}\n\n'
      f'{data.iloc[3]}\n\n'
      f'{data[1:3]}\n\n' # position index does not include 3
      f'{data.iloc[1:3]}') # integer position, same as index

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

1.0

1    0.50
2    0.75
dtype: float64

1    0.50
2    0.75
dtype: float64


## index = ['5', '4', '3', '2']

In [65]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['5', '4', '3', '2'])
print(f'{data}\n\n'
      f"{data.loc['3']}\n\n"
      f"{data['3':'5':-1]}\n\n"
      f"{data.loc['5':'3']}") # including 3 

5    0.25
4    0.50
3    0.75
2    1.00
dtype: float64

0.75

3    0.75
4    0.50
5    0.25
dtype: float64

5    0.25
4    0.50
3    0.75
dtype: float64


In [66]:
print(f'{data}\n\n'
      f'{data.iloc[3]}\n\n'
      f'{data.iloc[1:3]}') # not including index 3

5    0.25
4    0.50
3    0.75
2    1.00
dtype: float64

1.0

4    0.50
3    0.75
dtype: float64


## DataFrame is just like excel spreadsheet, formed with rows and columns
- DataFrame is numpy based, so it's faster than python
- axis direction matters
- [row_indexer, column_indexer]

In [68]:
# This is a dictionary
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002, 2003],
        'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
print(f"data :\n{data}\n\n"
      f"data['state'] : {data['state']}\n\n"
      f"{pd.DataFrame(data)}")
pd.DataFrame(data)['state'] # call the column

data :
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

data['state'] : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada']

    state  year  popu
0    Ohio  2000   1.5
1    Ohio  2001   1.7
2    Ohio  2002   3.6
3  Nevada  2001   2.4
4  Nevada  2002   2.9
5  Nevada  2003   3.2


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

## df, df.T

In [70]:
df = pd.DataFrame(data, index=['d','f','g','h','j','k'])
print(f'{df}\n\n{df.T}')

    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

          d     f     g       h       j       k
state  Ohio  Ohio  Ohio  Nevada  Nevada  Nevada
year   2000  2001  2002    2001    2002    2003
popu    1.5   1.7   3.6     2.4     2.9     3.2


## Attribute

In [71]:
print(f'df :\n{df}\n\n'
      f'df.index\t = {df.index}\n'
      f'df.columns\t = {df.columns}\n\n'
      f'df.values =\n{df.values}\n\n'
      f'type(df.values)\t= {type(df.values)}\n'
      f'df.ndim\t\t= {df.ndim}\n'
      f'df.shape\t= {df.shape}')

df :
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

df.index	 = Index(['d', 'f', 'g', 'h', 'j', 'k'], dtype='object')
df.columns	 = Index(['state', 'year', 'popu'], dtype='object')

df.values =
[['Ohio' 2000 1.5]
 ['Ohio' 2001 1.7]
 ['Ohio' 2002 3.6]
 ['Nevada' 2001 2.4]
 ['Nevada' 2002 2.9]
 ['Nevada' 2003 3.2]]

type(df.values)	= <class 'numpy.ndarray'>
df.ndim		= 2
df.shape	= (6, 3)


## df observations

In [72]:
print(f'df :\n{df}\n\n'
      f'df.describe():\n{df.describe()} end\n\n' # not including state because its datatype is object
      f'df.info:\n{df.info()}\n\n'
      f'df.head(4):\n{df.head(4)}\n\n'
      f'df.tail(4):\n{df.tail(4)}\n')

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, d to k
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   state   6 non-null      object 
 1   year    6 non-null      int64  
 2   popu    6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 364.0+ bytes
df :
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4
j  Nevada  2002   2.9
k  Nevada  2003   3.2

df.describe():
              year      popu
count     6.000000  6.000000
mean   2001.500000  2.550000
std       1.048809  0.836062
min    2000.000000  1.500000
25%    2001.000000  1.875000
50%    2001.500000  2.650000
75%    2002.000000  3.125000
max    2003.000000  3.600000 end

df.info:
None

df.head(4):
    state  year  popu
d    Ohio  2000   1.5
f    Ohio  2001   1.7
g    Ohio  2002   3.6
h  Nevada  2001   2.4

df.tail(4):
    state  year  popu
g    Ohio  2002   3.6
h  Nevada  2001   

## columns edit

In [73]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002, 2003],
        'popu': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

pd.DataFrame(data, columns = ['years', 'state', 'popu', 'nothing']) # add a new column, "nothing"

Unnamed: 0,years,state,popu,nothing
0,,Ohio,1.5,
1,,Ohio,1.7,
2,,Ohio,3.6,
3,,Nevada,2.4,
4,,Nevada,2.9,
5,,Nevada,3.2,


## df = Series + Series

In [76]:
area = pd.Series({'California': 423967, 'Texas' : 695662, 
                    'New York' : 141297, 'Florida' : 170312,
                    'Illinois' : 149995})
popu = pd.Series({'California': 38332521, 'Texas' : 26448193, 
                    'New York' : 19651127, 'Florida' : 19552860,
                    'Illinois' : 12882135})
data = pd.DataFrame({'area': area, 'popu': popu})

print(f'data :\n{data}\n\n'
      f'data.area :\n{data.area}\n\n'
      f"data['area'] :\n{data['area']}\n\n"
      f'data.popu: \n{data.popu}\n\n'
      f"data['popu'] :\n{data['popu']}")

data :
              area      popu
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135

data.area :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data['area'] :
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

data.popu: 
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: popu, dtype: int64

data['popu'] :
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: popu, dtype: int64


In [77]:
# add a new column utilizing existing columns
data['density'] = data['popu']/data['area']
# or
data.density = data.popu / data.area
data

Unnamed: 0,area,popu,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


## DataFrame can rename index and column name
(Series can't)

In [82]:
data1 = data.rename(columns={'area' : 'Col_1'}, index={'California': 'Row_1'})
print(f'data :\n{data}\n\n'
      f'data rename data1 =\n{data1}')

data :
              area      popu     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763

data rename data1 =
           Col_1      popu     density
Row_1     423967  38332521   90.413926
Texas     695662  26448193   38.018740
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121
Illinois  149995  12882135   85.883763
