
# What is Pandas?
Pandas is a python library that can be used to analyze and manipulate data.

Extensive support libraries (NumPy for numerical calculations, Pandas for data analytics, etc.)
## Import pandas library

In [1]:
import pandas as pd

## Series
A one-dimentional array capable of holding any data type. It is like a column in a database table

In [2]:
pd.Series(['Bangkok', 'Krabi', 'Pattaya'])

0    Bangkok
1      Krabi
2    Pattaya
dtype: object

## DataFrame
A two-dimentional data structure with data and label or rows and columns.

In [3]:
cities = pd.Series(['Bangkok', 'Krabi', 'Pattaya'])
population = pd.Series([11234000, 32644, 119532])

th_data = pd.DataFrame({ 'City': cities, 'Population': population })

### Accessing DataFrame data
Similar to dictionaries, you can access data by adding key.

In [4]:
print('Get all data with City key')
print(th_data["City"])
print('Get data from City key with index of 1')
print(th_data["City"][1])

Get all data with City key
0    Bangkok
1      Krabi
2    Pattaya
Name: City, dtype: object
Get data from City key with index of 1
Krabi


## Read CSV files
CSV also known as comma separated files. It is file that has tabular format.

In [5]:
df = pd.read_csv('src/th_data.csv')
print(df)

                city      lat       lng   country iso2  \
0            Bangkok  13.7525  100.4942  Thailand   TH   
1         Chiang Mai  18.7953   98.9986  Thailand   TH   
2         Nonthaburi  13.8667  100.5167  Thailand   TH   
3           Pak Kret  13.9125  100.4978  Thailand   TH   
4    Ban Bang Pu Mai  13.5441  100.6175  Thailand   TH   
5        Ban Mangkon  13.6138  100.6104  Thailand   TH   
6  Ban Talat Rangsit  13.9833  100.6167  Thailand   TH   
7             Phuket   7.8881   98.3975  Thailand   TH   
8       Samut Sakhon  13.5486  100.2775  Thailand   TH   

               admin_name  capital  population  population_proper  
0  Krung Thep Maha Nakhon  primary    18007000            8305218  
1              Chiang Mai    admin     1198000            1198000  
2              Nonthaburi    admin      255671             255671  
3              Nonthaburi    minor      184501             184501  
4            Samut Prakan      NaN      120058             120058  
5          

### Access DataFrame Values
To access DataFrame values, we use iterrows() function to iterate rows in the DataFrame.

In [6]:
for index, row in df.iterrows():
    print(row['city'])

Bangkok
Chiang Mai
Nonthaburi
Pak Kret
Ban Bang Pu Mai
Ban Mangkon
Ban Talat Rangsit
Phuket
Samut Sakhon


### Renaming columns
To rename a column, pandas library uses rename() function. We use inplace=True to apply changes to the DataFrame.

In [7]:
df.rename(columns = {'city':'city_list'}, inplace = True)

print("After modifying city column: ", df.columns)

After modifying city column:  Index(['city_list', 'lat', 'lng', 'country', 'iso2', 'admin_name', 'capital',
       'population', 'population_proper'],
      dtype='object')


### Filter columns
Filtering is used for finding a specific value the user provided. There are multiple ways to filter. First is filter()

In [8]:
city_pop = df.filter(items=['city_list', 'population'])
print(city_pop)

           city_list  population
0            Bangkok    18007000
1         Chiang Mai     1198000
2         Nonthaburi      255671
3           Pak Kret      184501
4    Ban Bang Pu Mai      120058
5        Ban Mangkon      102291
6  Ban Talat Rangsit       79319
7             Phuket       77610
8       Samut Sakhon       60103


### Filter by column value

In [9]:
pop = df.loc[df['population'] == 867678.0]
print(pop)

Empty DataFrame
Columns: [city_list, lat, lng, country, iso2, admin_name, capital, population, population_proper]
Index: []


### Filter with logical conditions

In [10]:
pop_higher = df.loc[df['population'] > 3453198.0]
print(pop_higher)

  city_list      lat       lng   country iso2              admin_name  \
0   Bangkok  13.7525  100.4942  Thailand   TH  Krung Thep Maha Nakhon   

   capital  population  population_proper  
0  primary    18007000            8305218  


### Filter by missing values

In [11]:
not_null = df[df['population'].notnull()]
print(not_null)

           city_list      lat       lng   country iso2  \
0            Bangkok  13.7525  100.4942  Thailand   TH   
1         Chiang Mai  18.7953   98.9986  Thailand   TH   
2         Nonthaburi  13.8667  100.5167  Thailand   TH   
3           Pak Kret  13.9125  100.4978  Thailand   TH   
4    Ban Bang Pu Mai  13.5441  100.6175  Thailand   TH   
5        Ban Mangkon  13.6138  100.6104  Thailand   TH   
6  Ban Talat Rangsit  13.9833  100.6167  Thailand   TH   
7             Phuket   7.8881   98.3975  Thailand   TH   
8       Samut Sakhon  13.5486  100.2775  Thailand   TH   

               admin_name  capital  population  population_proper  
0  Krung Thep Maha Nakhon  primary    18007000            8305218  
1              Chiang Mai    admin     1198000            1198000  
2              Nonthaburi    admin      255671             255671  
3              Nonthaburi    minor      184501             184501  
4            Samut Prakan      NaN      120058             120058  
5          

### Operators in DataFrame

In [12]:
df['population_subtract'] = df['population'] - df['population_proper']
print(df)

           city_list      lat       lng   country iso2  \
0            Bangkok  13.7525  100.4942  Thailand   TH   
1         Chiang Mai  18.7953   98.9986  Thailand   TH   
2         Nonthaburi  13.8667  100.5167  Thailand   TH   
3           Pak Kret  13.9125  100.4978  Thailand   TH   
4    Ban Bang Pu Mai  13.5441  100.6175  Thailand   TH   
5        Ban Mangkon  13.6138  100.6104  Thailand   TH   
6  Ban Talat Rangsit  13.9833  100.6167  Thailand   TH   
7             Phuket   7.8881   98.3975  Thailand   TH   
8       Samut Sakhon  13.5486  100.2775  Thailand   TH   

               admin_name  capital  population  population_proper  \
0  Krung Thep Maha Nakhon  primary    18007000            8305218   
1              Chiang Mai    admin     1198000            1198000   
2              Nonthaburi    admin      255671             255671   
3              Nonthaburi    minor      184501             184501   
4            Samut Prakan      NaN      120058             120058   
5    

### Dropping column in DataFrame
drop() function is used to delete columns in a DataFrame

In [13]:
dropped = df.drop(['population_proper'], axis=1)
print(dropped)

           city_list      lat       lng   country iso2  \
0            Bangkok  13.7525  100.4942  Thailand   TH   
1         Chiang Mai  18.7953   98.9986  Thailand   TH   
2         Nonthaburi  13.8667  100.5167  Thailand   TH   
3           Pak Kret  13.9125  100.4978  Thailand   TH   
4    Ban Bang Pu Mai  13.5441  100.6175  Thailand   TH   
5        Ban Mangkon  13.6138  100.6104  Thailand   TH   
6  Ban Talat Rangsit  13.9833  100.6167  Thailand   TH   
7             Phuket   7.8881   98.3975  Thailand   TH   
8       Samut Sakhon  13.5486  100.2775  Thailand   TH   

               admin_name  capital  population  population_subtract  
0  Krung Thep Maha Nakhon  primary    18007000              9701782  
1              Chiang Mai    admin     1198000                    0  
2              Nonthaburi    admin      255671                    0  
3              Nonthaburi    minor      184501                    0  
4            Samut Prakan      NaN      120058                    0  

## Group by column name
It is used to group large amount of data and compute operations

In [14]:
admin_pop = df.groupby(by=["admin_name", "population"]).sum()
admin_pop

Unnamed: 0_level_0,Unnamed: 1_level_0,city_list,lat,lng,country,iso2,capital,population_proper,population_subtract
admin_name,population,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Chiang Mai,1198000,Chiang Mai,18.7953,98.9986,Thailand,TH,admin,1198000,0
Krung Thep Maha Nakhon,18007000,Bangkok,13.7525,100.4942,Thailand,TH,primary,8305218,9701782
Nonthaburi,184501,Pak Kret,13.9125,100.4978,Thailand,TH,minor,184501,0
Nonthaburi,255671,Nonthaburi,13.8667,100.5167,Thailand,TH,admin,255671,0
Pathum Thani,79319,Ban Talat Rangsit,13.9833,100.6167,Thailand,TH,0,79319,0
Phuket,77610,Phuket,7.8881,98.3975,Thailand,TH,admin,77610,0
Samut Prakan,102291,Ban Mangkon,13.6138,100.6104,Thailand,TH,0,102291,0
Samut Prakan,120058,Ban Bang Pu Mai,13.5441,100.6175,Thailand,TH,0,120058,0
Samut Sakhon,60103,Samut Sakhon,13.5486,100.2775,Thailand,TH,admin,60103,0


### Concatenate values in DataFrame
concat() function is used in pandas to concatenate or link data together

In [15]:
df1 = pd.DataFrame([['John', 25], ['Joe', 23]],
                   columns=['Name', 'Age'])
df2 = pd.DataFrame([['Mary', 20], ['Anna', 21]],
                   columns=['Name', 'Age'])

print(df1)
print(df2)
concat_val = pd.concat([df1, df2], ignore_index=True)
print(concat_val)

   Name  Age
0  John   25
1   Joe   23
   Name  Age
0  Mary   20
1  Anna   21
   Name  Age
0  John   25
1   Joe   23
2  Mary   20
3  Anna   21


### Merge values in DataFrame
merge() function is used in pandas to combine common columns in a DataFrame.
* inner
* right
* outer
* inner

In [16]:
df_1 = pd.DataFrame([['Johnny', 25], ['Jared', 23]],
                   columns=['Name', 'Age'])
df_2 = pd.DataFrame([['Johnny', 20], ['Jane', 21]],
                   columns=['Name', 'Age'])

print(df_1)
print(df_2)
inner = df_1.merge(df_2, on='Name', how='inner')
print('inner')
print(inner)
left = df_1.merge(df_2, on='Name', how='left')
print('left')
print(left)
right = df_1.merge(df_2, on='Name', how='right')
print('right')
print(right)
outer = df_1.merge(df_2, on='Name', how='outer')
print('outer')
print(outer)

     Name  Age
0  Johnny   25
1   Jared   23
     Name  Age
0  Johnny   20
1    Jane   21
inner
     Name  Age_x  Age_y
0  Johnny     25     20
left
     Name  Age_x  Age_y
0  Johnny     25   20.0
1   Jared     23    NaN
right
     Name  Age_x  Age_y
0  Johnny   25.0     20
1    Jane    NaN     21
outer
     Name  Age_x  Age_y
0  Johnny   25.0   20.0
1   Jared   23.0    NaN
2    Jane    NaN   21.0


### Join values in DataFrame
join() function is used in pandas to insert columns from another DataFrame.

In [17]:
outer_join = df_1.join(df_2, lsuffix="_left", rsuffix="_right", how='outer')
print('outer')
print(outer_join)

outer
  Name_left  Age_left Name_right  Age_right
0    Johnny        25     Johnny         20
1     Jared        23       Jane         21
