### Pandas
- https://pandas.pydata.org/
- Series
    - index, value
- DataFrame
    - index, column, value

In [1]:
import numpy as np
import pandas as pd

##### Series
- index and value


- pandas --> form of table

```
      | value|      |     |   
------------------------------
index |      |      |     |      
sample|      |      |     |      
------------------------------
      |      |      |     |
```

In [2]:
# Create series by creating random integers from 0 to 9
# index labels run from 0 to 4 (five values)

# index = row = 1 sample
# size = 5 --> n of values
# left : index
# right : value


data = pd.Series(np.random.randint(10, size=5))
data

0    8
1    6
2    6
3    5
4    0
dtype: int64

In [3]:
# Set index by input keyword parameters
data = pd.Series(np.random.randint(10, size=5), index=['a', 'b', 'c', 'd', 'e'])
data

a    0
b    6
c    1
d    0
e    7
dtype: int64

In [4]:
# Show index and values of Series
data.index, data.values

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'), array([0, 6, 1, 0, 7]))

In [5]:
# View the value stored under index label "c":
# attribute access and bracket access return the same value
data.c, data["c"]

(1, 1)

In [6]:
# Name the Series itself and name its index
data.name = "Ran_num"
data.index.name = "Alphabet"
data.name, data.index.name

('Ran_num', 'Alphabet')

In [7]:
data

Alphabet
a    0
b    6
c    1
d    0
e    7
Name: Ran_num, dtype: int64

In [8]:
# calculate value (multiplication)
data * 100

Alphabet
a      0
b    600
c    100
d      0
e    700
Name: Ran_num, dtype: int64

In [9]:
# Return various data
data[["b","c","e"]]

Alphabet
b    6
c    1
e    7
Name: Ran_num, dtype: int64

In [10]:
# Return through offset
data[1:3]

Alphabet
b    6
c    1
Name: Ran_num, dtype: int64

In [11]:
data[:-1]

Alphabet
a    0
b    6
c    1
d    0
Name: Ran_num, dtype: int64

In [12]:
# Return boolean by comparison
data > 3

Alphabet
a    False
b     True
c    False
d    False
e     True
Name: Ran_num, dtype: bool

In [13]:
data[data > 3] # only leave True data (filtered)

Alphabet
b    6
e    7
Name: Ran_num, dtype: int64

In [14]:
# print index and value in for loop
# note: Series.items() yields (index label, value) pairs, like dict.items(); a numpy array has no items() method.
for idx, val in data.items():
    print(idx, val)

a 0
b 6
c 1
d 0
e 7


In [15]:
# create Series by dictionary type data

dic = {"d":7, "e":5, "f":9}
data2 = pd.Series(dic)
data2

d    7
e    5
f    9
dtype: int64

In [16]:
# calculate with series
result = data + data2
result

# index labels that exist in only one of the two Series produce null (NaN)

a     NaN
b     NaN
c     NaN
d     7.0
e    12.0
f     NaN
dtype: float64

In [17]:
# remove null data
print(result.notnull())
result = result[result.notnull()]
result

a    False
b    False
c    False
d     True
e     True
f    False
dtype: bool


d     7.0
e    12.0
dtype: float64

##### Dataframe
- consists of index, column, value
- create
- insert
    - rows
    - columns
- append
- concat
- groupby, aggregate
- select
- merge
- pivot

In [18]:
# create 1
# create DataFrame : (1) create column -> (2) add list data to column
# number of columns and data must match.
df = pd.DataFrame(columns=["Email", "Name"])
df["Name"] = ["fcamp", "dss"]
df["Email"] = ["fcamp@gmail.com", "dss@gmailcom"]
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmailcom,dss


In [19]:
# create 2
# create DataFrame : (1) add data to DataFrame by dictionary type data
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
df2 = pd.DataFrame({"Name":name, "Email":email})
df2

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [20]:
# create 3
# create DataFrame : (1) label index, (2) add index and data
index = ["first", "second"]
data = {"Email": ["fcamp@gmail.com", "dss@gmail.com"], "Name": ["fcamp", "dss"]}
df = pd.DataFrame(data, index=index)
df

Unnamed: 0,Email,Name
first,fcamp@gmail.com,fcamp
second,dss@gmail.com,dss


In [21]:
# return index, column, value data of DataFrame
df.index, df.columns, df.values

(Index(['first', 'second'], dtype='object'),
 Index(['Email', 'Name'], dtype='object'),
 array([['fcamp@gmail.com', 'fcamp'],
        ['dss@gmail.com', 'dss']], dtype=object))

In [22]:
df2.index, df2.columns, df2.values

(RangeIndex(start=0, stop=2, step=1),
 Index(['Email', 'Name'], dtype='object'),
 array([['fcamp@gmail.com', 'fcamp'],
        ['dss@gmail.com', 'dss']], dtype=object))

Accessing data by rows

In [23]:
df2.loc[1]

Email    dss@gmail.com
Name               dss
Name: 1, dtype: object

In [24]:
# number of rows
len(df)

2

Accessing data by columns

In [25]:
df["Email"]

first     fcamp@gmail.com
second      dss@gmail.com
Name: Email, dtype: object

##### Insert rows

In [26]:
# create DataFrame (Dictionary type)
data = {"Email": ["fcamp@gmail.com", "dss@gmail.com"], "Name": ["fcamp", "dss"]}
df = pd.DataFrame(data)

df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [27]:
# assign to a specific row label to add or modify data
# df.loc[n] = {...} overwrites row n if it exists, otherwise adds a new row labeled n
df.loc[2] = {"Email": "data@gmail.com", "Name": "data"}
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data


In [28]:
# add data to the very last row
df.loc[len(df)] = {"Email":"science@gmail.com", "Name":"science"}
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data
3,science@gmail.com,science


##### Insert columns

In [29]:
# add column
df["Address"] = ""  # broadcasting is applied for adding "" blanks
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,
1,dss@gmail.com,dss,
2,data@gmail.com,data,
3,science@gmail.com,science,


In [30]:
# add columns with data
df["Country"] = ["KOR", "USA", "JPN", "GER"]
df

Unnamed: 0,Email,Name,Address,Country
0,fcamp@gmail.com,fcamp,,KOR
1,dss@gmail.com,dss,,USA
2,data@gmail.com,data,,JPN
3,science@gmail.com,science,,GER


In [31]:
df["Address"] = ["Seoul", "New York", "Tokyo", "Berlin"]
df

Unnamed: 0,Email,Name,Address,Country
0,fcamp@gmail.com,fcamp,Seoul,KOR
1,dss@gmail.com,dss,New York,USA
2,data@gmail.com,data,Tokyo,JPN
3,science@gmail.com,science,Berlin,GER


In [32]:
# apply : add data in columns with functions and lambda by using "apply"

def name(name):
    """Return ``name`` followed by its length in parentheses, e.g. ``"dss(3)"``."""
    length = len(name)
    return "{}({})".format(name, length)

In [33]:
# apply works like the built-in map: it calls the function on each value and returns the results as a new Series
df["New_Name"] = df["Name"].apply(name)
df

Unnamed: 0,Email,Name,Address,Country,New_Name
0,fcamp@gmail.com,fcamp,Seoul,KOR,fcamp(5)
1,dss@gmail.com,dss,New York,USA,dss(3)
2,data@gmail.com,data,Tokyo,JPN,data(4)
3,science@gmail.com,science,Berlin,GER,science(7)


In [34]:
# using lambda
df["New_Address"] = df["Address"].apply(lambda addr: "{}({})".format(addr, len(addr)))
df

Unnamed: 0,Email,Name,Address,Country,New_Name,New_Address
0,fcamp@gmail.com,fcamp,Seoul,KOR,fcamp(5),Seoul(5)
1,dss@gmail.com,dss,New York,USA,dss(3),New York(8)
2,data@gmail.com,data,Tokyo,JPN,data(4),Tokyo(5)
3,science@gmail.com,science,Berlin,GER,science(7),Berlin(6)


In [35]:
# apply is useful for preprocessing step: 
# for re-organizing data into a new column

# ex) encoding male/female data into 0 and 1
# ex) encoding score into grade(A~F)

##### append

In [36]:
# make data functions of 'names' and 'age'

import random, string

def get_name():
    """Pick one first name at random from a fixed pool of 16 names."""
    candidates = (
        'Adam', 'Alan', 'Alex', 'Alvin',
        'Andrew', 'Anthony', 'Arnold', 'Baldy',
        'Baron', 'Billy', 'Boris', 'Bruno',
        'Caley', 'Champ', 'Charlie', 'Clark',
    )
    return random.choice(candidates)

def get_age(start=20, end=40):
    """Return a random integer age between ``start`` and ``end``, both inclusive."""
    age = random.randint(start, end)
    return age

def make_data(rows=10):
    """Return a list of ``rows`` dicts, each holding a random "Age" and "Name".

    Each record takes its "Age" from get_age() and its "Name" from
    get_name(), called in that order.
    """
    return [{"Age": get_age(), "Name": get_name()} for _ in range(rows)]

    

In [37]:
data1 = make_data()
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,Age,Name
0,37,Alvin
1,20,Clark
2,36,Arnold
3,31,Caley
4,30,Boris
5,21,Alan
6,32,Andrew
7,37,Arnold
8,38,Alan
9,38,Bruno


In [38]:
data2 = make_data()
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Age,Name
0,35,Alex
1,39,Baron
2,38,Adam
3,39,Anthony
4,35,Anthony
5,29,Charlie
6,36,Alex
7,29,Boris
8,25,Charlie
9,31,Boris


In [39]:
# append
# add the rows of df2 below the rows of df1
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result (the original row labels are kept)
df3 = pd.concat([df1, df2])
df3

Unnamed: 0,Age,Name
0,37,Alvin
1,20,Clark
2,36,Arnold
3,31,Caley
4,30,Boris
5,21,Alan
6,32,Andrew
7,37,Arnold
8,38,Alan
9,38,Bruno


In [40]:
# reset_index : reset index starting from 0
# inplace=True : modification form applies directly to the variable
# IF inplace=True is not applied, then assign the modified data to the variable again
# drop=True : drop the existing index and apply the new index
# IF drop=False, the existing index merges into column, and the new index is defined
df3.reset_index(drop=False, inplace=True)
df3

Unnamed: 0,index,Age,Name
0,0,37,Alvin
1,1,20,Clark
2,2,36,Arnold
3,3,31,Caley
4,4,30,Boris
5,5,21,Alan
6,6,32,Andrew
7,7,37,Arnold
8,8,38,Alan
9,9,38,Bruno


In [41]:
# IF inplace=False (the default), reset_index returns a new DataFrame,
# so assign the result back to the variable:

# df3 = df3.reset_index(drop=True)
# df3

In [42]:
df3.reset_index(drop=True, inplace=True)
df3

Unnamed: 0,index,Age,Name
0,0,37,Alvin
1,1,20,Clark
2,2,36,Arnold
3,3,31,Caley
4,4,30,Boris
5,5,21,Alan
6,6,32,Andrew
7,7,37,Arnold
8,8,38,Alan
9,9,38,Bruno


##### concat
- rows
- columns

In [43]:
# concat rows
# concatenate df1 and df2 by rows
# rearrange index by using reset_index
# pd.concat([x, y])
df3 = pd.concat([df1, df2]).reset_index(drop=True)
df3

Unnamed: 0,Age,Name
0,37,Alvin
1,20,Clark
2,36,Arnold
3,31,Caley
4,30,Boris
5,21,Alan
6,32,Andrew
7,37,Arnold
8,38,Alan
9,38,Bruno


In [44]:
# concat columns
# axis=1 : allows concat in horizontal direction
df4 = pd.concat([df2, df1], axis=1)
df4

Unnamed: 0,Age,Name,Age.1,Name.1
0,35,Alex,37,Alvin
1,39,Baron,20,Clark
2,38,Adam,36,Arnold
3,39,Anthony,31,Caley
4,35,Anthony,30,Boris
5,29,Charlie,21,Alan
6,36,Alex,32,Andrew
7,29,Boris,37,Arnold
8,25,Charlie,38,Alan
9,31,Boris,38,Bruno


In [45]:
df4 = pd.concat([df3, df1], axis=1)
df4

Unnamed: 0,Age,Name,Age.1,Name.1
0,37,Alvin,37.0,Alvin
1,20,Clark,20.0,Clark
2,36,Arnold,36.0,Arnold
3,31,Caley,31.0,Caley
4,30,Boris,30.0,Boris
5,21,Alan,21.0,Alan
6,32,Andrew,32.0,Andrew
7,37,Arnold,37.0,Arnold
8,38,Alan,38.0,Alan
9,38,Bruno,38.0,Bruno


In [46]:
# join='inner' : keep only the index labels present in both frames, so no rows with null (NaN) data are created
df5 = pd.concat([df3, df1], axis=1, join='inner')
df5

Unnamed: 0,Age,Name,Age.1,Name.1
0,37,Alvin,37,Alvin
1,20,Clark,20,Clark
2,36,Arnold,36,Arnold
3,31,Caley,31,Caley
4,30,Boris,30,Boris
5,21,Alan,21,Alan
6,32,Andrew,32,Andrew
7,37,Arnold,37,Arnold
8,38,Alan,38,Alan
9,38,Bruno,38,Bruno


##### Groupby
- creating dataframe showing avg age per each name

In [47]:
# make DataFrame
g_df = pd.DataFrame(make_data(20))
g_df.tail() # --> shows only the last 5 rows (default n=5); .head() shows the first 5

Unnamed: 0,Age,Name
15,40,Arnold
16,31,Anthony
17,40,Baldy
18,39,Anthony
19,38,Baron


In [48]:
g_df

Unnamed: 0,Age,Name
0,37,Caley
1,38,Arnold
2,35,Adam
3,38,Billy
4,32,Charlie
5,23,Charlie
6,20,Adam
7,35,Baron
8,35,Anthony
9,32,Anthony


In [49]:
# unique name list 1
# print unique name using 'set'
result1 = np.array(list(set(g_df["Name"].values)))
len(result1), result1

(11, array(['Alvin', 'Charlie', 'Champ', 'Bruno', 'Baron', 'Adam', 'Billy',
        'Arnold', 'Anthony', 'Baldy', 'Caley'],
       dtype='<U7'))

In [50]:
# unique name list 2 [recommended way]
# print unique name using 'unique' function
result2 = g_df["Name"].unique()
len(result2), result2

(11, array(['Caley', 'Arnold', 'Adam', 'Billy', 'Charlie', 'Baron', 'Anthony',
        'Champ', 'Bruno', 'Alvin', 'Baldy'], dtype=object))

**groupby**

In [51]:
# groupby (size)
# use 'groupby' to create dataframe counting the names on counts column.

In [52]:
# groupby Name column data into 'size' and reset the index
result_df = g_df.groupby("Name").size().reset_index(name='counts') 
result_df

Unnamed: 0,Name,counts
0,Adam,2
1,Alvin,1
2,Anthony,4
3,Arnold,3
4,Baldy,1
5,Baron,2
6,Billy,1
7,Bruno,1
8,Caley,2
9,Champ,1


In [53]:
# use 'sort_values' to sort descending. (ascending = True, ascending =False)
result_df = result_df.sort_values(by=['counts'], ascending=False)
result_df

Unnamed: 0,Name,counts
2,Anthony,4
3,Arnold,3
0,Adam,2
5,Baron,2
8,Caley,2
10,Charlie,2
1,Alvin,1
4,Baldy,1
6,Billy,1
7,Bruno,1


In [54]:
# reset index
result_df = result_df.reset_index(drop=True)
result_df

Unnamed: 0,Name,counts
0,Anthony,4
1,Arnold,3
2,Adam,2
3,Baron,2
4,Caley,2
5,Charlie,2
6,Alvin,1
7,Baldy,1
8,Billy,1
9,Bruno,1


**aggregate**

In [55]:
# groupby (agg : min)
# groupby name with min age, and reset_index
g_df.groupby("Name").agg('min').reset_index()

Unnamed: 0,Name,Age
0,Adam,20
1,Alvin,22
2,Anthony,31
3,Arnold,29
4,Baldy,40
5,Baron,35
6,Billy,38
7,Bruno,30
8,Caley,31
9,Champ,31


In [56]:
# groupby name with min age; a single reset_index() already yields a 0..n-1 index
g_df.groupby("Name").agg('min').reset_index()

Unnamed: 0,Name,Age
0,Adam,20
1,Alvin,22
2,Anthony,31
3,Arnold,29
4,Baldy,40
5,Baron,35
6,Billy,38
7,Bruno,30
8,Caley,31
9,Champ,31


In [57]:
# groupby name with max age; a single reset_index() already yields a 0..n-1 index
g_df.groupby("Name").agg('max').reset_index()

Unnamed: 0,Name,Age
0,Adam,35
1,Alvin,22
2,Anthony,39
3,Arnold,40
4,Baldy,40
5,Baron,38
6,Billy,38
7,Bruno,30
8,Caley,37
9,Champ,31


In [58]:
# return min, max, avg age
g_df.groupby("Name").agg(["min","max","mean"]).reset_index()

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,35,27.5
1,Alvin,22,22,22.0
2,Anthony,31,39,34.25
3,Arnold,29,40,35.666667
4,Baldy,40,40,40.0
5,Baron,35,38,36.5
6,Billy,38,38,38.0
7,Bruno,30,30,30.0
8,Caley,31,37,34.0
9,Champ,31,31,31.0


##### select

In [59]:
# rebuild the min / max / mean summary explicitly instead of reading the
# previous cell's output through `_`, which breaks under Restart & Run All
df = g_df.groupby("Name").agg(["min","max","mean"]).reset_index()

In [60]:
df.head()   # default = 5

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,35,27.5
1,Alvin,22,22,22.0
2,Anthony,31,39,34.25
3,Arnold,29,40,35.666667
4,Baldy,40,40,40.0


In [61]:
# return 3 data from below
df.tail(3)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
8,Caley,31,37,34.0
9,Champ,31,31,31.0
10,Charlie,23,32,27.5


In [62]:
# return data of rows 3~5
df[3:5+1]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Arnold,29,40,35.666667
4,Baldy,40,40,40.0
5,Baron,35,38,36.5


In [63]:
# print data from row 3 to the end
df[3:]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Arnold,29,40,35.666667
4,Baldy,40,40,40.0
5,Baron,35,38,36.5
6,Billy,38,38,38.0
7,Bruno,30,30,30.0
8,Caley,31,37,34.0
9,Champ,31,31,31.0
10,Charlie,23,32,27.5


In [64]:
df[:3+1]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Adam,20,35,27.5
1,Alvin,22,22,22.0
2,Anthony,31,39,34.25
3,Arnold,29,40,35.666667


In [65]:
# reverse
df[::-1]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
10,Charlie,23,32,27.5
9,Champ,31,31,31.0
8,Caley,31,37,34.0
7,Bruno,30,30,30.0
6,Billy,38,38,38.0
5,Baron,35,38,36.5
4,Baldy,40,40,40.0
3,Arnold,29,40,35.666667
2,Anthony,31,39,34.25
1,Alvin,22,22,22.0


In [66]:

# select single cells with one .loc call: row label + (level-0, level-1) column key
df.loc[2, ("Age", "min")], df.loc[3, ("Name", "")]

(31, 'Arnold')

In [67]:
# Create dataframe from dictionary format
data = {
    "Name":df["Name"],
    "Min":df["Age"]["min"],
    "Max":df["Age"]["max"],
    "Mean":df["Age"]["mean"],
    }
n_df = pd.DataFrame(data)
n_df

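# Why is column 'Name' the last column in the output below?
# Older pandas versions (before 0.23, or on Python < 3.6) sort dict keys alphabetically;
# newer versions keep the dict's insertion order (Name, Min, Max, Mean).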

Unnamed: 0,Max,Mean,Min,Name
0,35,27.5,20,Adam
1,22,22.0,22,Alvin
2,39,34.25,31,Anthony
3,40,35.666667,29,Arnold
4,40,40.0,40,Baldy
5,38,36.5,35,Baron
6,38,38.0,38,Billy
7,30,30.0,30,Bruno
8,37,34.0,31,Caley
9,31,31.0,31,Champ


In [68]:
# filter avg age above 30yrs, and sort the values in descending order.
n_df[n_df["Mean"] > 30].sort_values(by=["Mean"], ascending=False).reset_index(drop=True)

Unnamed: 0,Max,Mean,Min,Name
0,40,40.0,40,Baldy
1,38,38.0,38,Billy
2,38,36.5,35,Baron
3,40,35.666667,29,Arnold
4,39,34.25,31,Anthony
5,37,34.0,31,Caley
6,31,31.0,31,Champ


In [69]:
# Add column showing count of each name
# map each row's Name to its group size so the counts are aligned by name,
# not by row position
n_df["Count"] = n_df["Name"].map(g_df.groupby("Name").size())
n_df

Unnamed: 0,Max,Mean,Min,Name,Count
0,35,27.5,20,Adam,2
1,22,22.0,22,Alvin,1
2,39,34.25,31,Anthony,4
3,40,35.666667,29,Arnold,3
4,40,40.0,40,Baldy,1
5,38,36.5,35,Baron,2
6,38,38.0,38,Billy,1
7,30,30.0,30,Bruno,1
8,37,34.0,31,Caley,2
9,31,31.0,31,Champ,1


In [70]:
# Push Mean data to the end
mean = n_df["Mean"] # save Mean data
n_df.drop('Mean', axis=1, inplace=True) # remove Mean data
n_df["Mean"] = mean # add Mean data
n_df

Unnamed: 0,Max,Min,Name,Count,Mean
0,35,20,Adam,2,27.5
1,22,22,Alvin,1,22.0
2,39,31,Anthony,4,34.25
3,40,29,Arnold,3,35.666667
4,40,40,Baldy,1,40.0
5,38,35,Baron,2,36.5
6,38,38,Billy,1,38.0
7,30,30,Bruno,1,30.0
8,37,31,Caley,2,34.0
9,31,31,Champ,1,31.0


In [71]:
# rename column
n_df.rename(columns={"Name":"Unique_NAme"})

Unnamed: 0,Max,Min,Unique_NAme,Count,Mean
0,35,20,Adam,2,27.5
1,22,22,Alvin,1,22.0
2,39,31,Anthony,4,34.25
3,40,29,Arnold,3,35.666667
4,40,40,Baldy,1,40.0
5,38,35,Baron,2,36.5
6,38,38,Billy,1,38.0
7,30,30,Bruno,1,30.0
8,37,31,Caley,2,34.0
9,31,31,Champ,1,31.0


##### Merge
- Dataframe(1) of UserID, Name, Age data
- Dataframe(2) of ID, Money

In [87]:
# create Dataframe(1), name data must be unique
# collect one record per user, then build the DataFrame in a single call
user_records = []
used_names = set()

for user_id in range(1, 10+1):
    # use `user_name` rather than `name` so the name() helper defined for
    # the apply example is not overwritten
    user_name = get_name()

    # if overlapping, pick name again.
    while user_name in used_names:
        user_name = get_name()
    used_names.add(user_name)

    user_records.append({"UserID": user_id, "Name": user_name, "Age": get_age()})

user_df = pd.DataFrame(user_records, columns=["UserID", "Name", "Age"])
user_df

Unnamed: 0,UserID,Name,Age
0,1,Alex,25
1,2,Andrew,31
2,3,Alan,21
3,4,Caley,20
4,5,Bruno,29
5,6,Arnold,22
6,7,Clark,29
7,8,Adam,27
8,9,Boris,30
9,10,Anthony,26


In [73]:
# create Dataframe(2)
# 15 random payments; IDs are drawn from 1~10, so some users can appear
# several times and others not at all
money_records = []

for _ in range(15):
    money = random.randint(1,20) * 1000
    money_records.append({"ID": random.randint(1,10), "Money": money})

# build the frame once from the collected records
money_df = pd.DataFrame(money_records, columns=["ID","Money"])
money_df

Unnamed: 0,ID,Money
0,7,2000
1,10,12000
2,10,17000
3,8,15000
4,5,15000
5,7,5000
6,10,20000
7,3,13000
8,2,7000
9,1,13000


In [74]:
# match the ID column of money_df with the UserID column of user_df and merge
# UserID and ID hold the same values but have different column names, so both columns appear in the result
# left_on: key column of the left frame (money_df), right_on: key column of the right frame (user_df)
money_df.merge(user_df, left_on="ID", right_on="UserID")

Unnamed: 0,ID,Money,UserID,Name,Age
0,7,2000,7,Anthony,37
1,7,5000,7,Anthony,37
2,7,8000,7,Anthony,37
3,10,12000,10,Bruno,25
4,10,17000,10,Bruno,25
5,10,20000,10,Bruno,25
6,8,15000,8,Clark,34
7,8,16000,8,Clark,34
8,8,20000,8,Clark,34
9,5,15000,5,Caley,25


In [75]:
# change UserID to ID and merge
# no need to specify left_on, right_on
user_df.rename(columns={"UserID":"ID"}, inplace=True)
user_df

Unnamed: 0,ID,Name,Age
0,1,Adam,30
1,2,Alvin,32
2,3,Baron,31
3,4,Arnold,34
4,5,Caley,25
5,6,Alan,33
6,7,Anthony,37
7,8,Clark,34
8,9,Baldy,23
9,10,Bruno,25


In [76]:
result_df = pd.merge(money_df, user_df)
result_df

Unnamed: 0,ID,Money,Name,Age
0,7,2000,Anthony,37
1,7,5000,Anthony,37
2,7,8000,Anthony,37
3,10,12000,Bruno,25
4,10,17000,Bruno,25
5,10,20000,Bruno,25
6,8,15000,Clark,34
7,8,16000,Clark,34
8,8,20000,Clark,34
9,5,15000,Caley,25


In [77]:
# total Money per Name: select the Money column before aggregating so only
# that column is summed (summing every column also adds up ID and Age)
money_list = result_df.groupby("Name")["Money"].sum().reset_index()
money_list

Unnamed: 0,Name,Money
0,Adam,13000
1,Alvin,7000
2,Anthony,15000
3,Arnold,4000
4,Baron,29000
5,Bruno,49000
6,Caley,15000
7,Clark,51000


In [78]:
# sort the Money in descending
money_list = money_list.sort_values(by=['Money'], ascending=False).reset_index(drop=True)
money_list

Unnamed: 0,Name,Money
0,Clark,51000
1,Bruno,49000
2,Baron,29000
3,Anthony,15000
4,Caley,15000
5,Adam,13000
6,Alvin,7000
7,Arnold,4000


In [79]:
# apply how='outer' : users with no matching Money row are kept, and their Money is null (NaN)
result = pd.merge(user_df, money_list, how='outer')
result

Unnamed: 0,ID,Name,Age,Money
0,1,Adam,30,13000.0
1,2,Alvin,32,7000.0
2,3,Baron,31,29000.0
3,4,Arnold,34,4000.0
4,5,Caley,25,15000.0
5,6,Alan,33,
6,7,Anthony,37,15000.0
7,8,Clark,34,51000.0
8,9,Baldy,23,
9,10,Bruno,25,49000.0


In [80]:
# fillna(value=0) : fill null data with 0
result = pd.merge(user_df, money_list, how='outer').fillna(value=0)
result

Unnamed: 0,ID,Name,Age,Money
0,1,Adam,30,13000.0
1,2,Alvin,32,7000.0
2,3,Baron,31,29000.0
3,4,Arnold,34,4000.0
4,5,Caley,25,15000.0
5,6,Alan,33,0.0
6,7,Anthony,37,15000.0
7,8,Clark,34,51000.0
8,9,Baldy,23,0.0
9,10,Bruno,25,49000.0


In [81]:
result= result.sort_values(by=['Money'], ascending=False).reset_index(drop=True)
result

Unnamed: 0,ID,Name,Age,Money
0,8,Clark,34,51000.0
1,10,Bruno,25,49000.0
2,3,Baron,31,29000.0
3,5,Caley,25,15000.0
4,7,Anthony,37,15000.0
5,1,Adam,30,13000.0
6,2,Alvin,32,7000.0
7,4,Arnold,34,4000.0
8,6,Alan,33,0.0
9,9,Baldy,23,0.0


In [82]:
# change data type
# change float data (Money column) into int form
result["Money"] = result["Money"].astype("int")
result

Unnamed: 0,ID,Name,Age,Money
0,8,Clark,34,51000
1,10,Bruno,25,49000
2,3,Baron,31,29000
3,5,Caley,25,15000
4,7,Anthony,37,15000
5,1,Adam,30,13000
6,2,Alvin,32,7000
7,4,Arnold,34,4000
8,6,Alan,33,0
9,9,Baldy,23,0


pop quiz

In [83]:
# Name : A ~ G
# Point : random 40 ~ 99
# create DataFrame

In [84]:
df = pd.DataFrame(columns=["Name", "Points"])
df["Name"]=["A", "B", "C", "D", "E", "F", "G"]
df["Points"] = np.random.randint(40, 100, size=7)
df

Unnamed: 0,Name,Points
0,A,79
1,B,93
2,C,45
3,D,46
4,E,98
5,F,58
6,G,41


In [85]:
def create_grade(point):
    """Convert a numeric score into a letter grade.

    At least 90 is "A", at least 80 is "B", at least 70 is "C",
    at least 60 is "D"; anything else is "F".
    """
    # (lower bound, grade) pairs, checked from the highest bound down
    cutoffs = ((90, "A"), (80, "B"), (70, "C"), (60, "D"))
    for lower_bound, grade in cutoffs:
        if point >= lower_bound:
            return grade
    return "F"
    
df["Result"] = df["Points"].apply(create_grade)
df

Unnamed: 0,Name,Points,Result
0,A,79,C
1,B,93,A
2,C,45,F
3,D,46,F
4,E,98,A
5,F,58,F
6,G,41,F
