In [2]:
import pandas as pd

# Series are one-dimensional arrays (like R’s vectors)
# Create a series of the number of floodingReports
floodingReports = pd.Series([5, 6, 2, 9, 12])
print(); print(floodingReports)

# Set county names to be the index of the floodingReports series
floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County',
                            'Santa Cruz County', 'Maricopa County', 'Yuma County'])
print(); print(floodingReports)


0     5
1     6
2     2
3     9
4    12
dtype: int64

Cochise County        5
Pima County           6
Santa Cruz County     2
Maricopa County       9
Yuma County          12
dtype: int64


In [3]:
# View the number of floodingReports in Cochise County
print(); print(floodingReports['Cochise County'])


5


In [6]:
# View the counties with more than 6 flooding reports
print(); print(floodingReports[floodingReports >6])


Maricopa County     9
Yuma County        12
dtype: int64


In [13]:
# Create a pandas series from a dictionary
fireReports_dict = {'Cochise County': 12, 'Pima County': 342,
                    'Santa Cruz County': 13, 'Maricopa County': 42,
                    'Yuma County' : 52}

# Convert the dictionary into a pd.Series, and view it
fireReports = pd.Series(fireReports_dict)
print();print(fireReports)


Cochise County        12
Pima County          342
Santa Cruz County     13
Maricopa County       42
Yuma County           52
dtype: int64


In [14]:
# Change the index of a series to shorter names
fireReports.index = ["Cochice", "Pima", "Santa Cruz", "Maricopa", "Yuma"]

In [15]:
print();print(fireReports)


Cochice        12
Pima          342
Santa Cruz     13
Maricopa       42
Yuma           52
dtype: int64


In [24]:
# DataFrames are like R’s Dataframes
# Create a dataframe from a dict of equal length lists or numpy arrays
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
print(); print(df)

# Set the order of the columns using the columns attribute
df = pd.DataFrame(data,columns=['reports','year','county'])
print(); print(df)



       county  year  reports
0     Cochice  2012        4
1        Pima  2012       24
2  Santa Cruz  2013       31
3    Maricopa  2014        2
4        Yuma  2014        3

   reports  year      county
0        4  2012     Cochice
1       24  2012        Pima
2       31  2013  Santa Cruz
3        2  2014    Maricopa
4        3  2014        Yuma


In [27]:
# Add a column
df['New Column'] = pd.Series([5,10,15,20,25])
print(); print(df)


   reports  year      county  New Column
0        4  2012     Cochice           5
1       24  2012        Pima          10
2       31  2013  Santa Cruz          15
3        2  2014    Maricopa          20
4        3  2014        Yuma          25


In [28]:
# Delete a column
del df['New Column']
print(); print(df)


   reports  year      county
0        4  2012     Cochice
1       24  2012        Pima
2       31  2013  Santa Cruz
3        2  2014    Maricopa
4        3  2014        Yuma


In [32]:
# Transpose the dataframe
print(); print(df.T)


               0     1           2         3     4
reports        4    24          31         2     3
year        2012  2012        2013      2014  2014
county   Cochice  Pima  Santa Cruz  Maricopa  Yuma


In [33]:
print(); print(df)


   reports  year      county
0        4  2012     Cochice
1       24  2012        Pima
2       31  2013  Santa Cruz
3        2  2014    Maricopa
4        3  2014        Yuma


In [36]:
## How to insert a new column based on condition in Python
# load libraries
import pandas as pd
import numpy as np
# Create an example dataframe
raw_data = {'student_name': ['Miller', 'Jacobson', 'Bali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
            'test_score': [76, 88, 84, 67, 53, 96, 64, 91, 77, 73, 52, np.NaN]}
df = pd.DataFrame(raw_data, columns = ['student_name', 'test_score'])
print(); print(df)


   student_name  test_score
0        Miller        76.0
1      Jacobson        88.0
2          Bali        84.0
3        Milner        67.0
4         Cooze        53.0
5         Jacon        96.0
6        Ryaner        64.0
7          Sone        91.0
8         Sloan        77.0
9         Piger        73.0
10        Riani        52.0
11          Ali         NaN


In [37]:
# Create a function to assign letter grades
grades = []
for row in df['test_score']:
    if row > 95:    grades.append('A')
    elif row > 90:  grades.append('A-')
    elif row > 85:  grades.append('B')
    elif row > 80:  grades.append('B-')
    elif row > 75:  grades.append('C')
    elif row > 70:  grades.append('C-')
    elif row > 65:  grades.append('D')
    elif row > 60:  grades.append('D-')
    else:           grades.append('Failed')
# Create a column from the list
df['grades'] = grades
print(); print(df)


   student_name  test_score  grades
0        Miller        76.0       C
1      Jacobson        88.0       B
2          Bali        84.0      B-
3        Milner        67.0       D
4         Cooze        53.0  Failed
5         Jacon        96.0       A
6        Ryaner        64.0      D-
7          Sone        91.0      A-
8         Sloan        77.0       C
9         Piger        73.0      C-
10        Riani        52.0  Failed
11          Ali         NaN  Failed


In [41]:
## How to sort rows within a Pandas DataFrame
# load libraries
import pandas as pd

# Create dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [1, 2, 1, 2, 3],
        'coverage': [2, 2, 3, 3, 3]}

df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
print(); print(df)

# Sort the dataframe’s rows by reports, in descending order
print(); print(df.sort_values(by = 'reports',ascending=False))


             name  year  reports  coverage
Cochice     Jason  2012        1         2
Pima        Molly  2012        2         2
Santa Cruz   Tina  2013        1         3
Maricopa     Jake  2014        2         3
Yuma          Amy  2014        3         3

             name  year  reports  coverage
Yuma          Amy  2014        3         3
Pima        Molly  2012        2         2
Maricopa     Jake  2014        2         3
Cochice     Jason  2012        1         2
Santa Cruz   Tina  2013        1         3


In [42]:
# Sort the dataframe’s rows by reports, in descending order
print();print(df.sort_values(by = 'reports',ascending=True))


             name  year  reports  coverage
Cochice     Jason  2012        1         2
Santa Cruz   Tina  2013        1         3
Pima        Molly  2012        2         2
Maricopa     Jake  2014        2         3
Yuma          Amy  2014        3         3


In [48]:
# Sort the dataframe’s rows by coverage and then by reports, in ascending order
print();print(df)

print();print(df.sort_values(by = ['coverage','reports'],ascending =True))


             name  year  reports  coverage
Cochice     Jason  2012        1         2
Pima        Molly  2012        2         2
Santa Cruz   Tina  2013        1         3
Maricopa     Jake  2014        2         3
Yuma          Amy  2014        3         3

             name  year  reports  coverage
Cochice     Jason  2012        1         2
Pima        Molly  2012        2         2
Santa Cruz   Tina  2013        1         3
Maricopa     Jake  2014        2         3
Yuma          Amy  2014        3         3


In [50]:
# load libraries
import pandas as pd

# Create dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}

df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
print(); print(df)

# Create a new column that is the rank of the value of coverage in ascending order
df['coverageRanked']=df['coverage'].rank(ascending=True)
print(); print(df)


             name  year  reports  coverage
Cochice     Jason  2012        4        25
Pima        Molly  2012       24        94
Santa Cruz   Tina  2013       31        57
Maricopa     Jake  2014        2        62
Yuma          Amy  2014        3        70


In [2]:
## How to JOIN and MERGE Pandas DataFrame
# load libraries
import pandas as pd

# Create a dataframe
raw_data = {'subject_id': ['1', '2', '3', '4', '5'],
            'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
            'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
print(); print(df_a)

# Create a second dataframe
raw_data = {'subject_id': ['4', '5', '6', '7', '8'],
            'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
            'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
print(); print(df_b)

# Create a third dataframe
raw_data = {'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
            'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
print(); print(df_n)



  subject_id first_name last_name
0          1       Alex  Anderson
1          2        Amy  Ackerman
2          3      Allen       Ali
3          4      Alice      Aoni
4          5     Ayoung   Atiches

  subject_id first_name last_name
0          4      Billy    Bonder
1          5      Brian     Black
2          6       Bran   Balwner
3          7      Bryce     Brice
4          8      Betty    Btisan

  subject_id  test_id
0          1       51
1          2       15
2          3       15
3          4       61
4          5       16
5          7       14
6          8       15
7          9        1
8         10       61
9         11       16


In [6]:
# Join the two dataframes along rows
df_new = pd.concat([df_a,df_b])
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [7]:
df_new1 = pd.concat([df_a,df_b],axis=1)
df_new1

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


In [10]:
# Merge two dataframes with both the left and right dataframes using the subject_id key
df_merge = pd.merge(df_new,df_n,left_on='subject_id',right_on='subject_id')
df_merge

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


In [11]:
print(); print(df_n)


  subject_id  test_id
0          1       51
1          2       15
2          3       15
3          4       61
4          5       16
5          7       14
6          8       15
7          9        1
8         10       61
9         11       16


In [12]:
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [13]:
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [14]:
# Merge with outer join
df_outer = pd.merge(df_a,df_b,on = 'subject_id',how='outer')
df_outer

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan


In [15]:
# Merge with inner join
df_inner = pd.merge(df_a,df_b,on='subject_id',how='inner')
df_inner

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [16]:
# Merge with left right
df_right = pd.merge(df_a,df_b,on='subject_id',how='right')
df_right

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black
2,6,,,Bran,Balwner
3,7,,,Bryce,Brice
4,8,,,Betty,Btisan


In [17]:
# Merge with left join
df_left = pd.merge(df_a,df_b,on='subject_id',how='left')
df_left

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


In [19]:
 # Merge while adding a suffix to duplicate column names
df_suffix = pd.merge(df_a,df_b,suffixes=('_left','_right'),on = 'subject_id',how='left')
df_suffix

Unnamed: 0,subject_id,first_name_left,last_name_left,first_name_right,last_name_right
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


# How to list unique values in a Pandas DataFrame?


In [21]:

## How to list unique values in a Pandas DataFrame

# load libraries
import pandas as pd

# Set ipython's max row display
pd.set_option('display.max_row', 1000)

# Set iPython's max column width to 50
pd.set_option('display.max_columns', 50)

# Create an example dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Tina', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz',
                                 'Maricopa', 'Yuma'])
print(); print(df)

# List unique values in the df['name'] column
print(); print(df['name'].unique())


             name  year  reports
Cochice     Jason  2012        4
Pima        Molly  2012       24
Santa Cruz   Tina  2013       31
Maricopa     Tina  2014        2
Yuma          Amy  2014        3

['Jason' 'Molly' 'Tina' 'Amy']


# How to filter in a Pandas DataFrame?

In [23]:
# load libraries
import pandas as pd

# Create Dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}

df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
print(); print(df)

# View Column
print(); print(df['name'])


             name  year  reports  coverage
Cochice     Jason  2012        4        25
Pima        Molly  2012       24        94
Santa Cruz   Tina  2013       31        57
Maricopa     Jake  2014        2        62
Yuma          Amy  2014        3        70

Cochice       Jason
Pima          Molly
Santa Cruz     Tina
Maricopa       Jake
Yuma            Amy
Name: name, dtype: object


In [25]:
# View Two Columns
print(); print(df[['name','reports']])


             name  reports
Cochice     Jason        4
Pima        Molly       24
Santa Cruz   Tina       31
Maricopa     Jake        2
Yuma          Amy        3


In [26]:
# View First Two Rows
print(); print(df[:2])


          name  year  reports  coverage
Cochice  Jason  2012        4        25
Pima     Molly  2012       24        94


In [32]:
# View Rows Where Coverage Is Greater Than 50
print(); print(df[df['coverage']>50])


             name  year  reports  coverage
Pima        Molly  2012       24        94
Santa Cruz   Tina  2013       31        57
Maricopa     Jake  2014        2        62
Yuma          Amy  2014        3        70


In [43]:
# View Rows Where Coverage Is Greater Than 50 And Reports Less Than 4
print();print(df[(df['coverage']>50) & (df['reports']<4)])


          name  year  reports  coverage
Maricopa  Jake  2014        2        62
Yuma       Amy  2014        3        70


# How to delete duplicates from a Pandas DataFrame?


In [46]:
# Create dataframe with duplicates
raw_data = {'first_name': ['Jason', 'Jason', 'Jason','Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', 'Miller', 'Miller','Ali', 'Milner', 'Cooze'],
            'age': [42, 42, 1111111, 36, 24, 73],
            'preTestScore': [4, 4, 4, 31, 2, 3],
            'postTestScore': [25, 25, 25, 57, 62, 70]}

df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age',
                                       'preTestScore', 'postTestScore'])
print(); print(df)

# Identify which observations are duplicates
print(); print(df[df.duplicated()])


  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
1      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70

  first_name last_name  age  preTestScore  postTestScore
1      Jason    Miller   42             4             25


In [47]:
# Identify which observations are duplicates
print(); print(df.drop_duplicates(keep='first'))


  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70


In [48]:
# Identify which observations are duplicates
print(); print(df.drop_duplicates(keep='last'))


  first_name last_name      age  preTestScore  postTestScore
1      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70


In [None]:
# Drop duplicates in the first name column, but take the last obs in the duplicated set

In [52]:
print();print(df)
print(); print(df.drop_duplicates(['first_name'],keep='last'))


  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
1      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70

  first_name last_name      age  preTestScore  postTestScore
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70


# How to select rows with multiple filters?

In [58]:
# Create an example dataframe
data = {'name': ['A', 'B', 'C', 'D', 'E'],
        'score': [1,2,3,4,5]}

df = pd.DataFrame(data)
print(); print(df)

# Select rows of the dataframe where df.score is greater than 1 and less and 5
print(); print(df[(df['score']>1) & (df['score']<5)])


  name  score
0    A      1
1    B      2
2    C      3
3    D      4
4    E      5

  name  score
1    B      2
2    C      3
3    D      4


# How to rank a Pandas DataFrame?

In [62]:
# load libraries
import pandas as pd

# Create dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}

df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
print(); print(df)

# Create a new column that is the rank of the value of coverage in ascending order
df['new_coverage'] = df['coverage'].rank(ascending=True)

print(); print(df)



             name  year  reports  coverage
Cochice     Jason  2012        4        25
Pima        Molly  2012       24        94
Santa Cruz   Tina  2013       31        57
Maricopa     Jake  2014        2        62
Yuma          Amy  2014        3        70

             name  year  reports  coverage  new_coverage
Cochice     Jason  2012        4        25           1.0
Pima        Molly  2012       24        94           5.0
Santa Cruz   Tina  2013       31        57           2.0
Maricopa     Jake  2014        2        62           3.0
Yuma          Amy  2014        3        70           4.0


In [66]:
# Create a new column that is the rank of the value of coverage in descending order
df['new_covearge1'] = df['coverage'].rank(ascending = False)
print(); print(df)


             name  year  reports  coverage  new_coverage  new_covearge1  \
Cochice     Jason  2012        4        25           1.0            5.0   
Pima        Molly  2012       24        94           5.0            1.0   
Santa Cruz   Tina  2013       31        57           2.0            4.0   
Maricopa     Jake  2014        2        62           3.0            3.0   
Yuma          Amy  2014        3        70           4.0            2.0   

            new_covearge  
Cochice              5.0  
Pima                 1.0  
Santa Cruz           4.0  
Maricopa             3.0  
Yuma                 2.0  


# How to JOIN and MERGE Pandas DataFrame?

In [69]:
# Create a dataframe
raw_data = {'subject_id': ['1', '2', '3', '4', '5'],
            'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
            'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
print(); print(df_a)

# Create a second dataframe
raw_data = {'subject_id': ['4', '5', '6', '7', '8'],
            'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
            'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
print(); print(df_b)

# Create a third dataframe
raw_data = {'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
            'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
print(); print(df_n)

# Join the two dataframes along rows
df_join = pd.concat([df_a,df_b])
print(); print(df_join)


  subject_id first_name last_name
0          1       Alex  Anderson
1          2        Amy  Ackerman
2          3      Allen       Ali
3          4      Alice      Aoni
4          5     Ayoung   Atiches

  subject_id first_name last_name
0          4      Billy    Bonder
1          5      Brian     Black
2          6       Bran   Balwner
3          7      Bryce     Brice
4          8      Betty    Btisan

  subject_id  test_id
0          1       51
1          2       15
2          3       15
3          4       61
4          5       16
5          7       14
6          8       15
7          9        1
8         10       61
9         11       16

  subject_id first_name last_name
0          1       Alex  Anderson
1          2        Amy  Ackerman
2          3      Allen       Ali
3          4      Alice      Aoni
4          5     Ayoung   Atiches
0          4      Billy    Bonder
1          5      Brian     Black
2          6       Bran   Balwner
3          7      Bryce     Brice
4     

In [75]:
# Join the two dataframes along columns
df_col = pd.concat([df_a,df_b],axis =1)
print(df_col)

  subject_id first_name last_name subject_id first_name last_name
0          1       Alex  Anderson          4      Billy    Bonder
1          2        Amy  Ackerman          5      Brian     Black
2          3      Allen       Ali          6       Bran   Balwner
3          4      Alice      Aoni          7      Bryce     Brice
4          5     Ayoung   Atiches          8      Betty    Btisan


In [79]:
# Merge two dataframes along the subject_id value
df_merge = pd.merge(df_a,df_b,on='subject_id')
print(df_merge)

  subject_id first_name_x last_name_x first_name_y last_name_y
0          4        Alice        Aoni        Billy      Bonder
1          5       Ayoung     Atiches        Brian       Black


In [85]:
# Merge two dataframes with both the left and right dataframes using the subject_id key
pd_left_right = pd.merge(df_join,df_n,left_on='subject_id',right_on='subject_id')
print(pd_left_right)

  subject_id first_name last_name  test_id
0          1       Alex  Anderson       51
1          2        Amy  Ackerman       15
2          3      Allen       Ali       15
3          4      Alice      Aoni       61
4          4      Billy    Bonder       61
5          5     Ayoung   Atiches       16
6          5      Brian     Black       16
7          7      Bryce     Brice       14
8          8      Betty    Btisan       15


In [91]:
# Merge with outer join
df_outer = pd.merge(df_a,df_b,on ='subject_id',how='outer')
print();print(df_outer)




  subject_id first_name_x last_name_x first_name_y last_name_y
0          1         Alex    Anderson          NaN         NaN
1          2          Amy    Ackerman          NaN         NaN
2          3        Allen         Ali          NaN         NaN
3          4        Alice        Aoni        Billy      Bonder
4          5       Ayoung     Atiches        Brian       Black
5          6          NaN         NaN         Bran     Balwner
6          7          NaN         NaN        Bryce       Brice
7          8          NaN         NaN        Betty      Btisan


In [92]:
# Merge with inner join
df_inner= pd.merge(df_a,df_b,on ='subject_id',how='inner')
print();print(df_inner)


  subject_id first_name_x last_name_x first_name_y last_name_y
0          4        Alice        Aoni        Billy      Bonder
1          5       Ayoung     Atiches        Brian       Black


# How to JOIN and MERGE Pandas DataFrame?

In [94]:
#load Libraries
from datetime import datetime
from dateutil.parser import parse
import pandas as pd

# Create a string variable with a datetime
date_start = '2012-03-03'

In [95]:
# Convert the string to datetime format
print()
print(datetime.strptime(date_start, '%Y-%m-%d'))


2012-03-03 00:00:00


In [96]:
# Create a list of strings as dates
dates = ['7/2/2017', '8/6/2016', '11/13/2015', '5/26/2014', '5/2/2013']
# Use parse() to attempt to auto-convert common string formats
print()
print(parse(date_start))


2012-03-03 00:00:00


In [98]:
print()
print([parse(x) for x in dates])
# Use parse, but designate that the day is first
print()
print(parse(date_start, dayfirst=True))


[datetime.datetime(2017, 7, 2, 0, 0), datetime.datetime(2016, 8, 6, 0, 0), datetime.datetime(2015, 11, 13, 0, 0), datetime.datetime(2014, 5, 26, 0, 0), datetime.datetime(2013, 5, 2, 0, 0)]

2012-03-03 00:00:00


In [None]:
# Create a dataframe
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994',
                 '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071',
                 '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592',
                 '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109',
                 '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
        'value': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
df = pd.DataFrame(data, columns = ['date', 'value'])
print(df.dtypes)
# Convert df['date'] from string to datetime
print()
print(pd.to_datetime(df['date']))
print(pd.to_datetime(df['date']).dtypes)