In [1]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

##### In a two dimensional numpy array the way to index data is first by row and then by column.
There is no column names or label indexes in 2D numpy arrays as there is in pandas Dataframes.

In [6]:
# Accessing elements
print(ridership[1, 3])
print(ridership[1:3, 3:5])
print(ridership[1, :])

2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


##### Vectorized operations must be made either by equal length rows or equal length columns.

In [7]:
# Vectorized operations on rows or columns
print(ridership[0, :] + ridership[1, :])
print(ridership[:, 0] + ridership[:, 1])

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [8]:
# Vectorized operations on entire arrays
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print(a + b)

[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [16]:
np.argmax(ridership[0]) # Get the index of the maximum value in the spcified array's row

3

In [15]:
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    overall_mean = ridership.mean()
    mean_for_max = ridership[:, np.argmax(ridership[0])].mean()
    
    return (overall_mean, mean_for_max)

mean_riders_for_max_station(ridership)

(2342.6, 3239.9)

### Numpy Axis

In [17]:
# NumPy axis argument
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

##### Axis 0 is vertically and axis 1 is horizontally

In [19]:
print(a.sum())
print(a.sum(axis=0)) # Sum is made on the columns
print(a.sum(axis=1)) # Sum is made on the rows

45
[12 15 18]
[ 6 15 24]


In [30]:
import my_decorators as mydec

# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

@mydec.timer
def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day. Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    station = 0
    means = list()
    while station < len(ridership[0]):
        means.append(ridership[:, station].mean())
        station += 1

    max_daily_ridership = max(means)
    min_daily_ridership = min(means)
    
    return (max_daily_ridership, min_daily_ridership)

min_and_max_riders_per_day(np.array(ridership))

Execution time: 0.0


(3239.9, 1071.2)

In [32]:
@mydec.timer
def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day. Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    means = ridership.mean(axis=0)
    max_daily_ridership = max(means)
    min_daily_ridership = min(means)
    
    return (max_daily_ridership, min_daily_ridership)

min_and_max_riders_per_day(ridership)

Execution time: 0.0


(3239.9, 1071.2)

##### Accessing elements of a dataframe

In [46]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

In [47]:
# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)

# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)

   A  B
0  0  3
1  1  4
2  2  5
   A  B  C
0  0  1  2
1  3  4  5


In [48]:
# Accessing elements
print(ridership_df.iloc[0]) # Access row by index
print(ridership_df.loc['05-05-11']) # Access row by index label
print(ridership_df['R003']) # Access column
print(ridership_df.iloc[1, 3]) # Access first row then column

R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64
R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64
05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64
2328


In [49]:
print(ridership_df.iloc[1:4])

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613


In [50]:
print(ridership_df[['R003', 'R005']])

          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


In [51]:
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df.sum())
print(df.sum(axis=1))
print(df.values.sum())

A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
15


In [65]:
ridership_df.iloc[0].idxmax() # Get the column name with the max element in the list

'R006'

In [74]:
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    # Get the station "column" with the maximum riders on the first day
    max_rid_first_day = ridership.iloc[0].idxmax()
    # Get the mean across both axes
    overall_mean = ridership.mean(axis=None)
    # Get the mean of that station with max riders on 1st day
    mean_for_max = ridership[max_rid_first_day].mean()
    
    return (overall_mean, mean_for_max)

mean_riders_for_max_station(ridership_df)

(2342.6, 3239.9)

##### Loading data into a DataFrame
Calculating correlation

In [79]:
x = pd.Series([1, 2, 3, 4])
y = pd.Series([10, 11, 12, 13])

mean_x = x.mean()
mean_y = y.mean()

diff_x = x - mean_x
diff_y = y - mean_y

sum_product = (diff_x * diff_y).sum()

sum_sq_x = (diff_x ** 2).sum()
sum_sq_y = (diff_y ** 2).sum()

correlation = sum_product / (sum_sq_x * sum_sq_y)**0.5
correlation

1.0

##### DataFrame vectorized operations

In [82]:
# Examples of vectorized operations on DataFrames:

# Adding DataFrames with the column names
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print(df1 + df2)

    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99


In [83]:
# Adding DataFrames with overlapping column names 
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print(df1 + df2)

    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN


In [84]:
# Adding DataFrames with overlapping row indexes
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row4', 'row3', 'row2'])
print(df1 + df2)

         a     b     c
row1   NaN   NaN   NaN
row2  32.0  65.0  98.0
row3  23.0  56.0  89.0
row4   NaN   NaN   NaN


In [86]:
# Cumulative entries and exits for one station for a few hours.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits (entries in the first column, exits in the second) and
    return a DataFrame with hourly entries and exits (entries in the
    first column, exits in the second).
    '''
    print(entries_and_exits.diff())
    return entries_and_exits['ENTRIESn'] - entries_and_exits['EXITSn']
    
get_hourly_entries_and_exits(entries_and_exits)

   ENTRIESn  EXITSn
0       NaN     NaN
1      23.0     8.0
2      18.0    18.0
3      71.0    54.0
4     170.0    44.0
5     214.0    42.0
6      87.0    11.0
7      10.0     3.0
8      36.0    89.0
9     153.0   333.0


0    2056161
1    2056176
2    2056176
3    2056193
4    2056319
5    2056491
6    2056567
7    2056574
8    2056521
9    2056341
dtype: int64

##### DataFrame map()
Runs a function that accepts and return a scalar value. Therefore when the map method is called from a pandas dataframe it behaves on every element in a DataFrame.

In [52]:
import pandas as pd

# DataFrame map()
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})
    
def add_one(x):
    return x + 1
    
print(df.map(add_one))

   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16


In [91]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

letters = 'A', 'B', 'C', 'D', 'F'
ranges = ((90, 100), (80, 89), (70, 79), (60, 69), (0, 59))
    
def convert_grades(grades):
    '''
    Fill in this function to convert the given DataFrame of numerical
    grades to letter grades. Return a new DataFrame with the converted
    grade.
    
    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    # As pd.map() method works with each DataFrame item at a time, we defined a helper function 
    # that receives one scalar value and returns it's category based on that value.
    return grades.map(evaluate_grade)

# Helper function
def evaluate_grade(grade: int) -> str:
    '''Evaluates whether a value is within a range and return it's category.'''
    if grade >= 90: return 'A'
    elif grade >= 80: return 'B'
    elif grade >= 70: return 'C'
    elif grade >= 60: return 'D'
    elif grade >= 0: return 'F'

convert_grades(grades_df)

#grades_df.map(convert_grades)

Unnamed: 0,exam1,exam2
Andre,F,F
Barry,B,D
Chris,C,F
Dan,C,F
Emilio,B,D
Fred,C,F
Greta,A,C
Humbert,D,F
Ivan,A,C
James,B,D


##### DataFrame apply()

In [53]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# DataFrame apply() - use case 2
print(df.apply(np.mean))
print(df.apply(np.max))

a     3.0
b    30.0
c    15.0
dtype: float64
a     5
b    50
c    25
dtype: int64


In [54]:
def second_largest(df: pd.DataFrame) -> int:
    '''
    Return the second-largest value of  
    the DataFrame's column.
    '''
    for col in df.columns:
        df[col] = df[col].drop(df[col].argmax())
    return df.apply(np.max)

# Apply works on each column of the dataframe. It means the function passed into the 
# apply function must work with one dimensional arrays (numpy arrays, lists, pandas series)
second_largest(df)

a     4.0
b    40.0
c    20.0
dtype: float64

In [67]:
import pandas as pd

# Change False to True for each block of code to see what it does

# Adding a Series to a square DataFrame
# The series index is mapped with the column names.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print(df)
    print('') # Create a blank line between outputs
    print(df + s)
    
# Adding a Series to a one-row DataFrame 
# The series index is mapped with the column names.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    
    print(df)
    print('') # Create a blank line between outputs
    print(df + s)

# Adding a Series to a one-column DataFrame
# As the index series is mapped to a dataframe with it's columns, just column '0' in dataframe gets added.All
# the other columns get NaN values.
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    
    print(df)
    print('') # Create a blank line between outputs
    print(df + s)
    

    
# Adding when DataFrame column names match Series index
# The series is added at the index level. Axis 1
if False:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print(df)
    print('') # Create a blank line between outputs
    print(df + s)
    
# Adding when DataFrame column names don't match Series index
# No value is operated an all elements get NaN
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print(df)
    print('') # Create a blank line between outputs
    print(df + s)

##### Pandas groupby()

In [92]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Examine DataFrame
print(example_df)

   value   even  above_three
a      1  False        False
b      3  False        False
c      2   True        False
d      4   True         True
e      1  False        False
f      6   True         True
g      4   True         True


In [103]:
print(example_df.groupby('even'))
print(example_df.groupby('even')['value'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017A8491AF50>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017A83BD8850>


In [93]:
# Examine groups
grouped_data = example_df.groupby('even')
# The groups attribute is a dictionary mapping keys to lists of row indexes
print(grouped_data.groups)

{False: ['a', 'b', 'e'], True: ['c', 'd', 'f', 'g']}


In [94]:
# Group by multiple columns
grouped_data = example_df.groupby(['even', 'above_three'])
print(grouped_data.groups)

{(False, False): ['a', 'b', 'e'], (True, False): ['c'], (True, True): ['d', 'f', 'g']}


In [95]:
# Get sum of each group
grouped_data = example_df.groupby('even')
print(grouped_data.sum())

       value  above_three
even                     
False      5            0
True      16            3


In [97]:
grouped_data = example_df.groupby('even')

# You can take one or more columns from the result DataFrame
print(grouped_data.sum()['value'])

print('\n') # Blank line to separate results

# You can also take a subset of columns from the grouped data before 
# collapsing to a DataFrame. In this case, the result is the same.
print(grouped_data['value'].sum())

even
False     5
True     16
Name: value, dtype: int32


even
False     5
True     16
Name: value, dtype: int32


In [101]:
grph_dsgn = pd.read_csv('./Datasets/GraphicDesign.csv')

grph_dsgn.head()
grph_dsgn.groupby('isPaid')['numSubscribers'].sum()

isPaid
False    284821
True     778327
Name: numSubscribers, dtype: int64

##### Quiz

In [None]:
import numpy as np
import pandas as pd

values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Standardize each group
def standardize(xs):
    return (xs - xs.mean()) / xs.std()
grouped_data = example_df.groupby('even')
print(grouped_data['value'].apply(standardize))
    
# Find second largest value in each group
def second_largest(xs):
    sorted_xs = xs.sort(inplace=False, ascending=False)
    return sorted_xs.iloc[1]
grouped_data = example_df.groupby('even')
print(grouped_data['value'].apply(second_largest))

In [105]:
# DataFrame with cumulative entries and exits for multiple stations
ridership_df = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385,  1088159, 13755393,  1088177, 13755598, 1088231, 13756191,  1088275]
})

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Fill in this function to take a DataFrame with cumulative entries
    and exits and return a DataFrame with hourly entries and exits.
    The hourly entries and exits should be calculated separately for
    each station (the 'UNIT' column).
    
    Hint: Take a look at the `get_hourly_entries_and_exits()` function
    you wrote in a previous quiz, DataFrame Vectorized Operations. If
    you copy it here and rename it, you can use it and the `.apply()`
    function to help solve this problem.
    '''
    group = entries_and_exits.groupby('UNIT')
    pass

In [115]:
subway_df = pd.DataFrame({
    'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
             'R004', 'R004'],
    'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
              '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'ENTRIESn': [ 4388333,  4388348,  4389885,  4391507,  4393043, 14656120,
                 14656174, 14660126, 14664247, 14668301],
    'EXITSn': [ 2911002,  2911036,  2912127,  2913223,  2914284, 14451774,
               14451851, 14454734, 14457780, 14460818],
    'latitude': [ 40.689945,  40.689945,  40.689945,  40.689945,  40.689945,
                  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ],
    'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
                  -73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
})

weather_df = pd.DataFrame({
    'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
              '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
    'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'latitude': [ 40.689945,  40.69132 ,  40.689945,  40.69132 ,  40.689945,
                  40.69132 ,  40.689945,  40.69132 ,  40.689945,  40.69132 ],
    'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
                  -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
    'pressurei': [ 30.24,  30.24,  30.32,  30.32,  30.14,  30.14,  29.98,  29.98,
                   30.01,  30.01],
    'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'tempi': [ 52. ,  52. ,  48.9,  48.9,  54. ,  54. ,  57.2,  57.2,  48.9,  48.9],
    'wspdi': [  8.1,   8.1,   6.9,   6.9,   3.5,   3.5,  15. ,  15. ,  15. ,  15. ]
})

subway_df.combine(weather_df, lambda s1, s2: s1)

Unnamed: 0,DATEn,ENTRIESn,EXITSn,UNIT,fog,hour,latitude,longitude,pressurei,rain,tempi,wspdi
0,05-01-11,4388333.0,2911002.0,R003,,0,40.689945,-73.872564,,,,
1,05-02-11,4388348.0,2911036.0,R003,,0,40.689945,-73.872564,,,,
2,05-03-11,4389885.0,2912127.0,R003,,0,40.689945,-73.872564,,,,
3,05-04-11,4391507.0,2913223.0,R003,,0,40.689945,-73.872564,,,,
4,05-05-11,4393043.0,2914284.0,R003,,0,40.689945,-73.872564,,,,
5,05-01-11,14656120.0,14451774.0,R004,,0,40.69132,-73.867135,,,,
6,05-02-11,14656174.0,14451851.0,R004,,0,40.69132,-73.867135,,,,
7,05-03-11,14660126.0,14454734.0,R004,,0,40.69132,-73.867135,,,,
8,05-04-11,14664247.0,14457780.0,R004,,0,40.69132,-73.867135,,,,
9,05-05-11,14668301.0,14460818.0,R004,,0,40.69132,-73.867135,,,,


In [113]:
help(pd.DataFrame.combine)

Help on function combine in module pandas.core.frame:

combine(self, other: 'DataFrame', func: 'Callable[[Series, Series], Series | Hashable]', fill_value=None, overwrite: 'bool' = True) -> 'DataFrame'
    Perform column-wise combine with another DataFrame.
    
    Combines a DataFrame with `other` DataFrame using `func`
    to element-wise combine columns. The row and column indexes of the
    resulting DataFrame will be the union of the two.
    
    Parameters
    ----------
    other : DataFrame
        The DataFrame to merge column-wise.
    func : function
        Function that takes two series as inputs and return a Series or a
        scalar. Used to merge the two dataframes column by columns.
    fill_value : scalar value, default None
        The value to fill NaNs with prior to passing any column to the
        merge func.
    overwrite : bool, default True
        If True, columns in `self` that do not exist in `other` will be
        overwritten with NaNs.
    
    Return