In [1]:
import pandas as pd
import numpy as np

In [None]:
# Every column is read as float32 for now. passenger_count and payment_type
# are integer-valued, but they contain NaN and so cannot be loaded as int8
# directly; they can be switched to int8 once the NaN rows are removed.
# total_amount is a genuine float column.
column_types = dict.fromkeys(
    ('passenger_count', 'payment_type', 'total_amount'),
    np.float32,
)

In [None]:
# Load only the three columns under study, parsing each one with the dtype
# listed in column_types.
df = pd.read_csv(
    '../../pandas-workout-data/data/nyc_taxi_2020-01.csv',
    dtype=column_types,
    usecols=['passenger_count', 'total_amount', 'payment_type'],
)

In [None]:
# Display the loaded frame to eyeball the three selected columns.
df

In [None]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

In [None]:
# Confirm each column was parsed with the dtype requested in column_types.
df.dtypes

In [None]:
# Number of non-NaN entries in each column; a value below the row count
# means that column has missing data.
df.notna().sum()

In [None]:
# Collapse the row axis: one boolean per column, True if that column
# contains at least one NaN.
df.isna().any(axis='index')

In [None]:
# Collapse the column axis: one boolean per row, True if that row
# contains at least one NaN.
df.isna().any(axis='columns')

In [None]:
# True if the passenger_count column (a Series) contains at least one NaN value.
df['passenger_count'].isna().any()

In [None]:
# True if the payment_type column (a Series) contains at least one NaN value.
df['payment_type'].isna().any()

In [None]:
# Tally rows containing at least one NaN (True) against complete rows (False).
(
    df.isna()
    .any(axis=1)
    .value_counts()
)

Let's remove the rows with NaN values

In [None]:
# Keep only the rows in which every column has a value
# (a row is dropped as soon as any one of its entries is NaN).
df = df.dropna(axis='index', how='any')
df

Even though df.dropna() returns a new data frame, its data may be shared with other data frames for the sake of efficiency. Modifying our data frame may thus result in a SettingWithCopyWarning. To avoid that, we can use the copy method on our data frame to ensure that there isn’t any shared data behind the scenes:

In [None]:
# Drop incomplete rows and take an independent copy, so that the column
# assignments below modify this frame's own data rather than data that may
# be shared with another frame.
df = df.dropna().copy()

# With the NaN rows gone, the two integer-valued columns can take the compact
# int8 dtype that their float32 entries in column_types were standing in for.
# NOTE(review): assumes every remaining value fits in int8 (-128..127) —
# confirm with df[['passenger_count', 'payment_type']].max() if the data changes.
df['passenger_count'] = df['passenger_count'].astype(np.int8)
df['payment_type'] = df['payment_type'].astype(np.int8)
df

If you don’t use copy, you may get the warning, which may be harmless, but it may also mean that any changes you make won’t stick.

### Beyond the exercise

Create a data frame from four other columns (VendorID, trip_distance, tip_amount, and total_amount), specifying the dtype for each. What types are most appropriate? Can you use them directly, or must you first clean the data?

In [None]:
# dtype requested for each of the four columns of the second exercise;
# all of them are read as 32-bit floats.
dtypes2 = dict(
    VendorID=np.float32,
    trip_distance=np.float32,
    tip_amount=np.float32,
    total_amount=np.float32,
)

In [None]:
# Load the four columns for the follow-up exercise, parsing each one with
# the dtype listed in dtypes2, then display the result.
df2 = pd.read_csv(
    '../../pandas-workout-data/data/nyc_taxi_2020-01.csv',
    dtype=dtypes2,
    usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],
)
df2

In [None]:
# (rows, columns) of the second frame as loaded, before any rows are dropped.
df2.shape

In [None]:
# Number of non-NaN entries in each column; a value below the row count
# means that column has missing data.
df2.notna().sum()

In [None]:
# One boolean per column: True if that column contains at least one NaN.
df2.isna().any(axis='index')

Only the `VendorID` column has null values.

In [None]:
# Discard every row that has a NaN in any column, then take an independent
# (deep) copy of what is left before displaying it.
df2 = df2.dropna(how='any').copy(deep=True)
df2

In [None]:
# Replace the VendorID column with an 8-bit integer version of itself,
# then display the frame.
df2['VendorID'] = df2['VendorID'].astype('int8')
df2