In [2]:
# Load the flights table so the data types of its DataFrame / Series can be inspected

import pandas as pd

flights = pd.read_csv('flights.csv')
flights.head(5)

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T10:00:00Z
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T10:00:00Z
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T10:00:00Z
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T10:00:00Z
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T11:00:00Z


In [17]:
# Spot-check column dtypes.
# A notebook cell only echoes the value of its LAST expression, so every
# earlier lookup is printed explicitly; left as bare expressions they would be
# evaluated and silently thrown away.
# `flights.dtypes` is a Series keyed by column name, so a list of names selects
# the dtypes of several columns at once.

# int64
print(flights.dtypes[['year', 'sched_dep_time', 'hour', 'day']])

# float64
print(flights.dtypes[['dep_time', 'arr_delay']])

# 'O' Python Object
# note: columns of strings do not have their own type, they are given 'object' type
print(flights.dtypes[['tailnum', 'origin', 'dest', 'time_hour']])

# Check dtype for ALL columns
print(flights.dtypes)

# pandas.core.frame.DataFrame
# (kept as the final expression, so it is still what the cell echoes)
type(flights)


pandas.core.frame.DataFrame

In [19]:
# Converting data types with .astype()
# Here the int64 scheduled-arrival column is shown as float64; the result is
# only displayed, not assigned, so `flights` keeps its original column.

flights['sched_arr_time'].astype('float64')

0          819.0
1          830.0
2          850.0
3         1022.0
4          837.0
           ...  
336771    1634.0
336772    2312.0
336773    1330.0
336774    1344.0
336775    1020.0
Name: sched_arr_time, Length: 336776, dtype: float64

In [20]:
# The row index of a DataFrame carries a dtype of its own, just like a column does.
# `flights.index` is the index object; `.dtype` reports the type of its labels.

flights.index.dtype

dtype('int64')

In [27]:
# MISSING DATA

# Build a small throwaway DataFrame whose single column mixes numbers with NaN gaps

import numpy as np

data = {
    'set_of_numbers': [
        1, 2, 3, 4, 5, np.nan,
        6, 7, np.nan, np.nan,
        8, 9, 10, np.nan,
    ],
}
df = pd.DataFrame(data, columns=list(data))

print(df)

    set_of_numbers
0              1.0
1              2.0
2              3.0
3              4.0
4              5.0
5              NaN
6              6.0
7              7.0
8              NaN
9              NaN
10             8.0
11             9.0
12            10.0
13             NaN


In [29]:
# Boolean-mask filtering on missing values.
# A notebook cell only echoes its final expression, so the first selection is
# printed explicitly; as a bare expression its result would be computed and
# then discarded without ever being shown.

# retrieve all rows that have NaN values - pd.isnull()
print(df[pd.isnull(df.set_of_numbers)])

# retrieve all rows that do NOT have NaN values - pd.notnull()
# (final expression: this is the frame the cell echoes)
df[pd.notnull(df.set_of_numbers)]

Unnamed: 0,set_of_numbers
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
6,6.0
7,7.0
10,8.0
11,9.0
12,10.0


In [31]:
# Fill every missing (NaN) slot with the placeholder string "Unknown".
# The filled Series is only displayed; nothing is assigned back to df.

df['set_of_numbers'].fillna(value='Unknown')

0           1
1           2
2           3
3           4
4           5
5     Unknown
6           6
7           7
8     Unknown
9     Unknown
10          8
11          9
12         10
13    Unknown
Name: set_of_numbers, dtype: object

In [33]:
# .replace() swaps one existing (non-null) value for another:
# here every 10.0 becomes the string "ten"

df['set_of_numbers'].replace(to_replace=10.0, value="ten")

0       1
1       2
2       3
3       4
4       5
5     NaN
6       6
7       7
8     NaN
9     NaN
10      8
11      9
12    ten
13    NaN
Name: set_of_numbers, dtype: object