# Fixing `cyl` Data Type
- 2008: extract int from string
- 2018: convert float to int

Load datasets `data_08_v2.csv` and `data_18_v2.csv`. You should've created these data files in the previous section: *Filter, Drop Nulls, Dedupe*.

In [1]:
# load datasets
import pandas as pd
df_08 = pd.read_csv('data_08_v2.csv')
df_18 = pd.read_csv('data_18_v2.csv')

In [2]:
# check value counts for the 2008 cyl column
df_08['cyl'].value_counts()

(6 cyl)     409
(4 cyl)     283
(8 cyl)     199
(5 cyl)      48
(12 cyl)     30
(10 cyl)     14
(2 cyl)       2
(16 cyl)      1
Name: cyl, dtype: int64

Read [this](https://stackoverflow.com/questions/35376387/extract-int-from-string-in-pandas) to help you extract ints from strings in Pandas for the next step.

In [15]:
# Extract int from strings in the 2008 cyl column
df_08['cyl'] = df_08['cyl'].str.extract('([0-9]+)').astype(int)

In [16]:
# Check value counts for 2008 cyl column again to confirm the change
df_08.cyl

0       6
1       4
2       6
3       6
4       6
5       6
6       4
7       4
8      12
9      12
10     12
11     12
12      8
13      8
14      4
15      4
16      6
17      4
18      6
19      6
20      6
21      6
22      6
23      6
24      6
25      6
26      8
27      6
28      8
29      8
       ..
956     5
957     5
958     5
959     5
960     5
961     5
962     5
963     5
964     5
965     5
966     5
967     5
968     5
969     5
970     6
971     6
972     6
973     8
974     5
975     5
976     5
977     5
978     5
979     5
980     5
981     6
982     6
983     6
984     6
985     8
Name: cyl, Length: 986, dtype: int64

In [19]:
# convert 2018 cyl column to int
df_18['cyl'] = df_18['cyl'].astype(int)

In [24]:
df_18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794 entries, 0 to 793
Data columns (total 13 columns):
model                   794 non-null object
displ                   794 non-null float64
cyl                     794 non-null int64
trans                   794 non-null object
drive                   794 non-null object
fuel                    794 non-null object
veh_class               794 non-null object
air_pollution_score     794 non-null int64
city_mpg                794 non-null object
hwy_mpg                 794 non-null object
cmb_mpg                 794 non-null object
greenhouse_gas_score    794 non-null int64
smartway                794 non-null object
dtypes: float64(1), int64(3), object(9)
memory usage: 80.7+ KB


In [33]:
#2008: convert string to float, air_pollution_score column.
#2018: convert int to float.

df_18['air_pollution_score'] = df_18['air_pollution_score'].astype(float)

In [73]:
df_08['air_pollution_score'] = df_08['air_pollution_score'].astype(float, errors = 'ignore')

In [74]:
df_08.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 13 columns):
model                   986 non-null object
displ                   986 non-null float64
cyl                     986 non-null int64
trans                   986 non-null object
drive                   986 non-null object
fuel                    986 non-null object
veh_class               986 non-null object
air_pollution_score     986 non-null object
city_mpg                986 non-null object
hwy_mpg                 986 non-null object
cmb_mpg                 986 non-null object
greenhouse_gas_score    986 non-null object
smartway                986 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 100.2+ KB


In [71]:
# 2008 and 2018: convert string to float.
# city_mpg, hwy_mpg, cmb_mpg 
df_08['city_mpg'] = df_08['city_mpg'].astype(float, errors = 'ignore', inplace = True)
df_08['hwy_mpg'] = df_08['hwy_mpg'].astype(float, errors = 'ignore')
df_08['cmb_mpg'] = df_08['cmb_mpg'].astype(float, errors = 'ignore')


In [79]:
df_18['city_mpg'] = df_18['city_mpg'].astype(float, errors = 'ignore')
df_18['hwy_mpg'] = df_18['hwy_mpg'].astype(float,errors = 'ignore')
df_18['cmb_mpg'] = df_18['cmb_mpg'].astype(float, errors = 'ignore')


In [89]:
#2008: convert from float to int greenhouse_gas_score
df_08['greenhouse_gas_score'] = df_08['greenhouse_gas_score'].astype(int, errors = 'ignore')

In [88]:
df_08.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 13 columns):
model                   986 non-null object
displ                   986 non-null float64
cyl                     986 non-null int64
trans                   986 non-null object
drive                   986 non-null object
fuel                    986 non-null object
veh_class               986 non-null object
air_pollution_score     986 non-null object
city_mpg                986 non-null object
hwy_mpg                 986 non-null object
cmb_mpg                 986 non-null object
greenhouse_gas_score    986 non-null object
smartway                986 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 100.2+ KB


In [None]:
df_08.to_csv('data_08_v3.csv', index=False)
df_18.to_csv('data_18_v3.csv', index=False)