# Pandas: grouping

In [1]:
import pandas as pd
import numpy as np

In [2]:
def col_rename(df, dict_rules, ip=True):
    ''' rename columns in data frame using a dictionary of rules  '''
    if isinstance(df,pd.core.frame.DataFrame) and isinstance(dict_rules,dict):
        z = df.rename(columns = dict_rules, inplace=ip)
    else:
        raise TypeError
    return z
#
def print_unique(df,col):
    ''' gives a list of unique values in a field '''
    if isinstance(df,pd.core.frame.DataFrame) and isinstance(col,pd.core.frame.Series):
        z = df[col].unique()
    else:
        raise TypeError
    return z
#
def lower_case_column_names(df):
    ''' make columns lower case '''
    if isinstance(df,pd.core.frame.DataFrame):
        df.columns= [i.lower() for i in df.columns]
        df.columns = df.columns.str.replace(' ', '_')
    else: 
        raise TypeError
    return df

In [3]:
cars_df = pd.read_csv("data/vehicles.csv")

In [4]:
cars_df.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [5]:
lower_case_column_names(cars_df).head(1)

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels/year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams/mile,fuel_cost/year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950


How many Car models? 

In [6]:
num_models=len(cars_df.groupby('model').groups)
print("The number of models is ", num_models)

The number of models is  3608


group by the data by the Make  using count function

In [7]:
cars_by_make_ds = cars_df.groupby('make').count()['model']

Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Grams/Mile * Mile/Km -> Grams/Mile * 1 Mile/1.60934Km

$$ \frac{Grams}{Mile} * \frac{Mile}{Km} $$

$$ \frac{Grams}{Mile} * \frac{1 Mile}{1.60934Km}  $$

convert MPG columns to km_per_liter

MPG = Miles/Gallon -> Km/Liter

1 Mile = 1.60934 Km

1 Gallon = 3.78541 Liters

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{Km}{Miles} * \frac{Gallon}{Liters}$$

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{1.60934Km}{ 1Miles} * \frac{1 Gallon}{3.78541 Liters}$$

* ( 1.60934 / 3.78541 )


What brand has the most cars?

In [8]:
cars_by_make_ds.sort_values(ascending=False).head(1)

make
Chevrolet    3643
Name: model, dtype: int64

<b>show the average CO2_Emission_Grams/Km  by Brand

In [9]:
mi_to_km = 1.60934
col_rename(cars_df,{'co2_emission_grams/mile':'co2_emission_grams/km'})
cars_df['co2_emission_grams/km'] = cars_df['co2_emission_grams/km']/mi_to_km
cars_df.head()

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels/year,city_mpg,highway_mpg,combined_mpg,co2_emission_grams/km,fuel_cost/year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,324.831736,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779962,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,345.133719,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779962,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,345.133719,2550


In [14]:
cars_df.groupby('make').agg(avg_co2_per_gm= \
                            ('co2_emission_grams/km','mean')).sort_values(\
                                'avg_co2_per_gm', ascending=False).apply(np.trunc)

Unnamed: 0_level_0,avg_co2_per_gm
make,Unnamed: 1_level_1
Vector,651.0
Superior Coaches Div E.p. Dutton,552.0
S and S Coach Company E.p. Dutton,552.0
Bugatti,542.0
Laforza Automobile Inc,502.0
...,...
MINI,194.0
Daihatsu,192.0
Fiat,189.0
smart,153.0


In [37]:
mpg_to_kmL = 1.60934/3.78541
col_rename(cars_df,{'city_mpg':'city_kmL'})
cars_df['city_kmL'] = cars_df['city_kmL']*mpg_to_kmL
cars_df.head()

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels/year,city_kmL,highway_mpg,combined_mpg,co2_emission_grams/km,fuel_cost/year,decades
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,7.652571,17,17,324.831736,1950,80s
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526857,13,13,424.779962,2550,80s
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,6.802286,17,16,345.133719,2100,80s
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526857,13,13,424.779962,2550,80s
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,5.952,21,16,345.133719,2550,80s


# (Optional) 

Use `pd.cut` or `pd.qcut` to create 4 groups (bins) of cars, by Year. We want to explore how cars have evolved decade by decade.

In [16]:
cars_df['year'].describe()

count    35952.00000
mean      2000.71640
std         10.08529
min       1984.00000
25%       1991.00000
50%       2001.00000
75%       2010.00000
max       2017.00000
Name: year, dtype: float64

In [19]:
cars_df['year']

0        1984
1        1984
2        1985
3        1985
4        1987
         ... 
35947    2013
35948    2014
35949    2015
35950    2016
35951    2016
Name: year, Length: 35952, dtype: int64

In [25]:
decade_bins = [1980+i for i in range(0,40,10)]
decade_bins

[1980, 1990, 2000, 2010]

In [29]:
pd.cut(cars_df,

<pandas.core.groupby.generic.SeriesGroupBy object at 0x1623af6d0>

In [34]:
cars_df["decades"] = pd.qcut(cars_df["year"], 
                             q=4,
                             labels=["80s", "90s", "00s", "10s"])
cars_df.groupby("decades")["year"].max()

decades
80s    1991
90s    2001
00s    2010
10s    2017
Name: year, dtype: int64

### Did cars consume more gas in the eighties?

show the average City_Km/Liter by year_range

In [38]:
cars_df.groupby("decades")["city_kmL"].mean()

decades
80s    7.326102
90s    7.210326
00s    7.208441
10s    8.394430
Name: city_kmL, dtype: float64

Which brands are more environment friendly?

In [40]:
cars_df

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels/year,city_kmL,highway_mpg,combined_mpg,co2_emission_grams/km,fuel_cost/year,decades
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,7.652571,17,17,324.831736,1950,80s
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526857,13,13,424.779962,2550,80s
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,6.802286,17,16,345.133719,2100,80s
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.526857,13,13,424.779962,2550,80s
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,5.952000,21,16,345.133719,2550,80s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.454857,38,36,151.614948,1100,10s
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.454857,38,36,150.993575,1100,10s
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.454857,38,36,151.614948,1100,10s
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.454857,39,36,152.857693,1100,10s


In [59]:
cars_df.groupby(["make","decades"],observed=True).mean()['co2_emission_grams/km'].sort_values()#.dropna()

make          decades
Fisker        10s        105.011992
smart         00s        153.392764
              10s        153.543176
Daihatsu      80s        189.308267
Fiat          10s        189.311494
                            ...    
Bugatti       00s        552.213951
Aston Martin  80s        605.564292
Rolls-Royce   80s        615.185717
Vector        90s        651.919248
Lamborghini   80s        721.259038
Name: co2_emission_grams/km, Length: 271, dtype: float64

Does the drivetrain affect fuel consumption?

In [54]:
cars_df.groupby('drivetrain').mean()[['fuel_cost/year','city_kmL']]

Unnamed: 0_level_0,fuel_cost/year,city_kmL
drivetrain,Unnamed: 1_level_1,Unnamed: 2_level_1
2-Wheel Drive,2115.01182,6.64248
"2-Wheel Drive, Front",1450.0,10.628571
4-Wheel Drive,1971.644612,7.190861
4-Wheel or All-Wheel Drive,2165.023835,6.392049
All-Wheel Drive,1863.217263,7.785598
Front-Wheel Drive,1503.131708,9.002214
Part-time 4-Wheel Drive,2131.64557,6.215696
Rear-Wheel Drive,2140.397611,6.556574


Do cars with automatic transmission consume more fuel than cars with manual transmission?

Use `groupby` and `agg` with different aggregation measures for different columns:

aggregate with average City_Km/Liter and the count of the Trans

In [69]:
cars_df.groupby('transmission').count().agg('co2_emission_grams/km').mean().round(0)

799.0

aggregate with average City_Km/Liter and the minimum of the Trans

In [70]:
cars_df.groupby('transmission').count().agg('co2_emission_grams/km').min()

1