In [54]:
import pandas as pd
# Column names for the header-less data file, listed in file order.
cols = [
    # symboling and loss figures
    'symboling', 'normalized-losses',
    # make, body and drivetrain descriptors
    'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
    'drive-wheels', 'engine-location',
    # dimensions and weight
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    # engine
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    # fuel economy and price
    'city-mpg', 'highway-mpg', 'price',
]

# The file has no header row, so the column names are supplied explicitly.
auto = pd.read_csv('auto.txt', names=cols, header=None)

In [55]:
# Number of rows read from the file.
auto.shape[0]

205

#### The `normalized-losses` column contains 49 missing values (marked `?`), so removing those rows would shrink the dataset considerably. Since no loss is recorded for these rows, we assume it is 0, and the missing values are replaced with 0 here.

In [56]:
# '?' is the missing-value marker in this column. Match it literally
# (regex=False): as a regular expression a lone '?' is an invalid pattern, and
# the default of the `regex` argument differs between pandas versions.
auto['normalized-losses'] = pd.to_numeric(
    auto['normalized-losses'].str.replace('?', '0', regex=False)
)

#### For the other variables, replacing missing values with 0 would not make sense, because each variable has a different meaning. These variables also contain relatively few missing values, so we can afford to remove the affected rows directly.

In [69]:
# Keep only the rows in which no column holds the '?' missing-value marker.
# `.loc` replaces the removed `.ix` indexer and the axis is passed by keyword
# (the positional form is no longer accepted by recent pandas). `.copy()` makes
# the result an independent frame, so the column assignments in later cells do
# not trigger SettingWithCopyWarning.
auto = auto.loc[~auto.isin(['?']).any(axis=1)].copy()
len(auto)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


193

#### Convert the continuous variables that were read as text into a numeric type

In [95]:
# Columns that were read as text and are converted to numbers below.
numeric_text_cols = ['bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
auto[numeric_text_cols] = auto[numeric_text_cols].apply(pd.to_numeric)


In [99]:
# Partition the column names by dtype, preserving the frame's column order:
#   intCol - every numeric column (any integer or float width, not only
#            int64/float64), used as the aggregation targets;
#   strCol - every remaining column, used as the grouping keys.
# select_dtypes avoids positional lookups into the label-indexed `dtypes` Series.
intCol = auto.select_dtypes(include='number').columns.tolist()
strCol = auto.select_dtypes(exclude='number').columns.tolist()

#### The problem statement does not specify on what basis the aggregation functions should be applied to the different columns, so I assumed that every aggregation function is applied to all the numeric variables, grouped by all the other variables.
<br>
#### The columns are therefore divided according to their data types.

#### The aggregation functions sum, max, min and mean are applied to all the numeric columns. The resulting columns are named `variable_aggregationFunction`.

In [163]:
def aggregate_df(df,sumL,meanL,maxL,minL,gpL):
    """Group ``df`` by ``gpL`` and aggregate the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    sumL, meanL, maxL, minL : iterable of str
        Column names to aggregate with sum, mean, max and min respectively.
        A column may appear in several lists.
    gpL : list of str
        Column names to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by the grouping keys. Columns are named
        ``<column>_<function>``; they follow the column order of ``df`` and,
        within one column, the order sum, max, min, mean.

    Raises
    ------
    ValueError
        If all four column lists are empty.
    KeyError
        If a requested column is not present in ``df``.
    """
    # Fixed function order so that the output layout is stable.
    requested = (('sum', sumL), ('max', maxL), ('min', minL), ('mean', meanL))

    # Map each column to the functions requested for it.
    spec = {}
    for func, columns in requested:
        # dict.fromkeys drops duplicate names within one list while keeping
        # order; pandas rejects a repeated function for the same column.
        for col in dict.fromkeys(columns):
            spec.setdefault(col, []).append(func)

    if not spec:
        raise ValueError("no columns were requested for aggregation")

    missing = [col for col in spec if col not in df.columns]
    if missing:
        raise KeyError("columns not found in df: {}".format(missing))

    # Emit the aggregated columns in the frame's own column order.
    ordered_spec = {col: spec[col] for col in df.columns if col in spec}

    grp = df.groupby(gpL).agg(ordered_spec)
    # Flatten the (column, function) MultiIndex into "column_function" names.
    grp.columns = ["_".join(pair) for pair in grp.columns]
    return grp

In [164]:
# Apply every aggregation to all numeric columns, grouped by the remaining columns.
aggregate_df(
    df=auto,
    sumL=intCol,
    meanL=intCol,
    maxL=intCol,
    minL=intCol,
    gpL=strCol,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,symboling_sum,symboling_max,symboling_min,symboling_mean,normalized-losses_sum,normalized-losses_max,normalized-losses_min,normalized-losses_mean,wheel-base_sum,wheel-base_max,...,city-mpg_min,city-mpg_mean,highway-mpg_sum,highway-mpg_max,highway-mpg_min,highway-mpg_mean,price_sum,price_max,price_min,price_mean
make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,6,3,3,3.000000,0,0,0,0.000000,177.2,88.6,...,21,21.000000,54,27,27,27.000000,29995,16500,13495,14997.500000
alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,1,1,1,1.000000,0,0,0,0.000000,94.5,94.5,...,19,19.000000,26,26,26,26.000000,16500,16500,16500,16500.000000
audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi,2,2,2,2.000000,164,164,164,164.000000,99.4,99.4,...,18,18.000000,22,22,22,22.000000,17450,17450,17450,17450.000000
audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi,1,1,1,1.000000,158,158,158,158.000000,105.8,105.8,...,19,19.000000,25,25,25,25.000000,17710,17710,17710,17710.000000
audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi,2,2,2,2.000000,164,164,164,164.000000,99.8,99.8,...,24,24.000000,30,30,30,30.000000,13950,13950,13950,13950.000000
audi,gas,std,four,wagon,fwd,front,ohc,five,mpfi,1,1,1,1.000000,0,0,0,0.000000,105.8,105.8,...,19,19.000000,25,25,25,25.000000,18920,18920,18920,18920.000000
audi,gas,std,two,sedan,fwd,front,ohc,five,mpfi,2,2,2,2.000000,0,0,0,0.000000,99.8,99.8,...,19,19.000000,25,25,25,25.000000,15250,15250,15250,15250.000000
audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi,1,1,1,1.000000,158,158,158,158.000000,105.8,105.8,...,17,17.000000,20,20,20,20.000000,23875,23875,23875,23875.000000
bmw,gas,std,four,sedan,rwd,front,ohc,four,mpfi,0,0,0,0.000000,192,192,192,192.000000,101.2,101.2,...,23,23.000000,29,29,29,29.000000,16925,16925,16925,16925.000000
bmw,gas,std,four,sedan,rwd,front,ohc,six,mpfi,1,1,0,0.250000,188,188,0,47.000000,418.2,110.0,...,15,18.000000,95,28,20,23.750000,113310,36880,21105,28327.500000


#### The cell above shows the aggregated dataframe derived from the auto dataset