In [247]:
import os, glob
import pandas as pd 
from sklearn.feature_extraction import DictVectorizer

In [248]:
## one-hot-encoding of categorical features
def encode_onehot(df, cols):
    """
    One-hot encode the given columns of a pandas DataFrame.

    The selected columns are turned into one dict per row and passed through a
    DictVectorizer that is fitted inside this call. The resulting feature
    columns replace ``cols`` in the returned frame.

    Note that every call fits its own vectorizer, so two frames encoded by
    separate calls can end up with different feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; it is not modified.
    cols : list
        Labels of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cols``, joined with the encoded feature columns,
        carrying the same index as ``df``.
    """
    vec = DictVectorizer()

    records = df[cols].to_dict(orient='records')
    encoded = vec.fit_transform(records).toarray()

    # get_feature_names() is gone from newer scikit-learn releases, which offer
    # get_feature_names_out() instead; use whichever this installation has.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    # Re-use the caller's index so the join below lines rows up one-to-one.
    vec_data = pd.DataFrame(encoded, columns=feature_names, index=df.index)

    return df.drop(cols, axis=1).join(vec_data)

In [249]:
# Directory layout: every other location is derived from projectDir.
# Script-style alternative kept for reference:
#projectdir = os.path.realpath(__file__).split('src')[0]
projectDir = "/home/ca22119/projects_vm/d3m/sep17eval/emissions/28_car_fuel_emission"
solDir, dataDir = (
    os.path.join(projectDir, sub_dir) for sub_dir in ("solution", "data")
)
# Raw per-file data lives one level below the data directory.
rawDir = os.path.join(dataDir, "raw_data")

In [250]:
# Read the gzip-compressed train/test tables; the first CSV column becomes the index.
trainData, testData = (
    pd.read_csv(os.path.join(dataDir, csv_name), sep=',', compression='gzip', index_col=0)
    for csv_name in ("trainData.csv.gz", "testData.csv.gz")
)

In [251]:
# Read the gzip-compressed train/test target tables; the first CSV column becomes the index.
trainTargets, testTargets = (
    pd.read_csv(os.path.join(dataDir, csv_name), sep=',', compression='gzip', index_col=0)
    for csv_name in ("trainTargets.csv.gz", "testTargets.csv.gz")
)

In [252]:
# Read every raw file named in the training table into a dict keyed by filename.
# NOTE(review): only filenames from trainData are loaded; the test cell looks
# files up in the same dict -- confirm test rows never reference other files.
r_f_names = trainData['filename'].unique()  # distinct raw data files
r_dfs = {}
r_df_columns = None  # left holding the columns of the last raw file read
for r_f_name in r_f_names:
    r_df = r_dfs[r_f_name] = pd.read_csv(os.path.join(rawDir, r_f_name), index_col=0)
    r_df_columns = r_df.columns

In [275]:
# Assemble the training features: one row per trainData entry, copied from
# the raw file that entry names.
X_train = pd.DataFrame(columns=r_df_columns, index=trainData.index)

for r_index, r_fields in trainData.iterrows():
    r_file = r_fields.values[0]  # first column of the row is used as the raw filename
    # The trainData index label is used as a row *position* in the raw frame.
    X_train.loc[r_index] = r_dfs[r_file].iloc[r_index]


In [276]:
# Assemble the testing features: one row per testData entry, copied from
# the raw file that entry names.
X_test = pd.DataFrame(columns=r_df_columns, index=testData.index)
for r_index, r_fields in testData.iterrows():
    r_file = r_fields.values[0]  # first column of the row is used as the raw filename
    # The testData index label is used as a row *position* in the raw frame.
    X_test.loc[r_index] = r_dfs[r_file].iloc[r_index]

In [277]:
# Remove five columns from both feature frames, in place. 'is_tdi' is dropped
# because of leakage; the other four are removed alongside it.
# (An earlier variant of this cell dropped only 'emissions_co_mgkm'.)
for feature_frame in (X_train, X_test):
    feature_frame.drop(
        ['is_tdi', 'engine_capacity', 'emissions_co_mgkm', 'emissions_nox_mgkm', 'metric_urban_cold'],
        axis=1,
        inplace=True,
    )


In [278]:
# Alias the target tables under the names used for scikit-learn below.
y_train, y_test = trainTargets, testTargets

In [279]:
# One-Hot Encode Categorical Variables

# Simple categorical column detection: a column is treated as categorical when
# its value in the last training row is a string. Columns are walked by position
# so this does not rely on DataFrame.iteritems(), which newer pandas removed.
last_train_row = X_train.tail(1)
cat_cols = []
for col_pos, col_name in enumerate(X_train.columns):
    if isinstance(last_train_row.iloc[0, col_pos], str):
        cat_cols.append(col_name)

# one-hot encode
X_train = encode_onehot(X_train, cat_cols)
X_test = encode_onehot(X_test, cat_cols)

# encode_onehot fits a separate vectorizer per call, so a category that occurs
# in only one split would give the two frames different columns. Force the test
# frame onto the training columns (same set, same order): indicator columns
# missing from test are filled with 0, columns unknown to training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [280]:
from sklearn import ensemble

# Fixed seed so the importances and score below are reproducible across runs.
rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)

# Fit exactly once: the importances printed here, the score below and any later
# use of `rf` then all refer to the same fitted forest.
rf.fit(X_train, y_train['parent_is_vw'])

# Function-call form of print works on both Python 2 and Python 3.
print(rf.feature_importances_)
# Iterating a DataFrame yields its column labels, so this pairs each feature
# name with its importance.
print(list(zip(X_train, rf.feature_importances_)))
print('RF score: %f' % rf.score(X_test, y_test['parent_is_vw']))

[ 0.32926885  0.28212203  0.01723728  0.02018036  0.01351804  0.08596409
  0.07492854  0.17678082]
[('metric_combined', 0.32926884759109859), ('metric_extra_urban', 0.28212202857870405), ('fuel_type=diesel', 0.017237284588741443), ('fuel_type=hybrid', 0.020180355482376466), ('fuel_type=petrol', 0.013518037057274718), ('trans=auto', 0.085964088003394876), ('trans=manual', 0.074928538331723568), ('trans=semiauto', 0.17678082036668624)]
RF score: 0.852459


In [281]:
from sklearn import metrics

# Class-probability estimates for the test rows; column 1 is passed to the ROC
# computation as the score of the positive class (label 1).
pred = rf.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(
    y_test['parent_is_vw'],
    pred[:, 1],
    pos_label=1,
)
# Area under the ROC curve; left as the last expression so the cell displays it.
metrics.auc(fpr, tpr)

0.88252590673575126