In [2]:
import os
import pandas as pd
import altair as alt

In [3]:
# Build `all_features`: one row per (model source, feature) with its importance,
# enriched with the human-readable station name.
ROOT_DIR = os.path.realpath(os.path.join(os.getcwd(), '..'))
fea_dir = os.path.join(ROOT_DIR, 'final')

# Collect the per-file frames in a list and concatenate once; calling
# pd.concat inside the loop re-copies the accumulated rows on every iteration.
feature_frames = []
with os.scandir(fea_dir) as entries:  # context manager closes the directory handle
    # scandir yields entries in arbitrary order; sort by name so the row
    # order of `all_features` is the same on every run.
    for entry in sorted(entries, key=lambda e: e.name):
        if entry.is_file() and '_feature' in entry.name:
            frame = pd.read_csv(entry.path)
            # File name up to the first '.' identifies the model run.
            frame['source'] = entry.name.split('.')[0]
            # Everything after the first 5 characters of `Feature` is used as
            # the station id (e.g. 'temp_KARB0' -> 'KARB0').
            frame['station'] = frame['Feature'].str[5:]
            feature_frames.append(frame)

if not feature_frames:
    # Fail with a clear message instead of a KeyError at the merge below.
    raise FileNotFoundError(f"No '*_feature*' CSV files found in {fea_dir}")
all_features = pd.concat(feature_frames)

# Station lookup table: id -> name. Read everything as str so the ids compare
# equal to the string ids sliced out of `Feature`.
stations_path = os.path.join(ROOT_DIR, 'station_analysis', 'AAStation.csv')
stations = (
    pd.read_csv(stations_path, dtype=str)[['id', 'name']]
    .rename(columns={'id': 'station'})
)

# Default inner join: rows whose station id is not in the lookup are dropped.
all_features = all_features.merge(stations, on='station')

# 'Feature-Name' keeps the 5-character prefix (e.g. 'temp_') and swaps the
# station id for its name; 'Feature-Type' is the first 4 characters.
# NOTE(review): assumes every feature is '<4-char code>_<station id>', as in
# the sample output -- confirm for all input files.
all_features['Feature-Name'] = all_features['Feature'].str[:5] + all_features['name']
all_features['Feature-Type'] = all_features['Feature'].str[:4]
all_features

Unnamed: 0,Feature,Importance,source,station,name,Feature-Name,Feature-Type
0,temp_KARB0,0.132924,dbscan_cluster_features,KARB0,Ann Arbor / Pittsfield,temp_Ann Arbor / Pittsfield,temp
1,dwpt_KARB0,0.011382,dbscan_cluster_features,KARB0,Ann Arbor / Pittsfield,dwpt_Ann Arbor / Pittsfield,dwpt
2,pres_KARB0,0.009564,dbscan_cluster_features,KARB0,Ann Arbor / Pittsfield,pres_Ann Arbor / Pittsfield,pres
3,rhum_KARB0,0.003860,dbscan_cluster_features,KARB0,Ann Arbor / Pittsfield,rhum_Ann Arbor / Pittsfield,rhum
4,temp_KARB0,0.000791,db_km_scan_cluster_features,KARB0,Ann Arbor / Pittsfield,temp_Ann Arbor / Pittsfield,temp
...,...,...,...,...,...,...,...
2331,temp_KONZ0,0.000009,full_no-cluster_features,KONZ0,Grosse Ile / Gibraltar,temp_Grosse Ile / Gibraltar,temp
2332,rhum_KONZ0,0.000008,full_no-cluster_features,KONZ0,Grosse Ile / Gibraltar,rhum_Grosse Ile / Gibraltar,rhum
2333,dwpt_KDLZ0,0.000010,full_no-cluster_features,KDLZ0,Delaware / Sunnyview Farms,dwpt_Delaware / Sunnyview Farms,dwpt
2334,rhum_KDLZ0,0.000008,full_no-cluster_features,KDLZ0,Delaware / Sunnyview Farms,rhum_Delaware / Sunnyview Farms,rhum


In [7]:
# (source key, chart subtitle) pairs; the source key is matched against the
# `source` column of `all_features`.
# NOTE(review): 'initial_cluster_features' and 'initial_cluster_features2'
# share the same subtitle, so their charts cannot be told apart -- confirm
# whether the second run needs a distinguishing label.
feature_list = [
    ('full_no-cluster_features', "Extra Trees Regression model"),
    ('initial_cluster_features', "Extra Trees Regression model trimmed with K-means clustering"),
    ('initial_cluster_features2', "Extra Trees Regression model trimmed with K-means clustering"),
    ('dbscan_cluster_features', "Extra Trees Regression model trimmed with DBSCAN clustering"),
    ('db_km_scan_cluster_features', "Extra Trees Regression model trimmed with DBSCAN/K-means clustering"),
]


def _importance_bar_chart(data, y_field, y_order, title, subtitle):
    """Return a horizontal bar chart of ``Importance`` against ``y_field``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Importance`` column and the ``y_field`` column.
    y_field : str
        Name of the nominal column shown on the y axis.
    y_order : list
        Values of ``y_field`` in the order the bars should appear.
    title, subtitle : str
        Chart title and subtitle text.
    """
    return alt.Chart(data).mark_bar(color='cornflowerblue').encode(
        x=alt.X('Importance:Q', axis=alt.Axis(format="%", tickSize=0, labelFontSize=12)),
        y=alt.Y(f'{y_field}:N',
                title="",
                sort=y_order,
                axis=alt.Axis(tickSize=0, labelFontSize=12, labelPadding=10)),
    ).properties(
        height=200,
        title={
            "text": [title],
            "subtitle": [subtitle],
            "color": "black",
            "subtitleColor": "gray"
        }
    )


for source, subtitle in feature_list:
    # All features of this model run, most important first.
    df = all_features[all_features['source'] == source].sort_values('Importance', ascending=False)
    top10 = df.head(10)

    # Total importance per feature type. Sum only `Importance`: a bare
    # .sum() also aggregates the string columns (concatenating them on
    # pandas >= 2.0), which would bloat the data embedded in the chart.
    df_types = (
        df.groupby('Feature-Type', as_index=False)['Importance']
        .sum()
        .sort_values('Importance', ascending=False)
    )

    # The sort list must hold values of the encoded field ('Feature-Name'),
    # otherwise the explicit ordering matches nothing.
    top10_chart = _importance_bar_chart(
        top10, 'Feature-Name', list(top10['Feature-Name']),
        'Top 10 Most Important Features', subtitle,
    )
    ft_chart = _importance_bar_chart(
        df_types, 'Feature-Type', list(df_types['Feature-Type']),
        'Most Important Feature Types', subtitle,
    )

    # Show both charts side by side; display() is needed because a bare
    # expression inside a loop is not rendered by the notebook.
    combined_chart = top10_chart | ft_chart
    combined_chart.display()