In [None]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline

In [None]:
##### DATA IMPORT AND CLEANING/FORMATTING

In [None]:
discovery_URI = "http://api.us.socrata.com/api/catalog/v1?"

In [None]:
# create list of city open data portal domains
domain_list = ['data.sfgov.org', 'opendata.lasvegasnevada.gov', 'data.cityofnewyork.us', 'data.cityofchicago.org', 
               'data.austintexas.gov', 'data.lacity.org', 'data.smgov.net', 'data.muni.org', 'data.brla.gov',
              'data.seattle.gov', 'data.chattlibrary.org', 'data.vbgov.com', 'data.providenceri.gov', 
               'www.dallasopendata.com', 'data.fortworthtexas.gov']

In [None]:
def get_socrata_data(city_domain, timeout=30):
    """Request Socrata catalog metadata for the datasets published on one domain.

    Sends a GET request to the module-level ``discovery_URI`` endpoint with the
    ``domains`` filter set to ``city_domain`` and ``limit`` set to 10000, then
    decodes the JSON response body.

    Parameters
    ----------
    city_domain : str
        Domain name of the open data portal, e.g. ``'data.seattle.gov'``.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30). Without a timeout a stalled connection would hang the
        notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON body. ``create_city_list`` reads its
        ``'resultSetSize'`` and ``'results'`` entries.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    """
    params = {'domains': city_domain, 'limit': 10000}
    socrata_response = requests.get(discovery_URI, params=params, timeout=timeout)
    # fail loudly on an error status instead of trying to parse an error page
    socrata_response.raise_for_status()
    return socrata_response.json()

In [None]:
def create_city_list(city_result):
    """Flatten one domain's Socrata catalog response into a list of records.

    Parameters
    ----------
    city_result : dict
        Decoded API response. Must contain ``'resultSetSize'`` and
        ``'results'``; each item of ``'results'`` must contain the
        ``'classification'``, ``'metadata'`` and ``'resource'`` sub-dicts
        read below.

    Returns
    -------
    list of dict
        One dictionary per item in ``city_result['results']`` with the keys
        ``num_city_datasets``, ``categories``, ``domain_category``, ``domain``,
        ``name``, ``id``, ``createdAt``, ``page_views_total``,
        ``download_count``, ``type``, ``provenance`` and ``license``.
        ``domain_category`` and ``license`` fall back to the string ``'None'``
        when the field is absent from the metadata.

    Raises
    ------
    KeyError
        If a required field is missing from the response or from an item.
    """
    total_datasets = city_result['resultSetSize']
    city_list = []
    # Walk the items that were actually returned instead of range(resultSetSize):
    # resultSetSize is a separate field of the response and (presumably being the
    # catalog's total match count -- TODO confirm against the Discovery API docs)
    # can be larger than len(results), which made positional indexing overrun.
    for item in city_result['results']:
        classification = item['classification']
        metadata = item['metadata']
        resource = item['resource']
        city_list.append({
            'num_city_datasets': total_datasets,
            'categories': classification['categories'],
            'domain_category': classification.get('domain_category', 'None'),
            'domain': metadata['domain'],
            'name': resource['name'],
            'id': resource['id'],
            'createdAt': resource['createdAt'],
            'page_views_total': resource['page_views']['page_views_total'],
            'download_count': resource['download_count'],
            'type': resource['type'],
            'provenance': resource['provenance'],
            # datasets without a license lack a license field in the Socrata metadata
            'license': metadata.get('license', 'None'),
        })
    return city_list

In [None]:
def all_city_datasets(domain_list):
    """Collect dataset metadata records for every domain in ``domain_list``.

    For each domain, fetches the Socrata metadata with ``get_socrata_data``,
    flattens it with ``create_city_list``, and appends the resulting records
    to one combined list.

    Parameters
    ----------
    domain_list : iterable of str
        Open data portal domain names.

    Returns
    -------
    list of dict
        The records of all domains, in the order the domains were given.
        Empty when ``domain_list`` is empty.
    """
    master_list = []
    for dom in domain_list:
        dom_result = get_socrata_data(dom)
        # extend() grows the list in place; rebuilding it with `+` copied every
        # record collected so far once per domain
        master_list.extend(create_city_list(dom_result))
    return master_list

In [None]:
# create master list of dictionaries of dataset metadata across all domains
cities = all_city_datasets(domain_list)

In [None]:
# convert master list into dataframe
cities_df = pd.DataFrame(cities)

In [None]:
# parse the createdAt strings once, then store the timestamp together with
# the year and month derived from it
created_ts = pd.to_datetime(cities_df['createdAt'])
cities_df['datetime_created'] = created_ts
cities_df['year_created'] = created_ts.dt.year
cities_df['month_created'] = created_ts.dt.month

In [None]:
# create dictionary of city name, population, and Open Data Census rank
domain_dict = {'data.sfgov.org': ['San Francisco', 870887, 1], 'opendata.lasvegasnevada.gov':['Las Vegas', 632912, 2],
               'data.cityofnewyork.us':['New York City', 8537673, 3], 'data.cityofchicago.org':['Chicago', 2704958, 4], 
               'data.austintexas.gov':['Austin', 947890, 5], 'data.lacity.org':['Los Angeles', 3976322, 6], 
               'data.smgov.net':['Santa Monica', 92478, 7], 'data.muni.org':['Anchorage', 298192, 10], 
               'data.brla.gov':['Baton Rouge', 227715, 11], 'data.seattle.gov':['Seattle', 704352, 12], 
               'data.chattlibrary.org':['Chattanooga', 177571, 14], 'data.vbgov.com':['Virginia Beach', 452602, 16], 
               'data.providenceri.gov':['Providence', 179219, 19], 'www.dallasopendata.com':['Dallas', 1317929, 35], 
               'data.fortworthtexas.gov':['Fort Worth', 854113, 0]}

In [None]:
# look up the [name, population, census rank] entry for each row's domain and
# spread its three positions into separate columns (no temporary column needed)
city_info = cities_df["domain"].map(domain_dict)
for position, column in enumerate(["city_name", "city_population", "city_census_rank"]):
    cities_df[column] = [info[position] for info in city_info]

In [None]:
# check dataframe column names
cities_df.columns

In [None]:
cities_df.head()

In [None]:
# create dummy variables for categories column
cat_dummies = cities_df.categories.str.join('|').str.get_dummies().add_prefix('cat_')
cat_dummies.columns

In [None]:
# join category dummy variable columns to dataframe
cities_df = cities_df.join(cat_dummies)

In [None]:
cities_df.columns

In [None]:
cities_df['pageviews_by_pop'] = (cities_df['page_views_total']/cities_df['city_population'])*100000

In [None]:
cities_df['downloads_by_pop'] = (cities_df['download_count']/cities_df['city_population'])*100000

In [None]:
cities_df['downloads_by_pop'].describe()

In [None]:
# examine range of page_views & download counts
city_grp = cities_df.groupby('city_name')
city_grp[['download_count', 'page_views_total']].describe()

In [None]:
#### EXPLORATORY ANALYSIS

In [None]:
# check number of dataset metadata records in dataframe 
len(cities_df)

In [None]:
# check license types 
cities_df.license.unique()

In [None]:
# create dictionary that categorizes licenses by rights type 
rights_dict = {'Open Data Commons Public Domain Dedication and License': 'Public Domain', 
               'Public Domain U.S. Government': 'Public Domain', 'None': 'None', 
               'Open Data Commons Open Database License': 'Attribution_Sharealike', 'Public Domain': 'Public Domain',
               'Creative Commons Attribution 4.0 International': 'Attribution', 'Open Database License': 'Attribution_Sharealike',
               'Creative Commons 1.0 Universal (Public Domain Dedication)': 'Public Domain',
               'See Terms of Use': 'None', 'Open Data Commons Attribution License': 'Attribution'}

In [None]:
# create license type column in dataframe by mapping license types from dictionary
cities_df["license_type"] = cities_df["license"].map(rights_dict)
cities_df.license_type.unique()

In [None]:
# number of datasets without license specified
len(cities_df[cities_df.license=='None'])

In [None]:
license_groups = cities_df.groupby('license_type').size()
type_dict = license_groups.T.to_dict()
type_dict

In [None]:
# bar chart of license type frequency
license_groups.plot.bar(color = 'b', edgecolor = 'black')
plt.xlabel('License Type')
plt.ylabel('Number')
plt.title('Counts of Licenses by Type')

In [None]:
from bokeh.models import ColumnDataSource
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.core.properties import value
output_notebook()

In [None]:
lic_types = list(type_dict.keys())
type_counts = list(type_dict.values())

In [None]:
# Bokeh bar chart of license type frequency
from bokeh.palettes import Viridis4
# one row per license type: its label, its dataset count, and a bar color.
# NOTE(review): Viridis4 supplies a fixed number of colors, so this presumably only
# lines up when lic_types has the same number of entries -- confirm against the data
source = ColumnDataSource(data=dict(lic_types=lic_types, counts=type_counts, color=Viridis4))

# categorical x axis built from the license type labels; the y axis is fixed to 0-6000
# NOTE(review): the hard-coded upper bound will clip bars if any count exceeds 6000
p = figure(x_range=lic_types, plot_height=250, y_range=(0, 6000), title="License Type Counts", toolbar_location = "above")
# bar position, height, color and legend entry are all read from columns of `source`
p.vbar(x='lic_types', top='counts', width=0.9, color='color', legend="lic_types", source=source)

# hide vertical grid lines and place a vertical legend in the top-left corner
p.xgrid.grid_line_color = None
p.legend.orientation = "vertical"
p.legend.location = "top_left"

show(p)

In [None]:
## basic bar chart of license type counts adapted from Bokeh example

# Here is a list of categorical values (or factors)
types = list(type_dict.keys())
counts = list(type_dict.values())

# Set the x_range to the list of categories above
p = figure(x_range=types, plot_height=250, title="License Type Counts")

# Categorical values can also be used as coordinates
p.vbar(x=types, top=counts, width=0.9)

# Set some properties to make the plot look better
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

In [None]:
# stacked bar chart of license types grouped by city
## do this as a percentage - i.e. license type %

fig2 = (cities_df
 .groupby(['city_name', 'license_type'])
 .size()
 .unstack()
 .plot.bar(stacked=True)
)

In [None]:
#import altair
import altair as alt
# enable notebook display
alt.renderers.enable('notebook')
# increase max rows size of source data
alt.data_transformers.enable('default', max_rows=1000000)

In [None]:
# altair basic license type frequency bar chart
## SEE EXAMPLE BELOW RE HOW TO SET COLORS
alt.Chart(cities_df, width = 300).mark_bar(size = 40).encode(
    x = 'license_type',
    y = 'count()',
    color = alt.Color('license_type', legend=alt.Legend(title='License Type'))
).configure_bar(opacity = 0.8)

In [None]:
# altair stacked bar chart - license type frequencies by city
alt.Chart(cities_df).mark_bar().encode(
    x = 'city_name:N',
    y = 'count(license_type):Q',
    color = alt.Color('license_type', legend=alt.Legend(title='License Type'), scale=alt.Scale(range=['#440154', '#30678D', '#35B778', '#FDE724'])),
    order = alt.Order('license_type', sort = 'ascending')
).configure_bar(opacity = 0.8)

In [None]:
# altair stacked bar chart - license type frequencies by city - Normalized 
alt.Chart(cities_df, height = 300, width = 500).mark_bar().encode(
    alt.X('city_name:N', axis = alt.Axis(title = 'City Name')),
    alt.Y('count(license_type):Q', stack = 'normalize', axis=alt.Axis(title='Percentage of Datasets', format = '%')),
    color = alt.Color('license_type', legend=alt.Legend(title='License Type'), scale=alt.Scale(range=['#440154', '#30678D', '#35B778', '#FDE724'])),
    order = alt.Order('license_type', sort = 'ascending')
).properties(title = "License Type Distribution by City").configure_legend(labelFontSize = 14).configure_axis(labelFontSize = 14, titleFontSize = 14)

In [None]:
import altair as alt
from vega_datasets import data

weather = data.seattle_weather()

alt.Chart(weather).mark_bar().encode(
    alt.Color('weather:N',
        legend=alt.Legend(title='Weather type'),
        scale=alt.Scale(
            domain=['sun', 'fog', 'drizzle', 'rain', 'snow'],
            range=['#e7ba42', '#c7c7c7', '#aec7e8', '#1f77b4', '#9467bd']
        ),
    ),
    alt.X('date:N',
        axis=alt.Axis(title='Month of the Year'),
        timeUnit='month',
    ),
    y='count()',
)

In [None]:
# license types by provenance - official vs community
(cities_df
 .groupby(['provenance', 'license_type'])
 .size()
 .unstack()
 .plot.bar(stacked=True)
)

In [None]:
# altair stacked bar chart - license type frequencies by provenance
alt.Chart(cities_df, height = 300, width = 500).mark_bar().encode(
    alt.X('provenance:N', axis = alt.Axis(title = 'Provenance')),
    alt.Y('count(license_type):Q', axis=alt.Axis(title='Number of Datasets')),
    color = alt.Color('license_type', legend=alt.Legend(title='License Type'), scale=alt.Scale(range=['#440154', '#30678D', '#35B778', '#FDE724'])),
    order = alt.Order('license_type', sort = 'ascending')
).properties(title = "License Type Counts by Provenance").configure_legend(labelFontSize = 14).configure_axis(labelFontSize = 14, titleFontSize = 14)

In [None]:
# license types by year dataset published
(cities_df
 .groupby(['year_created', 'license_type'])
 .size()
 .unstack()
 .plot.bar(stacked=True)
)

In [None]:
year_groups = cities_df.groupby('year_created').size()
# bar chart of number of datasets published by year
year_groups.plot.bar(color = 'b', edgecolor = 'black')
plt.xlabel('Year')
plt.ylabel('Number Published')
plt.title('Counts of Datasets Published by Year')

In [None]:
# boxplots for license type by pageviews and download counts
import numpy as np

# Log-transform the raw counts for plotting. Counts that are not positive are
# masked to NaN first: log(0) evaluates to -inf (with a RuntimeWarning), and
# those -inf values would otherwise flow into the boxplot statistics.
download_counts = cities_df['download_count']
pageview_counts = cities_df['page_views_total']
log_download_count = np.log(download_counts.where(download_counts > 0))
log_pageviews = np.log(pageview_counts.where(pageview_counts > 0))

# collect both transformed series next to the license type used for grouping
box_cities_df = pd.DataFrame({'log_downloadcount':log_download_count, 'log_pageviews': log_pageviews})
box_cities_df['license_type'] = cities_df['license_type']


In [None]:
# boxplots for license type by pageviews and download counts
#plt.style.use('seaborn-poster')

box_cities_df.boxplot(column = 'log_pageviews', by = 'license_type', fontsize = '12', rot = 45)
plt.title('Boxplot - Log Pageviews Grouped by License Type')
plt.ylabel('Log Pageviews')
plt.xlabel('License Type')
plt.suptitle('')

In [None]:
box_cities_df.boxplot(column = 'log_downloadcount', by = 'license_type', fontsize = '10', rot = 45)
plt.title('Boxplot - Log Download Count Grouped by License Type')
plt.ylabel('Log Download Count')
plt.xlabel('License Type')
plt.suptitle('')

In [None]:
# number of datasets by city population size
p = figure(title = "Number of Published Datasets by City Population Size")
p.xaxis.axis_label = 'City Population'
p.yaxis.axis_label = 'Number of Datasets'

p.square(cities_df["city_population"], cities_df["num_city_datasets"],
         color='blue', fill_alpha=0.4, size=8)

show(p)

In [None]:

# pred_df is only created in the predictive-modeling section further down, so
# referencing it here raises NameError on a fresh kernel (Restart & Run All);
# inspect the columns of the dataframe that exists at this point instead
cities_df.columns

In [None]:
### PREDICTIVE MODELING

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score, train_test_split 
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import tree
from sklearn import preprocessing

In [None]:
# define function to print metrics for model predictions
def printScores(labels, predictions):
    """Print evaluation metrics comparing class predictions with actual labels.

    Writes, in this order: the accuracy score, the confusion matrix (rows are
    actual classes, columns are predicted classes), and the classification
    report containing precision, recall and f-score measures.
    """
    accuracy = accuracy_score(labels, predictions)
    print('Accuracy:', accuracy, '\n')

    matrix = confusion_matrix(labels, predictions)
    print('Confusion Matrix:\n', matrix)
    print('(Row = Actual, Column = Predicted)', '\n')

    report = classification_report(labels, predictions)
    print('Classification Report:\n', report)

In [None]:
## PREPARE DATA FOR USE IN TREE-BASED MODELS

In [None]:
# drop columns not needed for predictive modeling
pred_df = cities_df.drop(['categories', 'createdAt', 'domain', 'domain_category', 'id', 'license',
                         'name', 'datetime_created', 'city_name', 'download_count', 'page_views_total'], axis = 1)

pred_df.columns

In [None]:
# encode categorical variables
le = preprocessing.LabelEncoder()
pred_df['type'] = le.fit_transform(pred_df['type'].astype(str))
pred_df['provenance'] = le.fit_transform(pred_df['provenance'].astype(str))

In [None]:
# replace missing download_count values with out-of-range value 
#pred_df['download_count'] = pred_df.download_count.fillna(-100)
pred_df['downloads_by_pop'] = pred_df.downloads_by_pop.fillna(-100)

# recheck for missing values
pred_df.isnull().any()

In [None]:
lic_df = pred_df[['license_type', 'city_census_rank', 'city_population', 'type', 'provenance','year_created','downloads_by_pop', 'pageviews_by_pop', 'cat_public safety']]
lic_df = shuffle(lic_df)
lic_df.head()

In [None]:
lic_df.head()

In [None]:
## DATASET ONE - RESTRICTIONS (Database + Attribution), PUBLIC DOMAIN, NONE
# USE CLASS_WEIGHT = 'BALANCED' PARAMETER IN MODELS

In [None]:
# create an independent copy for the 3-class experiment; a plain assignment only
# aliases pred_df, so the ds1_license_type target column added below would also be
# written into pred_df and carried into every dataframe derived from it later
ds1_pred_df = pred_df.copy()

In [None]:
# create dictionary to re-map license types 
ds1_rights_dict = {'Public Domain':'Public Domain', 'None': 'None', 
               'Attribution':'Attribution_SA', 'Attribution_Sharealike':'Attribution_SA'}

In [None]:
# create new license type column in dataframe by mapping license types from dictionary
ds1_pred_df["ds1_license_type"] = ds1_pred_df["license_type"].map(ds1_rights_dict)
ds1_pred_df.ds1_license_type.unique()

In [None]:
# drop original license_type column 
ds1_pred_df = ds1_pred_df.drop(['license_type'], axis = 1)

ds1_pred_df.columns

In [None]:
#shuffle dataframe
ds1_pred_df = shuffle(ds1_pred_df).reset_index(drop = True)

In [None]:
# isolate target variable
ds1_license_label = ds1_pred_df.pop('ds1_license_type')

In [None]:
# split data into train and test sets
ds1_pred_train, ds1_pred_test, ds1_lbl_train, ds1_lbl_test = train_test_split(ds1_pred_df, ds1_license_label, test_size = 0.2)

print(len(ds1_pred_train))
print(len(ds1_pred_test))

In [None]:
# dataset-level features only

In [None]:
# dataset-level features only
# drop city-level features from training and test sets
ds1_nocity_train = ds1_pred_train.drop(['city_population', 'num_city_datasets', 'city_census_rank'], axis = 1)

ds1_nocity_test = ds1_pred_test.drop(['city_population', 'num_city_datasets', 'city_census_rank'], axis = 1)

In [None]:
## DECISION TREE - ALL FEATURES

In [None]:
# create decision tree model
DecTree1 = tree.DecisionTreeClassifier(criterion = 'entropy', class_weight = 'balanced')

# fit model with training data
DecTree1.fit(ds1_pred_train, ds1_lbl_train)

In [None]:
# predict on training data
dt1_tr_predict = DecTree1.predict(ds1_pred_train)

# print scores for training data
print('Decision Tree Scores on Training Data:\n')
printScores(ds1_lbl_train, dt1_tr_predict)

In [None]:
# predict on test data
dt1_test_predict = DecTree1.predict(ds1_pred_test)

# print scores for test data
print('Decision Tree Scores on Test Data:\n')
printScores(ds1_lbl_test, dt1_test_predict)


In [None]:
## RANDOM FOREST - ALL FEATURES

In [None]:
# create random forest classifier using 40 trees (n_estimators) and entropy as split criterion
RandFor1 = RandomForestClassifier(criterion = 'entropy', class_weight = 'balanced', n_estimators = 40)

# fit model with training data
RandFor1.fit(ds1_pred_train, ds1_lbl_train)    

In [None]:
# predict on training data
rf1_tr_predict = RandFor1.predict(ds1_pred_train)

# print scores for training data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Training Data - {} Trees:\n'.format(RandFor1.n_estimators))
printScores(ds1_lbl_train, rf1_tr_predict)

In [None]:
# predict on test data
rf1_tst_predict = RandFor1.predict(ds1_pred_test)

# print scores for test data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Test Data - {} Trees:\n'.format(RandFor1.n_estimators))
printScores(ds1_lbl_test, rf1_tst_predict)

In [None]:
## EXTRATREES - ALL FEATURES

In [None]:
# create extremely randomized trees classifier using 40 trees (n_estimators) and entropy as split criterion
ExTree1 = ExtraTreesClassifier(criterion = 'entropy', class_weight = 'balanced', n_estimators = 40)

# fit model with training data
ExTree1.fit(ds1_pred_train, ds1_lbl_train)       

In [None]:
# predict on training data
ex1_tr_predict = ExTree1.predict(ds1_pred_train)

# print scores for training data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Training Data - {} Trees:\n'.format(ExTree1.n_estimators))
printScores(ds1_lbl_train, ex1_tr_predict)

In [None]:
# predict on test data
ex1_tst_predict = ExTree1.predict(ds1_pred_test)

# print scores for test data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Test Data - {} Trees:\n'.format(ExTree1.n_estimators))
printScores(ds1_lbl_test, ex1_tst_predict)

In [None]:
# FEATURE IMPORTANCES OF BEST MODEL
# best model for Dataset 1 - ExtraTrees

In [None]:
# view importance of predictor variables in ExtraTrees model
ds1_feat_import = ExTree1.feature_importances_
ds1_pred_vars = list(ds1_pred_df.columns)
ds1_feats_ranked = list(zip(ds1_pred_vars, ds1_feat_import))
ds1_feats_ranked.sort(key = lambda tup: tup[1], reverse = True) 
ds1_feats_ranked

In [None]:
# plot feature importances
importances = ExTree1.feature_importances_
# spread of each feature's importance across the individual trees of the ensemble;
# it must be computed from every tree's own importances (repeating the ensemble-level
# importances once per tree always yields a standard deviation of zero)
std = np.std([est.feature_importances_ for est in ExTree1.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
fig1 = plt.figure(figsize = (10, 10))
plt.title("3-Class Dataset - Extremely Randomized Trees - Feature Importances", fontsize = 16)
# bars are horizontal, so the error bars belong on the x (importance) axis
plt.barh(range(ds1_pred_train.shape[1]), importances[indices],
       color="#29788E", xerr=std[indices], align="center")
plt.yticks(range(ds1_pred_train.shape[1]), [ds1_pred_vars[i] for i in indices], fontsize = 14)
plt.ylim([-1, ds1_pred_train.shape[1]])
plt.show()
fig1.savefig('fig1.jpg', bbox_inches='tight')

In [None]:
# BEST MODEL ON DATASET-LEVEL ONLY FEATURES
# best model for Dataset 1 - ExtraTrees

In [None]:
# create extremely randomized trees classifier with entropy as split criterion
# NOTE(review): n_estimators is not passed here, so the number of trees is the
# library default rather than the 40 trees used for ExTree1 -- confirm this is intended
ExTree1A = ExtraTreesClassifier(criterion = 'entropy', class_weight = 'balanced')

# fit model with training data (dataset-level features only)
ExTree1A.fit(ds1_nocity_train, ds1_lbl_train)   

In [None]:
# predict on training data
ex1A_tr_predict = ExTree1A.predict(ds1_nocity_train)

# print scores for training data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Dataset-Only Features Training Data - {} Trees:\n'.format(ExTree1A.n_estimators))
printScores(ds1_lbl_train, ex1A_tr_predict)

In [None]:
# predict on test data
ex1A_tst_predict = ExTree1A.predict(ds1_nocity_test)

# print scores for test data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Test Data - {} Trees:\n'.format(ExTree1A.n_estimators))
printScores(ds1_lbl_test, ex1A_tst_predict)

In [None]:
# view importance of predictor variables in ExtraTrees Dataset-level only model
ds1NC_feat_import = ExTree1A.feature_importances_
ds1NC_pred_vars = list(ds1_nocity_train.columns)
ds1NC_feats_ranked = list(zip(ds1NC_pred_vars, ds1NC_feat_import))
ds1NC_feats_ranked.sort(key = lambda tup: tup[1], reverse = True) 
ds1NC_feats_ranked

In [None]:
# plot feature importances
importances = ExTree1A.feature_importances_
# spread of each feature's importance across the individual trees of the ensemble;
# it must be computed from every tree's own importances (repeating the ensemble-level
# importances once per tree always yields a standard deviation of zero)
std = np.std([est.feature_importances_ for est in ExTree1A.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
fig1A = plt.figure(figsize = (10, 10))
plt.title("3-Class Dataset - Extremely Randomized Trees - Dataset-Level Feature Importances", fontsize = 16)
# bars are horizontal, so the error bars belong on the x (importance) axis
plt.barh(range(ds1_nocity_train.shape[1]), importances[indices],
       color="#FDE724", xerr=std[indices], align="center")
plt.yticks(range(ds1_nocity_train.shape[1]), [ds1NC_pred_vars[i] for i in indices], fontsize = 14)
plt.ylim([-1, ds1_nocity_train.shape[1]])
plt.show()
fig1A.savefig('fig1A.jpg', bbox_inches='tight')

In [None]:
## DATASET TWO - LICENSED (Database + Attribution + Public Domain), NO LICENSE (None)
# check how closely classes balance - ****REMOVE CLASS WEIGHT BALANCE PARAMETER???

In [None]:
# create an independent frame for the 2-class experiment. pred_df may already carry
# the ds1_license_type target column (written through the ds1_pred_df alias earlier in
# the notebook); dropping it here keeps that string label out of the feature matrix,
# and working on a copy keeps the columns added below out of pred_df
ds2_pred_df = pred_df.drop(['ds1_license_type'], axis = 1, errors = 'ignore').copy()

In [None]:
# create dictionary to re-map license types 
ds2_rights_dict = {'Public Domain':'License', 'None': 'No_License', 
               'Attribution':'License', 'Attribution_Sharealike':'License'}

In [None]:
# create new license type column in dataframe by mapping license types from dictionary
ds2_pred_df["ds2_license_type"] = ds2_pred_df["license_type"].map(ds2_rights_dict)
ds2_pred_df.ds2_license_type.unique()

In [None]:
# check class balance
### CONSIDER SAMPLING THESE TO BALANCE EXACTLY ###
ds2_pred_df.groupby('ds2_license_type').size()

In [None]:
# drop original license_type column 
ds2_pred_df = ds2_pred_df.drop(['license_type'], axis = 1)

ds2_pred_df.columns

In [None]:
#shuffle dataframe
ds2_pred_df = shuffle(ds2_pred_df).reset_index(drop = True)

In [None]:
# isolate target variable
ds2_license_label = ds2_pred_df.pop('ds2_license_type')

In [None]:
# split data into train and test sets
ds2_pred_train, ds2_pred_test, ds2_lbl_train, ds2_lbl_test = train_test_split(ds2_pred_df, ds2_license_label, test_size = 0.2)

print(len(ds2_pred_train))
print(len(ds2_pred_test))

In [None]:
# dataset-level features only

In [None]:
# drop city-level features from training and test sets
ds2_nocity_train = ds2_pred_train.drop(['city_population', 'num_city_datasets', 'city_census_rank'], axis = 1)

ds2_nocity_test = ds2_pred_test.drop(['city_population', 'num_city_datasets', 'city_census_rank'], axis = 1)

In [None]:
## DECISION TREE - ALL FEATURES

In [None]:
# create decision tree model
DecTree2 = tree.DecisionTreeClassifier(criterion = 'entropy', class_weight = 'balanced')

# fit model with training data
DecTree2.fit(ds2_pred_train, ds2_lbl_train)

In [None]:
# predict on training data
dt2_tr_predict = DecTree2.predict(ds2_pred_train)

# print scores for training data
print('Decision Tree Scores on Training Data:\n')
printScores(ds2_lbl_train, dt2_tr_predict)

In [None]:
# predict on test data
dt2_test_predict = DecTree2.predict(ds2_pred_test)

# print scores for test data
print('Decision Tree Scores on Test Data:\n')
printScores(ds2_lbl_test, dt2_test_predict)


In [None]:
## RANDOM FOREST - ALL FEATURES

In [None]:
# create random forest classifier using 40 trees (n_estimators) and entropy as split criterion
RandFor2 = RandomForestClassifier(criterion = 'entropy', class_weight = 'balanced', n_estimators = 40)

# fit model with training data
RandFor2.fit(ds2_pred_train, ds2_lbl_train)  

In [None]:
# predict on training data
rf2_tr_predict = RandFor2.predict(ds2_pred_train)

# print scores for training data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Training Data - {} Trees:\n'.format(RandFor2.n_estimators))
printScores(ds2_lbl_train, rf2_tr_predict)

In [None]:
# predict on test data
rf2_tst_predict = RandFor2.predict(ds2_pred_test)

# print scores for test data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Test Data - {} Trees:\n'.format(RandFor2.n_estimators))
printScores(ds2_lbl_test, rf2_tst_predict)

In [None]:
## EXTRATREES - ALL FEATURES

In [None]:
# create extremely randomized trees classifier using 40 trees (n_estimators) and entropy as split criterion
ExTree2 = ExtraTreesClassifier(criterion = 'entropy', class_weight = 'balanced', n_estimators = 40)

# fit model with training data
ExTree2.fit(ds2_pred_train, ds2_lbl_train)       

In [None]:
# predict on training data
ex2_tr_predict = ExTree2.predict(ds2_pred_train)

# print scores for training data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Training Data - {} Trees:\n'.format(ExTree2.n_estimators))
printScores(ds2_lbl_train, ex2_tr_predict)

In [None]:
# predict on test data
ex2_tst_predict = ExTree2.predict(ds2_pred_test)

# print scores for test data; the heading reports the ensemble's configured
# size instead of a hard-coded tree count
print('ExtraTrees Scores on Test Data - {} Trees:\n'.format(ExTree2.n_estimators))
printScores(ds2_lbl_test, ex2_tst_predict)

In [None]:
# FEATURE IMPORTANCES OF BEST MODEL
# best model for Dataset 2 - Random Forest

In [None]:
# view importance of predictor variables in Random Forest model
ds2_feat_import = RandFor2.feature_importances_
ds2_pred_vars = list(ds2_pred_df.columns)
ds2_feats_ranked = list(zip(ds2_pred_vars, ds2_feat_import))
ds2_feats_ranked.sort(key = lambda tup: tup[1], reverse = True) 
ds2_feats_ranked

In [None]:
# plot feature importances
importances = RandFor2.feature_importances_
# spread of each feature's importance across the individual trees of the forest;
# it must be computed from every tree's own importances (repeating the forest-level
# importances once per tree always yields a standard deviation of zero)
std = np.std([est.feature_importances_ for est in RandFor2.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
fig2 = plt.figure(figsize = (10, 10))
plt.title("2-Class Dataset - Random Forest - Feature Importances", fontsize = 16)
# bars are horizontal, so the error bars belong on the x (importance) axis
plt.barh(range(ds2_pred_train.shape[1]), importances[indices],
       color="#440154", xerr=std[indices], align="center")
plt.yticks(range(ds2_pred_train.shape[1]), [ds2_pred_vars[i] for i in indices], fontsize = 14)
plt.ylim([-1, ds2_pred_train.shape[1]])
plt.show()
fig2.savefig('fig2.jpg', bbox_inches='tight')

In [None]:
# BEST MODEL ON DATASET-LEVEL ONLY FEATURES
# best model for Dataset 2 - Random Forest

In [None]:
# create random forest classifier with entropy as split criterion
# NOTE(review): n_estimators is not passed here, so the number of trees is the
# library default rather than the 40 trees used for RandFor2 -- confirm this is intended
RandFor2A = RandomForestClassifier(criterion = 'entropy', class_weight = 'balanced')

# fit model with training data (dataset-level features only)
RandFor2A.fit(ds2_nocity_train, ds2_lbl_train)  

In [None]:
# predict on training data
rf2A_tr_predict = RandFor2A.predict(ds2_nocity_train)

# print scores for training data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Training Data - {} Trees:\n'.format(RandFor2A.n_estimators))
printScores(ds2_lbl_train, rf2A_tr_predict)

In [None]:
# predict on test data
rf2A_tst_predict = RandFor2A.predict(ds2_nocity_test)

# print scores for test data; the heading reports the forest's configured
# size instead of a hard-coded tree count
print('Random Forest Scores on Test Data - {} Trees:\n'.format(RandFor2A.n_estimators))
printScores(ds2_lbl_test, rf2A_tst_predict)

In [None]:
# view importance of predictor variables in Random Forest model
ds2NC_feat_import = RandFor2A.feature_importances_
ds2NC_pred_vars = list(ds2_nocity_train.columns)
ds2NC_feats_ranked = list(zip(ds2NC_pred_vars, ds2NC_feat_import))
ds2NC_feats_ranked.sort(key = lambda tup: tup[1], reverse = True) 
ds2NC_feats_ranked

In [None]:
# plot feature importances
importances = RandFor2A.feature_importances_
# spread of each feature's importance across the individual trees of the forest;
# it must be computed from every tree's own importances (repeating the forest-level
# importances once per tree always yields a standard deviation of zero)
std = np.std([est.feature_importances_ for est in RandFor2A.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
fig2A = plt.figure(figsize = (10, 10))
plt.title("2-Class Dataset - Random Forest - Dataset-Level Feature Importances", fontsize = 16)
# bars are horizontal, so the error bars belong on the x (importance) axis
plt.barh(range(ds2_nocity_train.shape[1]), importances[indices],
       color="#22A784", xerr=std[indices], align="center")
plt.yticks(range(ds2_nocity_train.shape[1]), [ds2NC_pred_vars[i] for i in indices], fontsize = 14)
plt.ylim([-1, ds2_nocity_train.shape[1]])
plt.show()
fig2A.savefig('fig2A.jpg', bbox_inches='tight')