In [781]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import preprocessing, impute, model_selection, decomposition, cluster, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from statistics import mean
import math

### Import EPA Pollution and Census American Community Survey Data

In [479]:
epa = pd.read_csv('../dataset/epa_pollution.csv')
census = pd.read_csv('../dataset/census_acs.csv')
aqi = pd.read_csv('../dataset/aqi_report.csv')

#only use data with common cbsa codes
common_cbsa = set(census['cbsa_code'])
common_cbsa = common_cbsa.intersection(set(epa['cbsa_code']))
common_cbsa = common_cbsa.intersection(set(aqi['cbsa_code']))

# Standardized cbsa code datasets
epa = epa[epa['cbsa_code'].isin(common_cbsa)].copy()
census = census[census['cbsa_code'].isin(common_cbsa)].copy()
aqi = aqi[aqi['cbsa_code'].isin(common_cbsa)].copy()

## Data Preprocessing

### Census ACS 

In [480]:
# Remove redundant/irrelevant columns
census = census.drop(['metropolitan_area', 'city', 'state'], axis=1)

# Normalize columns
# Income
income = [i for i in census.columns if '$' in i]
for col in income:
    census[col] = census[col]/census['Income_Total']
census.drop('Income_Total', axis=1, inplace=True)

#Education
education = [i for i in census.columns if ('degree' in i) or ('graduate' in i)]
for col in education:
    census[col] = census[col]/census['Education_Total']
census.drop('Education_Total', axis=1, inplace=True)

# Occupation
occupation = [i for i in census.columns if (i not in income) and (i not in education) and (i not in ['cbsa_code', 'year', 'Occupation_Total'])]
for col in occupation:
    census[col] = census[col]/census['Occupation_Total']
census.drop('Occupation_Total', axis=1, inplace=True)

census.head()

Unnamed: 0,cbsa_code,Less than high school graduate,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,"$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999",...,"Educational services, and health care and social assistance","Arts, entertainment, and recreation, and accommodation and food services","Other services, except public administration",Public administration,"Management, business, science, and arts occupations",Service occupations,Sales and office occupations,"Natural resources, construction, and maintenance occupations","Production, transportation, and material moving occupations",year
1,10420,0.10959,0.342669,0.266596,0.183831,0.097314,0.043547,0.085722,0.105218,0.116918,...,0.224313,0.087933,0.047411,0.029834,0.341412,0.164147,0.268786,0.00181,0.223845,2005
2,10500,0.21453,0.32363,0.276552,0.115232,0.070055,0.046814,0.09969,0.111882,0.087279,...,0.22143,0.066773,0.066212,0.076664,0.300161,0.138019,0.24635,0.009773,0.305697,2005
3,10580,0.097922,0.308826,0.28455,0.169404,0.139298,0.0462,0.092376,0.1116,0.137878,...,0.25973,0.069128,0.038137,0.110608,0.397041,0.151805,0.270669,0.002259,0.178227,2005
4,10740,0.140155,0.265774,0.293919,0.169377,0.130775,0.04381,0.083659,0.113831,0.10281,...,0.223604,0.094226,0.045499,0.063296,0.374627,0.16671,0.264363,0.002006,0.192294,2005
5,10780,0.176673,0.371108,0.262197,0.136293,0.053729,0.041082,0.072454,0.070793,0.091749,...,0.302493,0.063796,0.038255,0.085138,0.3344,0.17062,0.256644,0.007136,0.231201,2005


In [459]:
census.shape

(4644, 33)

In [460]:
soc_econ = census

#### PCA

Since there are 33 attributes and only 5K instances, it may not be sufficient for a successful clustering analysis.

In [461]:
#Capture 90% variance 
pca = decomposition.PCA(n_components=.90, svd_solver='full')

#only apply PCA on the continious variables 
continuous = census.drop(['cbsa_code', 'year'], axis=1)
reduced_census = pd.DataFrame(pca.fit_transform(continuous))
reduced_census['cbsa_code'] = census['cbsa_code']
reduced_census['year'] = census['year']

census_df = reduced_census.sort_values(['cbsa_code', 'year'], axis=0)
census_df.shape

(4644, 11)

### Clean AQI Report

In [481]:
temp_df = pd.DataFrame()
for year in aqi['year'].unique():
    for cbsa in aqi['cbsa_code'].unique():
        curr = aqi[(aqi['year'] == year) & (aqi['cbsa_code'] == cbsa)]
        if curr.shape[0] > 0:
            temp_df = temp_df.append(curr.iloc[0,:], ignore_index=True)
aqi = temp_df.copy()

In [482]:
# Normalize the days for each air quality with the total AQI days
qualities = ['Good', 'Moderate', 'Unhealthy for Sensitive Groups', 'Unhealthy', 'Very Unhealthy']
for quality in qualities:
    aqi.loc[:, quality] = aqi[quality] / aqi['# Days with AQI']
    
aqi.sort_values(['cbsa_code', 'year'], inplace=True, axis=0)
aqi.drop(columns="# Days with AQI", inplace=True)

### EPA Pollution

Clean up data such that each instance contains all pollutants for each cbsa

In [185]:
durations = ['1 HOUR','24-HR BLK AVG', '24 HOUR', '3-HR BLK AVG', '8-HR RUN AVG END HOUR', '8-HR RUN AVG BEGIN HOUR', '5 MINUTE', 'INTEGRATED PASSIVE 4-WEEKS', 'INTEGREATED PASSIVE 3-WEEKS']
first_duration = {'PM2.5': '1 HOUR',
                  'Ozone': '8-HR RUN AVG END HOUR',
                  'Carbon_monoxide': '1 HOUR',
                  'Sulfur_dioxide': '1 HOUR'}

## Create new pollution DF with all the pollutants for every instance
pollution_df = pd.DataFrame()
for year in range(2005, 2020):
    for code in common_cbsa:
        new_row = {'year': year,
                   'cbsa_code': code}

        #average all records for year and code into one instance
        for k, v in first_duration.items():
            pollutant = pollution[(pollution['year'] == year) & (pollution['cbsa_code'] == code) & (pollution['parameter'].str.contains(k.replace('_', ' ')))]
            avg_poll = pollutant[pollutant['sample_duration'] == v]
            new_row[k + '_sample_duration'] = v
            
            #If no instances matching the sample duration exist, then we look for other
            for duration in durations:
                if avg_poll.shape[0] != 0: break
                avg_poll = pollutant[pollutant['sample_duration'] == duration]
                new_row[k + '_sample_duration'] = duration
            
            if avg_poll.shape[0] == 0: 
                new_row[k + '_sample_duration'] = np.nan
            
            attributes = ['arithmetic_mean', 'standard_deviation', 'ninety_ninth_percentile', 'seventy_fifth_percentile']
            for attr in attributes:
                new_row[k + '_' + attr] = pollutant[attr].mean()

        # Add matching AQI report to index
        aqi_dict = aqi[(aqi['cbsa_code'] == code) & (aqi['year'] == year)]
        for col in aqi_dict.columns:
            if col != 'year' and col != 'cbsa_code':
                new_row[col] = aqi_dict.iloc[0, :][col] if (aqi_dict.shape[0] != 0) else None
                
        pollution_df = pollution_df.append(new_row, ignore_index=True)
        
pollution_df.sort_values(['cbsa_code', 'year'], inplace=True, axis=0)

#### Fill and Impute Missing Values

In [186]:
# Fill nan pollution values with recent year values
for code in common_cbsa:
    pollution_df[pollution_df['cbsa_code'] == code] = pollution_df[pollution_df['cbsa_code'] == code].fillna(method='ffill', axis=0) ## Fill forward to bring last completed year forward
    pollution_df[pollution_df['cbsa_code'] == code] = pollution_df[pollution_df['cbsa_code'] == code].fillna(method='bfill', axis=0) ## Fill backward to impute missing values in earlier years

In [187]:
#Impute sample duration with mode
si = impute.SimpleImputer( strategy='most_frequent')
dur_labels = [col for col in pollution_df.columns if 'duration' in col]
pollution_df[dur_labels] = si.fit_transform(pollution_df[dur_labels])

# Label encode all sample_duration columns
le = preprocessing.LabelEncoder()
le.fit(durations)
for col in dur_labels:
    pollution_df[col] = le.transform(pollution_df[col])

In [188]:
# Fill in remaining values with average
si = impute.SimpleImputer()
pollution_df = pd.DataFrame(si.fit_transform(pollution_df), columns=pollution_df.columns)

In [189]:
pollution_df.sort_values(['cbsa_code', 'year'], inplace=True, axis=0)
pollution_df.to_csv('../dataset/epa_pollution_clean.csv', index=False)

In [190]:
temp_poll = pollution_df.copy()

#### PCA

Since there are 33 attributes and only 5K instances, it may not be sufficient for a successful clustering analysis.

In [191]:
#Capture 90% variance 
pca = decomposition.PCA(n_components=.90, svd_solver='full')

#only apply PCA on the continious variables 
continuous = pollution_df.drop(['cbsa_code', 'year'], axis=1)
continuous = continuous.drop(dur_labels, axis=1)
reduced_epa = pd.DataFrame(pca.fit_transform(continuous))
reduced_epa['cbsa_code'] = pollution_df['cbsa_code']
reduced_epa['year'] = pollution_df['year']
for duration in dur_labels:
    reduced_epa[duration] = pollution_df[duration]

reduced_epa.sort_values(['cbsa_code', 'year'], inplace=True, axis=0)
# reduced_epa.to_csv('../dataset/epa_pollution_clean.csv', index=False)
reduced_epa.shape

(5085, 10)

In [192]:
pca_poll = reduced_epa.copy()

## Symbol Encoding with Clusters

### K-Means clustering on EPA pollution + AQI dataset

In [193]:
kmm = cluster.KMeans(n_clusters=5, random_state=27)
kmm.fit_predict(reduced_epa.drop(columns=['year', 'cbsa_code']))
reduced_epa['cluster'] = kmm.labels_
reduced_epa.to_csv('../dataset/epa_aqi_clustered.csv', index=False)



### K-Means clustering on AQI

In [793]:
kmm = cluster.KMeans(n_clusters=5, random_state=27)
kmm.fit_predict(aqi.drop(columns=['cbsa_code', 'year']))
aqi['cluster'] = kmm.labels_

aqi.to_csv('../dataset/aqi_clustered.csv', index=False)

### K-Means clustering on ACS

In [756]:
kmm = cluster.KMeans(n_clusters=5, random_state=27)
kmm.fit_predict(census.drop(columns=['cbsa_code', 'year']))
census['cluster'] = kmm.labels_
census.to_csv('../dataset/census_clustered.csv', index=False)

### Encode Clusters with appropriate values

In [773]:
def cluster_encode(cluster_num, data):
    census_code = [2,0,4,1,3]
    aqi_code = [2,0,1,4,3]
    
    if data == 'aqi':
        return aqi_code.index(cluster_num)
    elif data == 'census':
        return census_code.index(cluster_num)

In [800]:
cluster_df = pd.DataFrame()
# Create rows for all years and cbsa
for year in range(2005, 2020):
    for code in common_cbsa:
        #define new row
        new_row = {'year': year,
                   'cbsa_code': code}
        
        #find the rows that match the year and cbsa
        curr_census = census[(census['year'] == year) & (census['cbsa_code'] == code)] 
        new_row['census_cluster'] = cluster_encode(list(curr_census['cluster'])[0], 'census') if curr_census.shape[0] == 1 else np.nan
        curr_aqi = aqi[(aqi['year'] == year) & (aqi['cbsa_code'] == code)]
        new_row['aqi_cluster'] = cluster_encode(list(curr_aqi['cluster'])[0], 'aqi') if curr_aqi.shape[0] == 1 else np.nan
        
        # add new row 
        cluster_df = cluster_df.append(new_row, ignore_index=True)
        
    #Impute missing values
    cluster_df = cluster_df.fillna(method='bfill', axis=0, limit=3)
    cluster_df = cluster_df.fillna(method='ffill', axis=0)

cluster_df.sort_values(['cbsa_code', 'year'], inplace=True, axis=0) #sort the values
print(cluster_df.isna().isna().isna().sum())
print(cluster_df.shape)
print(len(cluster_df['cbsa_code'].unique()))

year              0
cbsa_code         0
census_cluster    0
aqi_cluster       0
dtype: int64
(5085, 4)
339


In [801]:
cluster_df['census_cluster'].value_counts()

0.0    1814
3.0    1036
4.0     982
2.0     944
1.0     309
Name: census_cluster, dtype: int64

### Cross Correlation

In [634]:
# Time lagged cross correlation
def crosscorr(datax, datay, lag=0):
    return datax.corr(datay.shift(lag))

In [802]:
lag_corr = dict()
for lag in range(-13, 13):
    correlation = dict()
    for cbsa in common_cbsa:
        df = cluster_df[cluster_df['cbsa_code'] == cbsa].drop(columns=['cbsa_code', 'year'])

        correlation[cbsa] = crosscorr( df['aqi_cluster'], df['census_cluster'], lag)
    
    lag_corr[lag] = mean([i for i in correlation.values() if not math.isnan(i)])
    
lag_corr = {k:v for k,v in sorted(lag_corr.items(), key=lambda item:item[1])}
lag_corr

{-10: -0.10340172158307664,
 1: -0.032723469608016184,
 -9: -0.026522274720328105,
 0: -0.0217434902430734,
 -2: -0.015653989957039267,
 12: -0.011811389781538374,
 -8: -0.008384093128256468,
 2: -0.003077366686860921,
 -3: -0.0011783142980342151,
 -7: 0.005172611259769706,
 4: 0.006682357615811409,
 3: 0.007588122455517431,
 -4: 0.010406259550498488,
 -5: 0.014370421503685109,
 -1: 0.017576555497538192,
 7: 0.021491551954450277,
 11: 0.023925142824821097,
 5: 0.03097266025935444,
 -6: 0.04321715855146443,
 -11: 0.046166495463699005,
 6: 0.053461696278899316,
 -12: 0.07873990276066896,
 8: 0.09657682903660737,
 10: 0.12555932999571667,
 9: 0.136801074988874,
 -13: 0.4285714285714286}

### Explore clusters

In [775]:
cluster_info = pd.DataFrame()
for clus in aqi['cluster'].unique():
    new_row = dict()
    cluster_aqi = aqi[aqi['cluster'] == clus]
    for col in cluster_aqi:
        new_row[col] = cluster_aqi[col].mean()
        
    cluster_info = cluster_info.append(new_row, ignore_index=True)
    
cluster_info.sort_values(['Unhealthy for Sensitive Groups'], inplace=True, axis=0)
cluster_info

Unnamed: 0,Good,Moderate,Unhealthy for Sensitive Groups,Unhealthy,Very Unhealthy,AQI Maximum,AQI 90th Percentile,AQI Median,# Days CO,# Days NO2,# Days O3,# Days SO2,# Days PM2.5,# Days PM10,year,cbsa_code,cluster
1,0.699155,0.286527,0.011439,0.002606,0.000273,120.769962,62.813688,36.792776,0.967681,0.519011,21.813688,4.13308,320.241445,5.577947,2012.070342,30231.444867,2.0
3,0.753406,0.219386,0.024271,0.002749,0.000187,128.579793,68.100459,41.432836,1.262342,6.883467,259.964409,5.516073,53.484501,8.85132,2012.035591,30397.370838,0.0
2,0.59523,0.35746,0.038428,0.00816,0.000722,143.806859,77.790839,47.321525,0.396661,6.148014,155.857852,7.468863,187.527076,4.204874,2012.453971,30102.423285,4.0
0,0.652223,0.285117,0.054541,0.007777,0.000342,126.627404,72.855769,40.378606,3.384615,9.680288,48.644231,80.346154,88.680288,29.134615,2008.997596,29324.855769,1.0
4,0.225852,0.568959,0.137554,0.038139,0.029496,1007.307692,123.269231,72.269231,0.346154,10.769231,164.115385,0.076923,64.192308,125.730769,2012.192308,34967.692308,3.0


In [734]:
cluster_info = pd.DataFrame()
for clus in census['cluster'].unique():
    new_row = dict()
    cluster_census = census[census['cluster'] == clus]
    for col in cluster_census:
        new_row[col] = cluster_census[col].mean()
        
    cluster_info = cluster_info.append(new_row, ignore_index=True)
    
    
cluster_info.sort_values(['High school graduate (includes equivalency)'], inplace=True, axis=0)
cluster_info.drop(columns=['cbsa_code', 'year']).to_csv('census_cluster_info.csv', index=True)
cluster_info

Unnamed: 0,cbsa_code,Less than high school graduate,High school graduate (includes equivalency),Some college or associate's degree,Bachelor's degree,Graduate or professional degree,"$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999",...,"Arts, entertainment, and recreation, and accommodation and food services","Other services, except public administration",Public administration,"Management, business, science, and arts occupations",Service occupations,Sales and office occupations,"Natural resources, construction, and maintenance occupations","Production, transportation, and material moving occupations",year,cluster
2,28582.244898,0.092537,0.230318,0.280643,0.232734,0.163767,0.03862,0.0759,0.100341,0.128229,...,0.096571,0.046662,0.049281,0.4283,0.172628,0.223991,0.065903,0.109178,2014.081633,2.0
3,33721.168385,0.273572,0.264938,0.294457,0.111885,0.055148,0.044199,0.079744,0.09963,0.109902,...,0.08671,0.048177,0.060712,0.265044,0.198824,0.231895,0.137741,0.166496,2012.426117,0.0
0,29914.238579,0.131619,0.289382,0.307854,0.174953,0.096192,0.046819,0.087202,0.10751,0.120726,...,0.095813,0.048139,0.053099,0.339805,0.174857,0.263583,0.00633,0.215426,2007.035533,4.0
4,29812.558685,0.115474,0.29599,0.324053,0.168603,0.09588,0.044087,0.082841,0.104323,0.121537,...,0.105109,0.048771,0.054602,0.341188,0.192154,0.242771,0.096926,0.126961,2014.271714,1.0
1,31342.064516,0.146841,0.366009,0.285859,0.131489,0.069801,0.047109,0.087962,0.107527,0.117636,...,0.083494,0.047885,0.038689,0.294937,0.173324,0.240301,0.053154,0.238284,2010.790323,3.0


In [704]:
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

In [705]:
similarity = dict()
for cbsa in common_cbsa:
    df = cluster_df[cluster_df['cbsa_code'] == cbsa]
    
    distance, path = fastdtw(df['census_cluster'], df['aqi_cluster'])
    similarity[cbsa] = distance
mean([i for i in similarity.values() if not math.isnan(i)]) 

16.224188790560472

In [706]:
from scipy.stats import spearmanr
covariance = dict()
for cbsa in common_cbsa:
    df = cluster_df[cluster_df['cbsa_code'] == cbsa].drop(columns=['cbsa_code', 'year'])
    
    covariance[cbsa], _ = spearmanr(df['census_cluster'], df['aqi_cluster'])
    
mean([i for i in covariance.values() if not math.isnan(i)])



0.12731013367534358

#### Cluster Info

In [735]:
cluster_info.to_csv('test.csv')

In [609]:
cluster_order = list()
for col in cluster_info.columns:
    cluster_info.sort_values([col], inplace=True, axis=0)
       
    if list(cluster_info['cluster'].copy()) not in cluster_order:
        cluster_order.append(list(cluster_info['cluster'].copy()))
    
cluster_order


[[2.0, 1.0, 4.0, 3.0, 0.0],
 [2.0, 0.0, 4.0, 1.0, 3.0],
 [2.0, 3.0, 0.0, 4.0, 1.0],
 [0.0, 3.0, 1.0, 4.0, 2.0],
 [2.0, 1.0, 0.0, 4.0, 3.0],
 [2.0, 0.0, 1.0, 4.0, 3.0],
 [0.0, 2.0, 1.0, 4.0, 3.0],
 [0.0, 3.0, 4.0, 1.0, 2.0],
 [3.0, 0.0, 4.0, 1.0, 2.0],
 [3.0, 4.0, 0.0, 1.0, 2.0],
 [2.0, 4.0, 3.0, 1.0, 0.0],
 [2.0, 3.0, 1.0, 0.0, 4.0],
 [0.0, 2.0, 4.0, 1.0, 3.0],
 [2.0, 1.0, 3.0, 4.0, 0.0],
 [2.0, 0.0, 3.0, 4.0, 1.0],
 [2.0, 4.0, 1.0, 3.0, 0.0],
 [0.0, 3.0, 1.0, 2.0, 4.0],
 [3.0, 0.0, 1.0, 4.0, 2.0],
 [0.0, 4.0, 3.0, 1.0, 2.0],
 [3.0, 0.0, 4.0, 2.0, 1.0],
 [2.0, 3.0, 4.0, 0.0, 1.0],
 [3.0, 2.0, 4.0, 1.0, 0.0],
 [2.0, 3.0, 4.0, 1.0, 0.0],
 [2.0, 0.0, 3.0, 1.0, 4.0],
 [4.0, 3.0, 2.0, 1.0, 0.0],
 [4.0, 3.0, 0.0, 2.0, 1.0],
 [0.0, 1.0, 2.0, 3.0, 4.0]]

## Classifiers

In [787]:
def k_fold_validate(folds, model, features, target, model_type, output=True):
    kf = model_selection.KFold(n_splits=folds, shuffle=True, random_state=3)
    
    rec, prec, f1 = [], [], []
    imp_df = pd.DataFrame()
    for train_index, test_index in kf.split(features):
        # Retrieve the train and test sets
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        
        # Standardize the data to optimize performance
        scaler = preprocessing.StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        # Fit the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluate the model
        rec += [metrics.recall_score(y_pred, y_test, average='weighted')]
        prec += [metrics.precision_score(y_pred, y_test, average='weighted')]
        f1 += [metrics.f1_score(y_pred, y_test, average='weighted')]
        
        # Store the feature importances for the fold
        if model_type == 'lrm':
            imps = abs(model.coef_[0])
        elif model_type == 'dtm' or model_type == 'rfm':
            imps = model.feature_importances_
        elif model_type == 'nbm':
            imps = inspection.permutation_importance(model, X_test, y_test).importances_mean
          
        imps = {k:v for k,v in enumerate(imps)}
        imp_df = imp_df.append(imps, ignore_index=True)
        
    if output:
        print("recall    = {:.4f} ±{:.4f} {}".format(np.mean(rec), np.std(rec), rec))
        print("precision = {:.4f} ±{:.4f} {}".format(np.mean(prec), np.std(prec), prec))
        print("f1        = {:.4f} ±{:.4f} {}".format(np.mean(f1), np.std(f1), f1))
    
    scores = {'recall': [np.mean(rec), np.std(rec)],
              'precision': [np.mean(prec), np.std(prec)],
              'f1': [np.mean(f1), np.std(f1)],
              'feature_ranks': {k:v for k, v in enumerate(imp_df.mean(axis=0))}            
             }        
        
    return scores

In [803]:
# Create features and target 
X = census.drop(columns=['cluster'])
target = 'aqi_cluster'
y = cluster_df[[target, 'year', 'cbsa_code']]

In [804]:
# Create rows for all years and cbsa
for year in range(2005, 2020):
    for code in common_cbsa:
        #define new row
        new_row = {'year': year,
                   'cbsa_code': code}
        
        
        #find the rows that match the year and cbsa
        curr_census = X[(X['year'] == year) & (X['cbsa_code'] == code)] 
        if curr_census.shape[0] == 0:
            new_row.update({k:None for k in X.columns})
            X = X.append(new_row, ignore_index=True)
            
        curr_aqi = y[(y['year'] == year) & (y['cbsa_code'] == code)]
        if curr_aqi.shape[0] == 0:
            new_row[target] = None
            y = y.append(new_row, ignore_index=True)
        
        
    # #Impute missing values
    # cluster_df = cluster_df.fillna(method='bfill', axis=1, limit=3)
    # cluster_df = cluster_df.fillna(method='ffill', axis=1)

X.sort_values(['cbsa_code', 'year'], inplace=True, axis=0) #sort the values
y.sort_values(['cbsa_code', 'year'], inplace=True, axis=0) #sort the values

data = X.copy()
data[target] = y[target]

# Drop na values
data = data.dropna()

# Drop irrelevant columns
X = data.drop(columns=['year', target])
y = data[target]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [805]:
X.shape

(4644, 32)

In [807]:
y.value_counts()

3.0    2102
1.0    1605
0.0     500
2.0     413
4.0      24
Name: aqi_cluster, dtype: int64

In [772]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.3, random_state=17, shuffle=True)
# Small Dataset
lrm = LogisticRegression()
lrm.fit(X_train, y_train)
lrm.score(X_test, y_test)

ValueError: Unknown label type: 'continuous'

### Decision Tree

In [808]:
# Small Dataset
dtm = DecisionTreeClassifier()
scores = k_fold_validate(5, dtm, X, y, 'dtm')

recall    = 0.3553 ±0.0141 [0.34660925726587727, 0.3326157158234661, 0.37029063509149623, 0.35844994617868675, 0.36853448275862066]
precision = 0.3514 ±0.0104 [0.34964910790631365, 0.3334426909842384, 0.36476715457681336, 0.35135074668437277, 0.357609714516184]
f1        = 0.3529 ±0.0121 [0.3478227805070351, 0.33260934531060743, 0.3673406687368896, 0.35468095488785667, 0.36229428007212]


### Random Forest

In [118]:
# Small Dataset
rfm = RandomForestClassifier()
scores = k_fold_validate(5, rfm, Xs, ys, 'rfm')
og_small_score['rfm'] = scores['f1'][0]
imp_small = imp_small.append(scores['feature_ranks'], ignore_index=True)

recall    = 0.1099 ±0.0321 [0.08536585365853659, 0.07608695652173914, 0.16666666666666666, 0.12048192771084337, 0.10112359550561797]
precision = 0.1040 ±0.0148 [0.08235294117647059, 0.08974358974358974, 0.11458333333333333, 0.11627906976744186, 0.11688311688311688]
f1        = 0.1058 ±0.0205 [0.08383233532934131, 0.08235294117647059, 0.13580246913580246, 0.1183431952662722, 0.10843373493975904]


### Naive Bayes

In [120]:
# Small Dataset
nbm = GaussianNB()
scores = k_fold_validate(5, nbm, Xs, ys, 'nbm')
og_small_score['nbm'] = scores['f1'][0]
imp_small = imp_small.append(scores['feature_ranks'], ignore_index=True)

recall    = 0.3375 ±0.1281 [0.45454545454545453, 0.14285714285714285, 0.49206349206349204, 0.34146341463414637, 0.2564102564102564]
precision = 0.3783 ±0.2863 [0.7647058823529411, 0.02564102564102564, 0.6458333333333334, 0.32558139534883723, 0.12987012987012986]
f1        = 0.3356 ±0.2082 [0.5701754385964911, 0.043478260869565216, 0.5585585585585586, 0.33333333333333337, 0.17241379310344826]
