# In-Depth Analysis of Region Data

Beginning by importing modules necessary for machine learning analysis.

In [42]:
# import modules and libraries
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
c_south = pd.read_csv('Clean_Data/clean_south.csv')
c_midwest = pd.read_csv('Clean_Data/clean_midwest.csv')
c_midwest = c_midwest.rename(columns={'Midwestern?':'Midwestern'})
ohe = OneHotEncoder()
mlb = MultiLabelBinarizer()

In [3]:
south=c_south[['Unnamed: 0', 'Southern?']]

In [4]:
#solving issue where Southern? column is in string format and not an object.
ast.literal_eval(south['Southern?'][0])

#turn each item in the southern? series into a list
empty = []
for item in south['Southern?']:
    
    py_lst = ast.literal_eval(item)
    empty.append(py_lst)

#turning the list back into a series
list_southern = pd.Series(empty)
c_south = c_south.rename(columns={'Southern?':'Southern'})
c_south = c_south.assign(Southern=list_southern)

In [82]:
#Creating columns with binary values for if a state or region was selected for each respondent
mlb.fit_transform(c_south['Southern'])
mlb.classes_

southern_binary = pd.DataFrame(mlb.fit_transform(c_south['Southern']), columns=mlb.classes_)
cr_binary = pd.get_dummies(c_south['Census Region'])
south_deg_ident=c_south[['Degree of ident.']]
svc_df = south_deg_ident.join(southern_binary)
svc_df = svc_df.join(cr_binary)

#Dataframe for SVC containing region, states voted, and degree of ident
svc_df.head()

Unnamed: 0,Degree of ident.,Alabama,Arizona,Arkansas,Colorado,Delaware,Florida,Georgia,Illinois,Indiana,...,West Virginia,East North Central,East South Central,Middle Atlantic,Mountain,New England,Pacific,South Atlantic,West North Central,West South Central
0,Some,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Not much,1,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,Not at all,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,A lot,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Not at all,1,0,1,0,0,1,1,0,0,...,1,0,0,0,0,0,0,1,0,0


In [6]:
#creating x's and y's to train models on.
ys = svc_df['Degree of ident.']
xs = svc_df.drop('Degree of ident.', axis=1)

#train_test_split for southern dataset
xtrain_s, xtest_s, ytrain_s, ytest_s = train_test_split(xs, ys, test_size=0.3, random_state=42)

svc = SVC()

In [7]:
#Need to do some cross validation with gridsearchcv to ensure hyperparameters are tuned correctly.
#likely there would be overfitting without it.
#setting param_grid for 'linear', 'rbf', and 'sigmoid' style kernels to see if other kernels work better.
param_grid = [{'C':[1, 10, 100, 1000],
               'kernel': ['linear']},
             {'C':[1, 10, 100, 1000],
              'gamma': [0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']},
             {'C':[1, 10, 100, 1000],
              'gamma': [0.1, 0.01, 0.001, 0.0001],
              'coef0': np.logspace(-3,3,4),
              'kernel': ['sigmoid']}]
svc_cv = GridSearchCV(svc, param_grid, cv=3)

svc_cv.fit(xtrain_s, ytrain_s)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000],
                          'gamma': [0.1, 0.01, 0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000],
                          'coef0': array([1.e-03, 1.e-01, 1.e+01, 1.e+03]),
                          'gamma': [0.1, 0.01, 0.001, 0.0001],
                          'kernel': ['sigmoid']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=Fals

In [8]:
#show which parameters are the best for southern dataset.
svc_cv.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [9]:
#get classification report for southern dataset
cv_pred = svc_cv.predict(xtest_s)

print(classification_report(ytest_s, cv_pred))

              precision    recall  f1-score   support

       A lot       0.42      0.72      0.53       169
  Not at all       0.70      0.66      0.68       257
    Not much       0.31      0.14      0.19       111
        Some       0.24      0.16      0.19       142

    accuracy                           0.49       679
   macro avg       0.42      0.42      0.40       679
weighted avg       0.47      0.49      0.46       679



Using the support vector machine algorithm for the southern dataset, we are able to get a predictive model for with about 49% accuracy.

### Midwest Dataset SVC


In [10]:
#turn each item in the midwestern? series into a list
midwest_st = []
for item in c_midwest['Midwestern']:
    
    py_lst = ast.literal_eval(item)
    midwest_st.append(py_lst)

In [11]:
#turning the list back into a series
list_midwest = pd.Series(midwest_st)
c_midwest= c_midwest.assign(Midwestern=list_midwest)

#breaking down midwestern states votes and census region into binary values.
midwestern_binary = pd.DataFrame(mlb.fit_transform(c_midwest['Midwestern']), columns=mlb.classes_)
cr_binary_m = pd.get_dummies(c_midwest['Census Region'])
midwest_deg_ident=c_midwest[['Degree of ident.']]

#creating a single dataframe with degree of ident, census region, and midwestern states for the SVC.
svc_df_m = midwest_deg_ident.join(midwestern_binary)
svc_df_m = svc_df_m.join(cr_binary_m)

In [81]:
svc_df_m.head()

Unnamed: 0,Degree of ident.,Arkansas,Colorado,Illinois,Indiana,Iowa,Kansas,Kentucky,Michigan,Minnesota,...,Wyoming,East North Central,East South Central,Middle Atlantic,Mountain,New England,Pacific,South Atlantic,West North Central,West South Central
0,Not much,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Not much,1,0,1,0,1,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0
2,A lot,0,0,1,1,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0
3,A lot,0,0,1,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,Some,0,0,1,1,1,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0


In [13]:
#creating x's and y's to train models on for svc_df_m.
ym = svc_df_m['Degree of ident.']
xm = svc_df_m.drop('Degree of ident.', axis=1)

#train_test_split for southern dataset
xtrain_m, xtest_m, ytrain_m, ytest_m = train_test_split(xm, ym, test_size=0.3, random_state=42)

In [17]:
#Gridsearch for optimal parameters using same parameters as above
param_grid = [{'C':[0.1, 1, 10, 100, 1000],
               'kernel': ['linear']},
             {'C':[0.1, 1, 10, 100, 1000],
              'gamma': [0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']},
             {'C':[0.1, 1, 10, 100, 1000],
              'gamma': [0.1, 0.01, 0.001, 0.0001],
              'coef0': np.logspace(-3,3,4),
              'kernel': ['sigmoid']}]
svc_cv = GridSearchCV(svc, param_grid, cv=3)

svc_cv.fit(xtrain_m, ytrain_m)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [0.1, 1, 10, 100, 1000],
                          'gamma': [0.1, 0.01, 0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [0.1, 1, 10, 100, 1000],
                          'coef0': array([1.e-03, 1.e-01, 1.e+01, 1.e+03]),
                          'gamma': [0.1, 0.01, 0.001, 0.0001],
                          'kernel': ['sigmoid']}],
             pre_dispatch='2*n_jobs', refit=True, return_t

In [18]:
#Show which parameters are the best
svc_cv.best_params_

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

In [19]:
#Get classification report
cv_pred_m = svc_cv.predict(xtest_m)

print(classification_report(ytest_m, cv_pred_m))

              precision    recall  f1-score   support

       A lot       0.52      0.83      0.64       211
  Not at all       0.68      0.92      0.78       289
    Not much       0.00      0.00      0.00        87
        Some       0.00      0.00      0.00       142

    accuracy                           0.61       729
   macro avg       0.30      0.44      0.36       729
weighted avg       0.42      0.61      0.50       729



  'precision', 'predicted', average, warn_for)


Here it appears that the 'Not much' and 'Some' labels for the midwest dataset do not produce any precision or recall values. This is interesting considering these are more ambiguious states of self-identification rather than those that really identify as being a southerner (with the "A lot" label) and those that do not (with the "Not at all" label). It is then with this model of Support Vector Classifier that we are able to get a 61% accuracy.

# Random Forest Model
#### Running random forest models for the south and midwestern dataset to see if these yield better results.

### South Dataset: Random Forest

In [33]:
#creating a dataframe for the random forest classifier for the southern data, spliiting all categorical data into
#binary values and columns.
rf_df_s = svc_df.join(pd.get_dummies(c_south['Gender']))
rf_df_s = rf_df_s.join(pd.get_dummies(c_south['Age Range']))
rf_df_s = rf_df_s.join(pd.get_dummies(c_south['Income']))
rf_df_s = rf_df_s.join(pd.get_dummies(c_south['Education']))
rf_df_s.columns

Index(['Degree of ident.', 'Alabama', 'Arizona', 'Arkansas', 'Colorado',
       'Delaware', 'Florida', 'Georgia', 'Illinois', 'Indiana', 'Kansas',
       'Kentucky', 'Louisiana', 'Maryland', 'Mississippi', 'Missouri',
       'New Mexico', 'North Carolina', 'Ohio', 'Oklahoma', 'Pennsylvania',
       'South Carolina', 'Tennessee', 'Texas', 'Virginia', 'West Virginia',
       'East North Central', 'East South Central', 'Middle Atlantic',
       'Mountain', 'New England', 'Pacific', 'South Atlantic',
       'West North Central', 'West South Central', 'Female', 'Male', '18-29',
       '30-44', '45-60', '> 60', '$0 - $24,999', '$100,000 - $149,999',
       '$150,000+', '$25,000 - $49,999', '$50,000 - $99,999',
       'Associate or bachelor degree', 'Graduate degree', 'High school degree',
       'Less than high school degree', 'Some college'],
      dtype='object')

In [37]:
#splitting labeled data and features.
ys_rf = rf_df_s['Degree of ident.']
xs_rf = rf_df_s.drop('Degree of ident.', axis=1)

#split training and testing data for random forest classifier
xtrain_rf_s, xtest_rf_s, ytrain_rf_s, ytest_rf_s = train_test_split(xs_rf, ys_rf, test_size=0.3, random_state=42)

#calling model
rfc = RandomForestClassifier(n_estimators=100)

rfc.fit(xtrain_rf_s, ytrain_rf_s)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
trainscore = rfc.score(xtrain_rf_s, ytrain_rf_s)
testscore = rfc.score(xtest_rf_s, ytest_rf_s)
print(f'Training score:{trainscore}')
print(f'Test score:{testscore}')

Training score:0.9968394437420987
Test score:0.4742268041237113


In [46]:
#will try to tune hyperparameters to see if can get better test score accuracy
max_depth =[int(x) for x in np.linspace(10,110, num=11)]
max_depth.append(None)
random_grid = {'n_estimators' : [int(x) for x in np.linspace(200, 2000, num=10)],
               'max_features' : ['auto','sqrt'],
               'max_depth' : max_depth,
               'min_samples_split' : [2,5,10],
               'min_samples_leaf' : [1,2,4],
               'bootstrap':[True, False]}

random_search = RandomizedSearchCV(rfc, random_grid, cv=3)

#train random search
random_search.fit(xtrain_rf_s, ytrain_rf_s)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [47]:
random_search.best_params_

{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

In [68]:
random_forest_grid = {
                      'n_estimators' : [700,800,900,1000],
                      'max_depth' : [20,30,40,50],
                      'min_samples_split' : [8,10,12],
                      'min_samples_leaf' : [1,2,3],
                      'bootstrap':[True]}

rfc_cv = GridSearchCV(rfc, random_forest_grid, cv=3, n_jobs=-1, verbose=2)

rfc_cv.fit(xtrain_rf_s, ytrain_rf_s)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  2.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='w

In [69]:
rfc_cv.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 8,
 'n_estimators': 700}

In [70]:
cv_rf_pred= rfc_cv.predict(xtest_rf_s)

print(classification_report(ytest_rf_s, cv_rf_pred))

              precision    recall  f1-score   support

       A lot       0.45      0.59      0.51       169
  Not at all       0.64      0.75      0.69       257
    Not much       0.29      0.10      0.15       111
        Some       0.32      0.27      0.29       142

    accuracy                           0.51       679
   macro avg       0.43      0.43      0.41       679
weighted avg       0.47      0.51      0.48       679



We can see from the classification report that the accuracy for the south dataset increased to 51% from our max with the support vector machine of 49%. Considering that we were still able to get so close to the 51% accuracy with fewer features goes to show that many of the features from this dataset are not as powerful as the actual census region people are from and which states they voted for as southern.

### Midwest Dataset: Random Forest

In [71]:
#creating a dataframe for the random forest classifier for the midwest data.
rf_df_m = svc_df_m.join(pd.get_dummies(c_midwest['Gender']))
rf_df_m = rf_df_m.join(pd.get_dummies(c_midwest['Age Range']))
rf_df_m = rf_df_m.join(pd.get_dummies(c_midwest['Income']))
rf_df_m = rf_df_m.join(pd.get_dummies(c_midwest['Education']))
rf_df_m.columns

Index(['Degree of ident.', 'Arkansas', 'Colorado', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Michigan', 'Minnesota', 'Missouri',
       'Montana', 'Nebraska', 'North Dakota', 'Ohio', 'Oklahoma',
       'Pennsylvania', 'South Dakota', 'West Virginia', 'Wisconsin', 'Wyoming',
       'East North Central', 'East South Central', 'Middle Atlantic',
       'Mountain', 'New England', 'Pacific', 'South Atlantic',
       'West North Central', 'West South Central', 'Female', 'Male', '18-29',
       '30-44', '45-60', '> 60', '$0 - $24,999', '$100,000 - $149,999',
       '$150,000+', '$25,000 - $49,999', '$50,000 - $99,999',
       'Associate or bachelor degree', 'Graduate degree', 'High school degree',
       'Less than high school degree', 'Some college'],
      dtype='object')

In [72]:
#splitting labeled data and features.
ym_rf = rf_df_m['Degree of ident.']
xm_rf = rf_df_m.drop('Degree of ident.', axis=1)

#split training and testing data for random forest classifier
xtrain_rf_m, xtest_rf_m, ytrain_rf_m, ytest_rf_m = train_test_split(xm_rf, ym_rf, test_size=0.3, random_state=42)

In [73]:
#fitting random search grid to find approximate best parameters for random forest model

random_search.fit(xtrain_rf_m, ytrain_rf_m)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [74]:
random_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': False}

In [78]:
random_forest_grid = {
                      'n_estimators' : [900,1000,1100,1200],
                      'max_depth' : [10,20,30],
                      'min_samples_split' : [8,10,12],
                      'min_samples_leaf' : [1,2,3],
                      'bootstrap':[True,False]}

rfc_cv = GridSearchCV(rfc, random_forest_grid, cv=3, n_jobs=-1, verbose=2)

rfc_cv.fit(xtrain_rf_m, ytrain_rf_m)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed:  4.4min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='w

In [79]:
rfc_cv.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 12,
 'n_estimators': 1200}

In [80]:
cv_rf_pred=rfc_cv.predict(xtest_rf_m)

print(classification_report(ytest_rf_m, cv_rf_pred))

              precision    recall  f1-score   support

       A lot       0.55      0.82      0.65       211
  Not at all       0.67      0.94      0.78       289
    Not much       0.00      0.00      0.00        87
        Some       0.14      0.01      0.01       142

    accuracy                           0.61       729
   macro avg       0.34      0.44      0.36       729
weighted avg       0.45      0.61      0.50       729



  'precision', 'predicted', average, warn_for)


Here we again come across the issue of the label of 'not much' yeilding a 0 in precision and recall. While the accruacy still remains close to the 61% that we had reached with the Support Vector Classifier. Yet, we can see that we were able to achieve a f1-score for the those that identified as 'Some' as being considered a midwesterner. This does not mean that it is a much better predictor than our support vector machine, but it is able to measure some qualities for the "Some" label.