# Set up 

## Check Environment 

In [82]:
import boto3

region = boto3.Session().region_name
session = boto3.session.Session()

ec2 = boto3.Session().client(service_name="ec2", region_name=region)
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

## Update IAM Roles and Policies

In [83]:
import sagemaker
import time
from time import gmtime, strftime

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

In [84]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


## Import Libraries

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
! pip install descartes
! pip install geopandas
!pip install wordcloud
import geopandas as gpd
from tqdm import tqdm  
from geopandas import GeoDataFrame, points_from_xy
from wordcloud import WordCloud

[0m

In [86]:
s3_public_path = "s3://sagemaker-studio-458903497716-h2kl4ff3dz/data"
s3_public_path_1="s3://sagemaker-studio-458903497716-h2kl4ff3dz/census_2010_data"
s3_public_path_2="s3://sagemaker-studio-458903497716-h2kl4ff3dz/state_abbrev_data"
s3_public_path_3="s3://sagemaker-studio-458903497716-h2kl4ff3dz/congress_data"
s3_public_path_4="s3://sagemaker-studio-458903497716-h2kl4ff3dz/state_income_data"
s3_public_path_5="s3://sagemaker-studio-458903497716-h2kl4ff3dz/cities_data"
s3_public_path_6="s3://sagemaker-studio-458903497716-h2kl4ff3dz/zipcodes"
s3_public_path_7="s3://sagemaker-studio-458903497716-h2kl4ff3dz/adjusted_gross_incomes"
s3_public_path_7="s3://sagemaker-studio-458903497716-h2kl4ff3dz/temp"

In [87]:
%store s3_public_path
%store s3_public_path_1
%store s3_public_path_2
%store s3_public_path_3
%store s3_public_path_4
%store s3_public_path_5
%store s3_public_path_6
%store s3_public_path_7
%store s3_public_path_8

Stored 's3_public_path' (str)
Stored 's3_public_path_1' (str)
Stored 's3_public_path_2' (str)
Stored 's3_public_path_3' (str)
Stored 's3_public_path_4' (str)
Stored 's3_public_path_5' (str)
Stored 's3_public_path_6' (str)
Stored 's3_public_path_7' (str)


UsageError: Unknown variable 's3_public_path_8'


In [None]:
s3_private_path = "s3://{}/gun_violence_data".format(bucket)
s3_private_path_1 = "s3://{}/census2010_data".format(bucket)
s3_private_path_2 = "s3://{}/state_abbrev_data".format(bucket)
s3_private_path_3= "s3://{}/congress_data".format(bucket)
s3_private_path_4= "s3://{}/state_income_data".format(bucket)
s3_private_path_5= "s3://{}/cities_data".format(bucket)
s3_private_path_6= "s3://{}/zipcodes".format(bucket)
s3_private_path_7= "s3://{}/adjusted_gross_incomes".format(bucket)
s3_private_path_8= "s3://{}/temp".format(bucket)

print(s3_private_path)
print(s3_private_path_1)
print(s3_private_path_2)
print(s3_private_path_3)
print(s3_private_path_4)
print(s3_private_path_5)
print(s3_private_path_6)
print(s3_private_path_7)
print(s3_private_path_8)

In [None]:
%store s3_private_path
%store s3_private_path_1
%store s3_private_path_2
%store s3_private_path_3
%store s3_private_path_4
%store s3_private_path_5
%store s3_private_path_6
%store s3_private_path_7
%store s3_private_path_8

In [None]:
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --include "*" 
!aws s3 cp --recursive $s3_public_path_1/ $s3_private_path_1/ --include "*"
!aws s3 cp --recursive $s3_public_path_2/ $s3_private_path_2/ --include "*" 
!aws s3 cp --recursive $s3_public_path_3/ $s3_private_path_3/ --include "*" --acl bucket-owner-full-control
!aws s3 cp --recursive $s3_public_path_4/ $s3_private_path_4/ --include "*" 
!aws s3 cp --recursive $s3_public_path_5/ $s3_private_path_5/ --include "*" 
!aws s3 cp --recursive $s3_public_path_6/ $s3_private_path_6/ --include "*" 
!aws s3 cp --recursive $s3_public_path_7/ $s3_private_path_7/ --include "*" 
!aws s3 cp --recursive $s3_public_path_8/ $s3_private_path_8/ --include "*" 

# Create Database Schema in Athena

In [None]:
import boto3
import sagemaker

sess = sagemaker.Session()
#bucket = '{}/gunData'.format(bucket)
bucket = '{}'.format(bucket)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket

In [None]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

In [None]:
#assign database name
database_name = "ads508"

In [None]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [None]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [None]:
#create new database 'ads508'
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
create_db = pd.read_sql(statement, conn)
create_db

In [None]:
#verify database creation
q = "SHOW DATABASES"
db_show = pd.read_sql(q, conn)
db_show

## Create Athena Tables

In [None]:
# Set Athena parameters
database_name = "ads508"
table_name_csv = "gun_violence"
s3_path = "s3://{}/gun_violence_data".format(bucket)
print(s3_path)

In [None]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         incident_id int,
         date date,
         state string,
         city_or_county string,
         address string,
         n_killed int,
         n_injured int,
         incident_url string,
         source_url string,
         incident_url_fields_missing string,
         congressional_district int,
         gun_stolen string,
         gun_type string,
         incident_characteristics string,
         latitude int,
         location_description string,
         longitude int,
         n_guns_involved int,
         notes string,
         participant_age string,
         participant_age_group string,
         participant_gender string,
         participant_name string,
         participant_relationship string,
         participant_status string,
         participant_type string,
         sources string,
         state_house_district int,
         state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_path
)

create_table = pd.read_sql(statement, conn)
create_table

In [None]:
table_name_csv_1 = "est2018"
s3_path_1 = "s3://{}/census2010_data".format(bucket)
print(s3_path_1)

In [None]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    sumlev int,
    state int,
    county int,
    place int,
    cousub int,
    concit int,
    primgeo_flag int,
    funcstat int,
    name string,
    stname string,
    census2010pop int,
    estimatesbase2010 int,
    popestimates2010 int,
    popestimates2011 int,
    popestimates2012 int,
    popestimates2013 int,
    popestimates2014 int,
    popestimates2015 int,
    popestimates2016 int,
    popestimates2017 int,
    popestimates2018 int
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_1, s3_path_1
)

create_table_1 = pd.read_sql(statement, conn)
create_table_1

In [None]:
table_name_csv_2 = "state_abbrev"
s3_path_2 = "s3://{}/state_abbrev_data".format(bucket)
print(s3_path_2)

In [None]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
   state string,
   abbrev string,
   code string
         
) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_2, s3_path_2
)

create_table_2 = pd.read_sql(statement, conn)
create_table_2

In [None]:
table_name_csv_3 = "congress_2013_to_2018"
s3_path_3 = "s3://{}/congress_data".format(bucket)
print(s3_path_3)

In [None]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
 id string,
 name string,
 sort_name string,
 email string,
 twitter string,
 facebook string,
 group string,
 group_id string,
 state string,
 district int,
 chamber string,
 year int,
 start_date date,
 end_date date,
 image string,
 gender string,
 wikidata string,
 wikidata_group string,
 wikidata_area string
 
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_3, s3_path_3
)

create_table_3 = pd.read_sql(statement, conn)
create_table_3

In [None]:
table_name_csv_4 = "state_income"
s3_path_4 = "s3://{}/state_income_data".format(bucket)
print(s3_path_4)

In [None]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
 id int,
 zipcode int,
 agi int,
 avg_agi int,
 groups string,
 avg_income int
 
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_4, s3_path_4
)

create_table_4 = pd.read_sql(statement, conn)
create_table_4

In [None]:
table_name_csv_5 = "cities"
s3_path_5 = "s3://{}/cities_data".format(bucket)
print(s3_path_5)

In [None]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    city string,
    city_ascii string,
    state_id string,
    state_name string,
    county_fips int,
    county_name string,
    lat string,
    lng string,
    population string,
    density string,
    source string,
    military string,
    incorporated string,
    timezone string,
    ranking int,
    zips string,
    id int)
    
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
WITH SERDEPROPERTIES ( 
  'quoteChar'='\"', 
  'separatorChar'=',') 
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_5, s3_path_5
)

create_table_5 = pd.read_sql(statement, conn)
create_table_5

In [None]:
statement = "SHOW TABLES in ads508"
tables = pd.read_sql(statement, conn)
tables

In [None]:
statement = "SELECT * from ads508.gun_violence LIMIT 100"
df = pd.read_sql(statement, conn)
df.head(5)

In [None]:
statement = "SELECT * from ads508.est2018 LIMIT 100"
df_pop = pd.read_sql(statement, conn)
df_pop.head(5)

In [None]:
statement = "SELECT * from ads508.congress_2013_to_2018 LIMIT 100"
df_cong = pd.read_sql(statement, conn)
df_cong.head(5)

In [None]:
statement = "SELECT * from ads508.state_abbrev LIMIT 100"
df_abbrev = pd.read_sql(statement, conn)
df_abbrev.head(5)

In [None]:
statement = "SELECT * from ads508.state_income LIMIT 100"
df_income = pd.read_sql(statement, conn)
df_income.head(5)

In [None]:
statement = "SELECT * from ads508.cities LIMIT 100"
df_cities = pd.read_sql(statement, conn)
df_cities.head(5)

# Insert Dataset into Pandas Dataframe

In [140]:
#df = pd.read_csv("{}/modeling/notebook2.csv".format(s3_path))
import pandas as pd
#df = pd.read_csv("https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/data_for_modeling.csv")
df = pd.read_csv("s3://sagemaker-us-east-1-898900188658/raw_files/train/training_set_backup-2.csv")
#df = pd.read_csv("s3://sagemaker-studio-458903497716-h2kl4ff3dz/modeling/data_for_modeling.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,target_class,Unnamed: 0.1,latitude,longitude,n_guns_involved,group_Democrat,ohe_drug,ohe_officer,ohe_gang,...,suspect_age_group_Adult,suspect_age_group_Senior,region_East South Central,region_Middle Atlantic,region_Mountain,region_New England,region_Pacific,region_South Atlantic,region_West North Central,region_West South Central
0,0,1,0,40.3467,-79.8559,1.0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,1,40.4555,-79.897,1.0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2,1,2,33.909,-118.333,1.0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,1,3,33.8447,-118.307,1.0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,1,4,33.9454,-118.399,1.0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [141]:
#Divide train set by .7, test set by .15, and valid set .15
from sklearn.model_selection import train_test_split

size_train = 30500
size_valid = 6536
size_test = 6536
size_total = size_test + size_valid + size_train
train, test = train_test_split(df, train_size = size_train,\
                               random_state = 777)
valid, test = train_test_split(test, train_size = size_valid,\
                               random_state = 777)

print('Training size:', size_train)
print('Validation size:', size_valid)
print('Test size:', size_test)
print('Total size:', size_train + size_valid + size_test)
print('Training percentage:', round(size_train/(size_total),2))
print('Validation percentage:', round(size_valid/(size_total),2))
print('Test percentage:', round(size_test/(size_total),2))

Training size: 30500
Validation size: 6536
Test size: 6536
Total size: 43572
Training percentage: 0.7
Validation percentage: 0.15
Test percentage: 0.15


In [142]:
# Define IAM role
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import io
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()

# set the region of the instance
my_region = boto3.session.Session().region_name 

# this line automatically looks for the XGBoost image URI and 
# builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", 
                                                  my_region, 
                                                  "latest")

print("Success - the MySageMakerInstance is in the " + my_region + \
      " region. You will use the " + xgboost_container + \
      " container for your SageMaker endpoint.")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [143]:
train, test = np.split(df.sample(frac=1, random_state=777), 
                                 [int(0.7 * len(df))])
print(train.shape, test.shape)

(154242, 42) (66104, 42)


In [150]:
train.drop(columns=train.columns[0], axis=1,  inplace=True)
print(train.head(5))

        target_class  Unnamed: 0.1  latitude  longitude  n_guns_involved  \
80490              1         80490   41.3156   -72.8972              1.0   
6157               1          6157   40.8586   -73.8911              1.0   
67175              0         67175   39.7533   -84.1558              2.0   
123310             1        123310   40.6759   -89.6261              1.0   
148005             1        148005   44.9512   -69.4184              1.0   

        group_Democrat  ohe_drug  ohe_officer  ohe_gang  ohe_accident  ...  \
80490                1         0            0         0             0  ...   
6157                 1         0            0         0             0  ...   
67175                0         0            1         0             0  ...   
123310               1         0            0         0             0  ...   
148005               0         0            0         0             1  ...   

        suspect_age_group_Adult  suspect_age_group_Senior  \
80490        

In [151]:
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
KEY='raw_files/train/training_set_backup-2.csv'
response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

with io.StringIO() as csv_buffer:
    train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

In [152]:
# input training parameters
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=\
        's3://{}/raw_files/train'.format(bucket), content_type='csv')

In [153]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, 
                                    instance_count=1, 
                                    instance_type='ml.m5.large',
                                    output_path='s3://{}/output'.format(bucket),
                                    sagemaker_session=sess)
# parse in the hyperparameters
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,
                        subsample=0.8,silent=0,
                        objective='binary:logistic',num_round=100)

In [123]:
df_temp = pd.read_csv("s3://sagemaker-us-east-1-898900188658/raw_files/train/training_set_backup-2.csv", skiprows=1)
df_temp.to_csv("s3://sagemaker-us-east-1-898900188658/raw_files/train/training_set_backup-2.csv")
print(df_temp.columns)
print(s3_input_train)

Index(['0', '1', '0.1', '40.3467', '-79.8559', '1.0', '1.1', '0.2', '0.3',
       '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1.2', '0.10', '0.11', '0.12',
       '1.3', '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19', '1.4',
       '0.20', '0.21', '0.22', '1.5', '0.23', '0.24', '0.25', '1.6', '0.26',
       '0.27', '0.28', '0.29', '0.30', '0.31'],
      dtype='object')
<sagemaker.inputs.TrainingInput object at 0x7f6a3e645850>


In [154]:
xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-04-08-07-20-38-376


2023-04-08 07:20:38 Starting - Starting the training job...
2023-04-08 07:20:54 Starting - Preparing the instances for training...
2023-04-08 07:21:42 Downloading - Downloading input data......
2023-04-08 07:22:22 Training - Downloading the training image...
2023-04-08 07:22:58 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2023-04-08:07:23:02:INFO] Running standalone xgboost training.[0m
[34m[2023-04-08:07:23:02:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-04-08:07:23:02:INFO] File size need to be processed in the node: 14.9mb. Available memory size in the node: 404.21mb[0m
[34m[2023-04-08:07:23:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:23:02] S3DistributionType set as FullyReplicated[0m
[34m[07:23:02] 154242x40 matrix with 6169680 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[07:23:03] src/tree/updater_prune.cc:74: tree pruning end, 

In [155]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: xgboost-2023-04-08-07-24-23-809
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-04-08-07-24-23-809
INFO:sagemaker:Creating endpoint with name xgboost-2023-04-08-07-24-23-809


----!

In [None]:
from sagemaker.serializers import CSVSerializer

# load the data into an array
test_array = test.drop(['targ'], axis=1).values

# set the serializer type
xgb_predictor.serializer = CSVSerializer() 

# predict!
predictions = xgb_predictor.predict(test_array).decode('utf-8') 

# and turn the prediction into an array
predictions_array = np.fromstring(predictions[1:], sep=',') 
print(predictions_array.shape)
