In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# Platform-style run variable (hex_ prefix): decodes the JSON literal `false` to the Python bool False.
hex_scheduled = _hex_json.loads("false")

In [None]:
# Decodes a JSON string literal into a Python str containing an e-mail address.
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
# Decodes to the str "logic"; presumably names the mode this run was started in — TODO confirm.
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
# Decodes to the str "US/Eastern"; not referenced by the cells shown in this notebook.
hex_timezone = _hex_json.loads("\"US/Eastern\"")

In [None]:
# Decodes to a UUID-formatted str identifying the project.
hex_project_id = _hex_json.loads("\"9f93c61b-31ae-4882-b203-f1885dbc6f8f\"")

In [None]:
# Decodes to the project's display name as a str.
hex_project_name = _hex_json.loads("\"Human Trafficking Predictions_preprocessing\"")

In [None]:
# Decodes to an empty str (no status value is set).
hex_status = _hex_json.loads("\"\"")

In [None]:
# Decodes to an empty list (no categories are set).
hex_categories = _hex_json.loads("[]")

In [None]:
# Decodes to a list of ten "#RRGGBB" colour strings; not referenced by the cells shown in this notebook.
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

In [None]:
import pandas as pd
import numpy as np
import altair as alt

## Variables from the CTDC data
- gender
- ageBroad
- citizenship
- CountryOfExploitation (for training)


In [None]:
# Read the raw CTDC victim records from disk, then show how many rows were loaded.
data = pd.read_csv("Trafficking_Data.csv")
len(data)

193025

In [None]:
# NOTE(review): 193025 equals the raw row count displayed by the load cell above;
# 156616 is presumably the number of rows kept after cleaning — TODO confirm, and
# derive both numbers from the data instead of hard-coding them.
156616/193025


0.8113767646677892

In [None]:
# Clean the raw CTDC frame in one chained expression:
#   1. keep only the variables used downstream (this also discards
#      majorityStatusAtExploit, so no separate drop is required),
#   2. drop records missing gender, registration year, citizenship or
#      country of exploitation,
#   3. store the registration year as an integer.
# ageBroad is not part of the null filter, so rows with a missing age bracket
# are kept. Chaining returns a fresh frame at every step instead of assigning
# a column into a boolean-filtered frame, which is the pattern that triggers
# pandas' SettingWithCopyWarning.
data = (
    data
    .loc[:, ["yearOfRegistration", "citizenship", "CountryOfExploitation", "ageBroad", "gender"]]
    .dropna(subset=["gender", "yearOfRegistration", "citizenship", "CountryOfExploitation"])
    .astype({"yearOfRegistration": int})
)

In [None]:
# Display the cleaned frame (pandas truncates the rendering of long frames).
data

Unnamed: 0,yearOfRegistration,citizenship,CountryOfExploitation,ageBroad,gender
0,2015,UKR,RUS,30--38,Male
1,2015,UKR,RUS,30--38,Male
2,2015,UKR,RUS,30--38,Male
3,2015,UKR,RUS,30--38,Male
4,2015,UKR,RUS,30--38,Male
...,...,...,...,...,...
159078,2021,MMR,BGD,,Male
159079,2021,SEN,MRT,,Male
159080,2021,UZB,UZB,,Male
159081,2021,UZB,UZB,,Male


In [None]:
# Fraction of records in which the victim was exploited in their own country of
# citizenship. `ratio` holds the matching rows; the cell displays the share.
ratio = data.loc[data['citizenship'].eq(data['CountryOfExploitation'])]
ratio.shape[0] / data.shape[0]

0.4012194787003214

## HTI Variables
 - tier 
    - how compliant a country is with anti-trafficking regulations
 - internal 
    - how likely a country is to have internal trafficking
 - enforcement 
    - how much effort a country puts into finding victims
 - protectprogress & victimservices
    - how far a country has come in protecting its citizens from trafficking
 

In [None]:
# Load the HTI country-year panel and keep only the identifier and policy columns
# used later. convert_categoricals=False keeps Stata value-labelled columns as
# their raw numeric codes rather than converting them to pandas Categoricals.
# NOTE(review): the preview shows tier == -99.0 for some rows, which looks like a
# missing-value sentinel — confirm before treating tier as a numeric feature.
data2 = pd.read_stata("HTI_v1.dta", convert_categoricals=False).loc[
    :,
    ['country', 'year', 'tier', 'internal', 'enforcement', 'protectprogress', 'victimservices'],
]
data2

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  data2 = pd.read_stata("HTI_v1.dta", convert_categoricals=False)


Unnamed: 0,country,year,tier,internal,enforcement,protectprogress,victimservices
0,United States,2001,-99.0,0,0,1,0
1,United States,2002,-99.0,0,0,0,0
2,United States,2003,-99.0,0,2,1,-1
3,United States,2004,-99.0,0,2,1,-1
4,United States,2005,-99.0,0,2,1,-1
...,...,...,...,...,...,...,...
2658,Micronesia,2013,2.0,1,-1,-1,-1
2659,Micronesia,2014,2.0,1,1,-1,1
2660,Micronesia,2015,2.0,1,1,-1,1
2661,Micronesia,2016,2.0,1,1,-1,1


In [None]:
# Lookup table from ISO alpha-3 code to country name. The two source tables use
# different country identifiers (alpha-3 codes vs. country names), so this table
# bridges them.
# The alpha-3 values carry stray double quotes and surrounding whitespace, so
# they are normalised before being used as a join key. Building the frame in a
# single chain avoids assigning into a column-sliced frame, the pattern that
# triggers pandas' SettingWithCopyWarning; regex=False makes the quote removal
# an explicit literal replacement.
# NOTE(review): if an alpha-3 code appears more than once in this file, the left
# merge that uses it will duplicate victim rows — worth checking for duplicates.
country_codes = (
    pd.read_csv('countries_codes_and_coordinates.csv')
    .loc[:, ['Country', 'Alpha-3 code']]
    .assign(**{
        'Alpha-3 code': lambda codes: (
            codes['Alpha-3 code'].str.replace('"', '', regex=False).str.strip()
        )
    })
)

In [None]:
# Attach the country name matching each victim's citizenship code. The left join
# keeps every victim record; the helper key column is discarded afterwards.
data = (
    data
    .merge(country_codes, how="left", left_on="citizenship", right_on="Alpha-3 code")
    .drop(columns="Alpha-3 code")
)

In [None]:
# Attach the HTI policy indicators for the victim's country of citizenship in the
# year the case was registered.
# An inner join keeps exactly the victim records that have a matching HTI
# country-year. A right join would also emit one row per unmatched HTI
# country-year with every victim field set to NaN, and would upcast
# yearOfRegistration to float to hold those NaNs.
data = pd.merge(
    data,
    data2,
    left_on=["Country", "yearOfRegistration"],
    right_on=["country", "year"],
    how="inner",
)
# The HTI key columns duplicate Country / yearOfRegistration after the join.
data = data.drop(columns=["year", "country"])

In [None]:
# Encode a missing age bracket as the integer 0 so that ageBroad has no NaN left.
data.loc[data["ageBroad"].isna(), "ageBroad"] = 0
# TODO: the other variables can still be NA; choose a sentinel value to distinguish those cases as well.
data

Unnamed: 0,yearOfRegistration,citizenship,CountryOfExploitation,ageBroad,gender,Country,tier,internal,enforcement,protectprogress,victimservices
0,,,,0,,,-99.0,0,0,1,0
1,,,,0,,,-99.0,0,0,0,0
2,,,,0,,,-99.0,0,2,1,-1
3,,,,0,,,-99.0,0,2,1,-1
4,,,,0,,,-99.0,0,2,1,-1
...,...,...,...,...,...,...,...,...,...,...,...
28453,,,,0,,,2.0,1,-1,-1,-1
28454,,,,0,,,2.0,1,1,-1,1
28455,,,,0,,,2.0,1,1,-1,1
28456,,,,0,,,2.0,1,1,-1,1


In [None]:
'''
So if we cut off our data at 2017 where the other data ends we lose abt 10k rows.
I say we keep it off for the case of policy changes and we just highlight in our paper that 
the HTI data only being up to 2017 constrained us to this year time frame.
Dr. Frank also advised me this would be best course of action due to policy changes being
a significant cause of trafficking influx.
'''
# Keep registrations up to and including 2017. A NaN yearOfRegistration fails the
# comparison, so any such row is removed by this filter as well.
data = data.loc[data['yearOfRegistration'].le(2017)]
data

Unnamed: 0,yearOfRegistration,citizenship,CountryOfExploitation,ageBroad,gender,Country,tier,internal,enforcement,protectprogress,victimservices
13,2015.0,USA,USA,0,Female,United States,1.0,1,2,1,-1
14,2015.0,USA,USA,0,Female,United States,1.0,1,2,1,-1
15,2015.0,USA,USA,0,Female,United States,1.0,1,2,1,-1
16,2015.0,USA,USA,0,Female,United States,1.0,1,2,1,-1
17,2015.0,USA,USA,0,Female,United States,1.0,1,2,1,-1
...,...,...,...,...,...,...,...,...,...,...,...
28343,2017.0,IDN,HKG,0,Female,Indonesia,2.0,1,1,1,-1
28344,2017.0,IDN,HKG,0,Female,Indonesia,2.0,1,1,1,-1
28345,2017.0,IDN,HKG,0,Female,Indonesia,2.0,1,1,1,-1
28346,2017.0,IDN,HKG,0,Female,Indonesia,2.0,1,1,1,-1


In [None]:
# Taking transgender out due to low frequency and fact of definition being different among countries
data = data[data['gender'] != 'Transgender/NonConforming']
# Converting numbers to be more readable for ML
age_conversion = {
    '0':0,
    '0--8':1,
    '09--17':2,
    '18--20':3,
    '21--23':4,
    '24--26':5,
    '27--29':6,
    '30--38':7,
    '39--47':8,
    '48+':9
}

gender_conversion = {
    'Male':0,
    'Female':1
}

for key in age_conversion:
    data = data.replace(key,age_conversion[key])

for key in gender_conversion:
    data = data.replace(key, gender_conversion[key])

In [None]:
# The country name was only needed as the join key to the HTI table; citizenship keeps the code.
data = data.drop(columns="Country")

In [None]:
# Display the encoded frame that the next cell writes to disk.
data

Unnamed: 0,yearOfRegistration,citizenship,CountryOfExploitation,ageBroad,gender,tier,internal,enforcement,protectprogress,victimservices
13,2015.0,USA,USA,0,1,1.0,1,2,1,-1
14,2015.0,USA,USA,0,1,1.0,1,2,1,-1
15,2015.0,USA,USA,0,1,1.0,1,2,1,-1
16,2015.0,USA,USA,0,1,1.0,1,2,1,-1
17,2015.0,USA,USA,0,1,1.0,1,2,1,-1
...,...,...,...,...,...,...,...,...,...,...
28343,2017.0,IDN,HKG,0,1,2.0,1,1,1,-1
28344,2017.0,IDN,HKG,0,1,2.0,1,1,1,-1
28345,2017.0,IDN,HKG,0,1,2.0,1,1,1,-1
28346,2017.0,IDN,HKG,0,1,2.0,1,1,1,-1


In [None]:
# Persist the cleaned frame; index=False leaves the DataFrame index out of the file.
data.to_csv('Trafficking_cleaned.csv', index=False)