# Total Drug Cases

*Started running at: 20:30*

## Config
This code imports libraries that are used in the script and defines the variables that will be used in the script, such as the API url, the maximum number of cases to retrieve, the directory locations to save the data and the date range to filter the cases. It also creates the necessary directories if they do not exist. Additionally, it sets options to avoid certain errors in the Pandas library, such as chained assignment and future warnings.

In [11]:
# %pip install lxml -q
# %pip install numpy -q
# %pip install folium -q
# %pip install geopy -q

In [6]:
import csv
import requests
import json
import re
import urllib.request
import os
import re
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import clear_output
import matplotlib.pyplot as plt
from threading import Thread
from datetime import datetime, timedelta
import zipfile
import io
import warnings
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words("dutch")
import time
import numpy as np

In [7]:
query = 'drugs handel'
base_verdicts_url = 'https://uitspraken.rechtspraak.nl/api/zoek?'
complete_case_url = 'https://data.rechtspraak.nl/uitspraken/content?id='
case_count = 15000 ## Amount of cases to retrieve. If you want max, just do an insanely high number, it stops when it fetched all cases. Backend does not accept values like 'max'.
location_initial_dataset = os.getcwd() + '/court case data/' 
location_cases = os.getcwd() + '/court case data/cases/' 
location_charts= os.getcwd() + '/charts/' 
cases_df = pd.DataFrame()
date_format = "%d-%m-%Y"
start_date = datetime.strptime('01-01-1905', date_format)
end_date = datetime.strptime('01-12-2022', date_format)


files = {
        "StartRow": 0,
        "PageSize": case_count,
        "ShouldReturnHighlights":'true',
        "ShouldCountFacets":'true',
        "SortOrder":"Relevance",
        "SearchTerms":[{"Term":query,"Field":"AlleVelden"}],
        "Contentsoorten":[],
        "Rechtsgebieden":[],
        "Instanties":[],
        "DatumPublicatie":[],
        "DatumUitspraak":[],
        "Advanced":{"PublicatieStatus":"AlleenGepubliceerd"},
        "CorrelationId":"9abc658b0ce64f8786992af6965aabc4",
        "Proceduresoorten":[]
    }

# create needed directories
directories = [location_initial_dataset, location_cases, location_charts]

for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        
# Avoiding some errors
pd.options.mode.chained_assignment = None 
warnings.simplefilter(action='ignore', category=FutureWarning)        

## Collecting Relevant Cases Using  Rechtspraak API

This script retrieves court case data from Recchtspraak API and stores the data in a csv file (initial_dataset.csv) and individual txt files which includes the verdict part of the cases. The script has 3 main functions: get_cases(), create_cases_df(), and save_case_text(). The get_cases() function makes a post request to an API with a specific configuration (see config part), filters it by date, and returns a list of cases. The create_cases_df() function takes the list of cases and creates a pandas data frame with specific columns, then saves the data frame to a csv file. The save_case_text() function iterates through the cases in the dataframe (initial_dataset.csv) and makes a get request to the API to retrieve additional details (case text) for each case, then parses and saves the details in individual txt files.

In [8]:
def get_cases(config):
    try:
        response = requests.post(base_verdicts_url, json=config)
        responseJSON = json.loads(response.text)
        results0 = responseJSON['Results']
        results = []
        for item in results0:
            print(item ["TitelEmphasis"].replace(":", "-"))
            if item ["TitelEmphasis"].replace(":", "-") and datetime.strptime(item['Publicatiedatum'], date_format) >= start_date <= end_date:
                results.append(item)
        print(len(results), "records found.")       
        return results
    except urllib.error.HTTPError as err:
        print(err)    

def create_cases_df(cases):
    # create the DataFrame with specified columns
    index = ["id", "verdict_date", "publication_date", "verdict_type", "jurisdiction_type", "inhoudsindicatie"]
    cases_df = pd.DataFrame(columns = index)
    # loop through the cases and append each case as a new row in the DataFrame
    for case in cases:
        curr_case = pd.Series([case["TitelEmphasis"].replace(":", "-"), case["Uitspraakdatum"], case["Publicatiedatum"], case["UitspraakdatumType"], case["Rechtsgebieden"], case["Tekstfragment"]], index = index)
        cases_df = cases_df.append(curr_case, ignore_index=True) 
    # save the DataFrame to a CSV file in the directory './court case data/'
    cases_df.to_csv(location_initial_dataset+'initial_dataset.csv', index=False)
    return cases_df    

def save_case_text(df):
    # Iterate through the cases in the dataframe
    for case in df["id"]:
        try:
            # Make a request to get the case details
            response = requests.get(complete_case_url + case.replace("-", ":"))
            soup = BeautifulSoup(response.text, features="xml")
            uitspraak_xml = None
            # Check if the uitspraak element exists, if not check for conclusie
            if soup.find('uitspraak') is not None:
                uitspraak_xml = soup.find('uitspraak')
            elif soup.find('uitspraak') is None and soup.find('conclusie') is not None:
                uitspraak_xml = soup.find('conclusie')
            # Get the text of the uitspraak or conclusie element and replace commas
            uitspraak = uitspraak_xml.get_text()
            uitspraak = uitspraak.replace(',', '')
            # Replace colons with dashes to use as file name
            parsed_id = case.replace(":", "-")
            # Open a file in the location_cases directory with the parsed_id as the name
            with open(location_cases + parsed_id + ".txt", "w+", encoding='utf-8') as f:
                # Write the uitspraak text to the file
                f.write(uitspraak)
        except Exception as e:
            print(f"Error in saving case {case}. {e}")
    print("Finished")
    
cases = get_cases(files)
cases_df = create_cases_df(cases)
save_case_text(cases_df)

ECLI-NL-RBDHA-2019-9384
ECLI-NL-RBMNE-2014-7289
ECLI-NL-RBUTR-2008-BG6691
ECLI-NL-RBNNE-2013-1693
ECLI-NL-RBAMS-2015-3865
ECLI-NL-RBROT-2014-9162
ECLI-NL-CRVB-2004-AO7398
ECLI-NL-RBNHO-2022-6623
ECLI-NL-GHAMS-2016-1885
ECLI-NL-CRVB-2010-BO3824
ECLI-NL-RBZWB-2015-6574
ECLI-NL-RBDHA-2015-1971
ECLI-NL-RBALM-2012-BV9811
ECLI-NL-CRVB-2006-AW7291
ECLI-NL-HR-2007-BA1113
ECLI-NL-PHR-2007-BA1113
ECLI-NL-GHAMS-2016-4386
ECLI-NL-RBAMS-2019-4296
ECLI-NL-RBMAA-2010-BM8942
ECLI-NL-RBMAA-2010-BM8944
ECLI-NL-RBMAA-2010-BM8946
ECLI-NL-RBLIM-2022-9412
ECLI-NL-RBUTR-2011-BQ0665
ECLI-NL-GHAMS-2019-585
ECLI-NL-RBUTR-2011-BV5349
ECLI-NL-RBUTR-2011-BV5471
ECLI-NL-RBROT-2021-10718
ECLI-NL-RBAMS-2018-2843
ECLI-NL-PHR-2003-AK3618
ECLI-NL-HR-2003-AK3618
ECLI-NL-CRVB-2004-AR3605
ECLI-NL-RBZWB-2022-5151
ECLI-NL-GHAMS-2018-4059
ECLI-NL-RBNNE-2019-167
ECLI-NL-GHAMS-2017-4817
ECLI-NL-RBAMS-2015-3864
ECLI-NL-CRVB-2010-BN3889
ECLI-NL-CRVB-2019-2745
ECLI-NL-GHSHE-2014-683
ECLI-NL-RBLIM-2022-9411
ECLI-NL-CRVB-2003-AO5312

Finished


In [9]:
def clean_string(dirty):
    
    clean_str = dirty.strip()
    clean_str = clean_str.lower()
    
    return clean_str

## Creating dataframe from extracted cases 
This code reads the text files in the directory location_cases and creates a data frame that contains the id of the file (case_id) and the text from the file (case text). It then merges this data frame with an existing data frame read from a CSV file(initial_dataset.csv). It then cleans the data frame by removing the dutch stopwords from the case text. It then calculates the total number of words in the uncleaned data frame and cleaned data frame, calculates the difference between the two, and prints the percentage decrease in the number of words.

In [10]:
dataPath = location_cases
if os.path.isfile(dataPath + ".DS_Store"):
    os.remove(dataPath + ".DS_Store")

verdict_df = pd.DataFrame(columns=["id", "case text"])

for filename in os.listdir(dataPath):
    id = filename.replace('.txt','')
    if id in verdict_df['id'].values:
        print(f"{id} already exists, skipping")
        continue
    with open(os.path.join(dataPath, filename), encoding='utf-8') as f:
        case_text = f.read()
        verdict_df = verdict_df.append({"id": id, "case text": case_text}, ignore_index=True)

cases_df = pd.read_csv(location_initial_dataset+'initial_dataset.csv')
merged_df = cases_df.join(verdict_df.set_index('id'), on='id', how='left')

merged_df["verdict_date"] = pd.to_datetime(merged_df["verdict_date"], dayfirst=True)
merged_df["publication_date"] = pd.to_datetime(merged_df["publication_date"], dayfirst=True)

total_len_uncleaned = 0
for i in range(len(merged_df)):
    currtext = merged_df.iloc[i]["case text"]
    try:
        length = len(currtext.split())
        total_len_uncleaned += length
    except:
        print('Please update the dataset')
        print(merged_df.iloc[i]["id"])
print(f"Total words in uncleaned dataset: {total_len_uncleaned}")

cleaned_df = merged_df.copy()
cleaned_df['case text'] = cleaned_df['case text'].apply(lambda x: ' '.join([word for word in x.split() if word not in dutch_stopwords]))
#print("Done")


total_len_cleaned = 0
for i in range(len(cleaned_df)):
    currtext = cleaned_df.iloc[i]["case text"]
    length = len(currtext.split())
    total_len_cleaned += length

print(f"Original count: {total_len_uncleaned} \n Cleaned count: {total_len_cleaned} \n Difference: {total_len_uncleaned - total_len_cleaned}")
print(f"{round((total_len_uncleaned - total_len_cleaned) / total_len_uncleaned * 100, 2)}% decrease")


Total words in uncleaned dataset: 81276005
Original count: 81276005 
 Cleaned count: 48692535 
 Difference: 32583470
40.09% decrease


## Creating list of cities in the Netherlands and common drug names
This code is used to create a list of cities in the Netherlands by extracting data from a geonames.org website. It then takes the names of these cities and appends them to a list. It also has a list of common drug names that are also used in the script. 

In [98]:
### RUN FROM HERE

In [99]:
# Create a list of cities in the Netherlands
url = "https://download.geonames.org/export/dump/NL.zip"
file_name = "NL.txt"

with zipfile.ZipFile(io.BytesIO(requests.get(url).content)) as archive:
    df = pd.read_csv(archive.open(file_name), sep='\t', header=None, names=['geonameid','name','asciiname','alternatenames','latitude','longitude','feature_class','feature_code','country_code','cc2','admin1_code','admin2_code','admin3_code','admin4_code','population','elevation','dem','timezone','modification_date'])

samall_cities = df[df['feature_code'] == 'PPL']['name'].tolist()
big_cities = df[(df['feature_code'].isin(['PPL','PPLA','PPLA2'])) & (df['population']>=100000)]['name'].tolist()
extra_cities = ['Leeuwarden','s-Gravenhage','Amsterdam', "Den Haag","Assen"]
cities = samall_cities + big_cities + extra_cities
#print(cities)

drugs = ["hennep", "wiet", "mdma", "xtc", "cocaïne", "cocaine", "heroïne", "heroine", "mefedron","4-mmc", "ketamine", "crack", "speed", "hasj",
"hasjiesj", "xtc-pillen", "amfetamine", "4-fluoramfetamine", "ghb", "gbl", "methamfetamine", "ecstasy","Paddo's", "Trimboline", "Poppers", "2CB", "Krokodil", "Fentanyl", "JWH-018", "MDPV", "Meow Meow", "N-Bomb", "Nitrieten", "GHB", "GBL", 
"Zoplicon","Zolpidem","Barbituraten","diamorfine","diazepam","captagon","isosafrol","benzylmethylketon","temazepam","codeine","PMK",
"ethylacetate","34-Methyleendioxymetamfetamine","apaen","psilocybine","MDMA-HCL","aceton","BMK.6","amfetamine.2",
"piperonal","mierenzuur","bmk-olie","psilocine","lidocaïne","methadon","MDMA-poeder","MDEA","manitol","amfetamine/metamfetamine","hash",
"MDMA-kristallen","4-fa","piperonylmethylKeton","hashish","amfetaminepasta","fenacetine","xtc-tabletten","crystal",
"MDMA-pillen","amfetamine-sulfaat","coffeïne","amfetamineolie","34-methyleendioxymethamfetamine","2c-b","meth","coffeïne",
"diacetylmorfine","mdea","4-fmp","benzylmorfine","codeïne","dexamfetamine","diamorfine","metamfetamine","methamfetamine","methamphetmine",
"metamphetamine","methylfenidaat","oxycodon","formic acid","tenamfetamine","piperonal","ethylacetaat","captagon",
"34-Methyleendioxymetamfetamine","formamide","isosafrol","bmk","pmk","aceton","amfetamine-olie",
"formamide","tenamfetamine","isosafrol","piperonal","mdma/xtc","ethylacetaat","captagon","34-Methyleendioxymetamfetamine","bmk-olie",
"formic acid","aceton","methanol"]


month_dict = {"januari": "january", "februari": "february", "maart": "march", "april": "april", "mei": "may", "juni": "june", "juli": "july",
"augustus": "august", "september": "september", "oktober": "october", "november": "november", "december": "december"}

## Analysing Court cases using Regular Expressions

It defines a number of regular expressions to search for specific information in the DataFrame Cleaned_df which include the informatioon about court cases. The code then uses a for loop to iterate over each row of the DataFrame and uses the regular expressions to extract information such as the crime date, birth year, location of the crime, and whether the crime is related to drugs or a criminal organization. The extracted information is then added as new columns to the DataFrame, and saved as final data frame. Additionally, it creates variables to count the number of age under 23, cases with location and drug cases.

In [100]:
# Create an empty DataFrame to store the filtered cases
trafficking_df = pd.DataFrame()

total_age_under_23 = 0
cases_with_location = 0
drug_cases = 0
start_time = time.time()

#### Define regular expressions for finding relevant information

# This pattern searches for the string "TENLASTELEGGING" followed by any number of characters (.*?) 
# then looks for a date in the format "dd Month yyyy" where "Month" can be any 3-9 character month name
crime_date_pattern = re.compile(r'tenlastelegging .*?([0-9]{1,2} [a-zA-Z]{3,9} [0-9]{4})\s', re.IGNORECASE| re.DOTALL)

# This pattern searches for the string "verdachte" followed by any number of characters (.*?) 
# then looks for 4 digits (\d{4})
birth_year_pattern = re.compile(r"verdachte.*?(\d{4})\s", re.IGNORECASE| re.DOTALL)

# This pattern searches for the word "rechtbank" followed by any number of whitespaces (\b\s*)
# and any non-whitespace characters (\S+)
rechtbank_pattern = re.compile(r'\brechtbank\b\s*(\S+)', re.IGNORECASE| re.DOTALL)

# This pattern searches for the word "zittingsplaats" followed by any number of whitespaces (\b\s*)
# and any non-whitespace characters (\S+)
zittingsplaats_pattern = re.compile(r'\bzittingsplaats\b\s*(\S+)', re.IGNORECASE| re.DOTALL)

# This pattern searches for terms such as "criminele organisatie"
criminal_organization_pattern = r'criminele organisatie|georganiseerde criminaliteit|georganiseerde misdaad|crimineel samenwerkingsverband|groepscriminaliteit'

# This pattern searches for the word "gerechtshof" followed by any number of whitespaces (\b\s*)
# and any non-whitespace characters (\S+)
gerechtshof_pattern = re.compile(r'\bgerechtshof\b\s*(\S+)', re.IGNORECASE| re.DOTALL)


# Iterate over rows
for i in range(len(cleaned_df)):
    curr = cleaned_df.iloc[i]
    uitspraak = curr["case text"]
    verdict_date = curr["verdict_date"]
    jurisdiction_type = curr["jurisdiction_type"]
    drug_found = False
    crime_location_found = False
    
    try:
        # Find crime date
        crime_date_match = crime_date_pattern.search(uitspraak)
        crime_date = crime_date_match.group(1)        
        
        # Convert the "crime_date" column to a datetime object and extract the month and year
        for key, value in month_dict.items(): crime_date = crime_date.replace(key, value)
        curr['crime_date'] = crime_date    
        curr["crime_date"] = pd.to_datetime(curr['crime_date'], format='%d %B %Y')
        curr["crime_month"] = curr["crime_date"].month_name()
        curr["crime_year"] = curr["crime_date"].year

        # Convert the "verdict_date" column to a datetime object and extract the month and year        
        curr["verdict_date"] = pd.to_datetime(curr['verdict_date'], format='%d %B %Y')
        curr["verdict_month"] = curr["verdict_date"].month_name()
        curr["verdict_year"] = curr["verdict_date"].year        
        
        # Find birth year
        birth_year_match = birth_year_pattern.search(uitspraak)
        birth_year = birth_year_match.group(1)
        
        #calculates the age of the suspect    
        age = int(curr["crime_year"]) - int(birth_year)
          
        if age >= 10 and age <= 23:
            total_age_under_23 += 1
            
        if not (age >= 10 and age <= 23):
            age = 0
            
        curr["age"] = age 
        
        #Find existence of criminal organization in case text
        organization_search = re.search(criminal_organization_pattern, uitspraak, flags=re.IGNORECASE)
        if organization_search:
            curr["criminal_organization"] = 1
        else:
            curr["criminal_organization"] = 0        
        
        
        # Find rechtbank
        rechtbank_match = rechtbank_pattern.search(uitspraak)
        if rechtbank_match:
            rechtbank = rechtbank_match.group(1)
            if rechtbank.lower().startswith('den'):
                rechtbank = 'Den Haag'
            curr['rechtbank'] = rechtbank  
             
            
        # Find zittingsplaats
        zittingsplaats_match = zittingsplaats_pattern.search(uitspraak)
        if zittingsplaats_match:
            zittingsplaats = zittingsplaats_match.group(1)
            if zittingsplaats in cities: # check if location is in the list of cities
                curr['zittingsplaats'] = zittingsplaats
                
                
        # Find gerechtshof
        gerechtshof_match = gerechtshof_pattern.search(uitspraak)        
        if gerechtshof_match:
            curr['hoger_beroep'] = 1
            gerechtshof = gerechtshof_match.group(1) 
            if gerechtshof.lower().startswith('den'):
                gerechtshof = 'Den Haag'            
            curr['gerechtshof'] = gerechtshof            
        else:
            curr['hoger_beroep'] = 0                 
            
        
        # Find crime location
        for crime_location in cities:
            crime_location_search = re.match(crime_location, uitspraak[crime_date_match.end():])
            if crime_location_search and not crime_location_found:
                curr["crime_location"] = crime_location
                crime_location_found = True
                cases_with_location += 1                
                break       
                 
        # Find drug type in cases
        for drug in drugs:
            drug_search = re.search(drug, uitspraak, flags=re.IGNORECASE)
            if drug_search and not drug_found:
                curr["drug_type"] = drug
                drug_found = True
                drug_cases += 1
                break
                
        # Cases met more than one drug type                
        found_drugs = []
        for drug in drugs:
            drug_search = re.search(drug, uitspraak, flags=re.IGNORECASE)
            if drug_search:
                found_drugs.append(drug)
        curr["drugs"] = found_drugs                    
        curr["case text"] = curr["case text"][:5000] # truncate to 5000 characters
        trafficking_df = trafficking_df.append(curr)
        
    except:
       
        pass

In [101]:
end_time = time.time()
duration = end_time - start_time
duration = timedelta(seconds=duration)
duration = timedelta(seconds=round(duration.total_seconds()))

print(f"Total cases used for the filtering: {len(cleaned_df)}")
print(f"Total cases filterd:  {len(trafficking_df)} cases.")
print("Total cases that has crime location: ", cases_with_location) 
print(f"Total cases that has drug: {drug_cases}")
print(f"Total cases with suspects aged under 23: {total_age_under_23}")
print(f"Duration: ", duration.seconds//60, "minutes and", duration.seconds%60, "seconds")

Total cases used for the filtering: 14676
Total cases filterd:  10171 cases.
Total cases that has crime location:  2602
Total cases that has drug: 7969
Total cases with suspects aged under 23: 990
Duration:  33 minutes and 26 seconds


## Creating the final data set
This code performs several operations on the data frame. First, it removes cases (rows) that has no drugs. Next, it is using the drop() function to remove the 'case text', 'inhoudsindicatie', and 'drugs' columns from the dataframe. Then, it is using the replace() method to replace all occurrences of the strings "wiet", "Hennep", and "hennep/wiet" with the string "hennep" in the "drug_type" column. 

In [102]:
final_df = trafficking_df.copy()

In [103]:
### 
### If there is empty age value, flag minor as 0

final_df['age'] = final_df['age'].replace(0.0, pd.np.nan)
final_df['minor'] = final_df['age'].apply(lambda x: 1 if x < 18 else 0)

In [104]:
final_df['hoger_beroep'] = final_df['hoger_beroep'].astype(int)
final_df['criminal_organization'] = final_df['criminal_organization'].astype(int)

In [105]:
### Drop irrelevant columns and rename column for visualizations

cols_to_drop = ['crime_month', 'crime_year', 'verdict_month', 'verdict_year', 'drug_type']
final_df.drop(columns=cols_to_drop, inplace=True) 
final_df = final_df.rename(columns={'drugs':'drug_type'})
final_df = final_df.rename(columns={'hoger_beroep':'appeal'})

In [106]:
### The date columns were not correctly extracted
### So lets extract them correctly

final_df['crime_date_month'] = pd.to_datetime(final_df['crime_date'], errors='coerce')
final_df['crime_date_month'] = final_df['crime_date_month'].dt.month
final_df['crime_date_year'] = pd.to_datetime(final_df['crime_date'], format='%d-%m-%Y').dt.year

final_df['verdict_date_month'] = pd.to_datetime(final_df['verdict_date'], errors='coerce')
final_df['verdict_date_month'] = final_df['verdict_date'].dt.month
final_df['verdict_date_year'] = pd.to_datetime(final_df['verdict_date'], format='%d-%m-%Y').dt.year

In [107]:
### Rename crime_date_month column to categorical values
### Rename verdict_date_month column to categorical values

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
final_df['crime_date_month'] = final_df['crime_date_month'].map(lambda x: months[x-1])
final_df['verdict_date_month'] = final_df['verdict_date_month'].map(lambda x: months[x-1])

In [108]:
final_df = final_df.reset_index(drop=True)
cols = [
    'id', 
    'jurisdiction_type', 
    'rechtbank',
    'gerechtshof', 
    'appeal',
    'crime_location', 
    'verdict_date_year', 
    'verdict_date_month',
    'crime_date_month', 
    'crime_date_year', 
    'age', 
    'minor',
    'criminal_organization', 
    'drug_type'
]
final_df = final_df.reindex(columns=cols)

In [109]:
### Transform to lower case

final_df['drug_type'] = final_df['drug_type'].apply(lambda x: [i.lower() for i in x])
columns_to_transform = ['rechtbank', 'gerechtshof', 'crime_location', 'verdict_date_month', 'crime_date_month']
for col in columns_to_transform:
    final_df[col] = final_df[col].str.lower()

In [125]:
### Function to remove synonyms
### At the drug list above, we also dropped some irrelevant drug types like 'paracetemol', 'pillen', 'pil', 'caffeine' etc.

def standardize_drugs(df, column_name):
    replace_list = {
        "cocaïne":"cocaine",
        "heroïne":"heroine",
        "metamfetamine":"amfetamines/metamfetamines",
        "methamfetamine":"amfetamines/metamfetamines",
        "methamphetmine":"amfetamines/metamfetamines",
        "crystal":"amfetamines/metamfetamines",
        "speed":"amfetamines/metamfetamines",
        "meth":"amfetamines/metamfetamines",
        "methylfenidaat":"amfetamines/metamfetamines",
        "metamphetamine":"amfetamines/metamfetamines",
        "amfetamine":"amfetamines/metamfetamines",
        "metamphetamine":"amfetamines/metamfetamines",
        "amfetamine-sulfaat":"amfetamines/metamfetamines",
        "amfetamine.2":"amfetamines/metamfetamines",
        "amfetamine.7":"amfetamines/metamfetamines",
        "amfetamine-olie":"amfetamines/metamfetamines",
        "amfetamine/metamfetamine":"amfetamines/metamfetamines",
        "meth":"amfetamines/metamfetamines",
        "amfetamineolie":"amfetamines/metamfetamines",
        "amfetaminepasta":"amfetamines/metamfetamines",
        "34-methyleendioxymetamfetamine":"amfetamines/metamfetamines",
        "34-methyleendioxymethamfetamine":"amfetamines/metamfetamines",
        "mdma-pillen":"mdma",
        "mdma-poeder":"mdma",
        "mdma-kristallen":"mdma",
        "mdma-poeder":"mdma",
        "xtc-pillen":"xtc",
        "xtc-tabletten":"xtc",
        "ecstasy":"xtc",
        "ecstasy-pillen":"xtc",
        "codeïne":"codeine",
        "bmk.6":"bmk",
        "bmk-olie":"bmk",
        "bmk-olie.6":"bmk",
        "hasj":"hasjiesj",
        "hashish":"hasjiesj",
        "hash":"hasjiesj",
        "wiet":"hennep"
        
    }
    df[column_name] = df[column_name].apply(lambda x: list(set([replace_list.get(i, i) for i in x])))
    return df

In [126]:
### Apply function to standardize drugs
final_df = standardize_drugs(final_df, 'drug_type')

### Drop cases that have an empty drugs list
final_df = final_df[final_df['drug_type'].apply(lambda x: len(x) > 0)].reset_index(drop=True)

In [128]:
### Check for duplicates

final_df.groupby('id').size().sort_values(ascending=False).reset_index(name='counts').head()

Unnamed: 0,id,counts
0,ECLI-NL-CBB-2018-187,1
1,ECLI-NL-RBNNE-2016-2297,1
2,ECLI-NL-RBNNE-2016-2847,1
3,ECLI-NL-RBNNE-2016-2747,1
4,ECLI-NL-RBNNE-2016-2654,1


In [129]:
final_df

Unnamed: 0,id,jurisdiction_type,rechtbank,gerechtshof,appeal,crime_location,verdict_date_year,verdict_date_month,crime_date_month,crime_date_year,age,minor,criminal_organization,drug_type
0,ECLI-NL-RBDHA-2019-9384,['Strafrecht'],den haag,,0,,2019,jun,apr,2019,18.0,0,0,"[cocaine, heroine]"
1,ECLI-NL-RBMNE-2014-7289,['Strafrecht; Materieel strafrecht'],midden-nederland,,0,,2014,apr,jan,2012,,0,0,"[krokodil, hennep, 2c-b, mdma, amfetamines/met..."
2,ECLI-NL-RBUTR-2008-BG6691,['Strafrecht'],utrecht,,0,,2008,sep,oct,2007,,0,0,"[cocaine, mdma, xtc]"
3,ECLI-NL-RBNNE-2013-1693,['Strafrecht'],noord-nederland,,0,,2013,feb,sep,2012,,0,0,"[cocaine, aceton]"
4,ECLI-NL-RBAMS-2015-3865,['Strafrecht'],amsterdam,,0,,2015,may,dec,2012,,0,0,"[cocaine, mdma]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7964,ECLI-NL-RBDHA-2016-5209,['Strafrecht'],den haag,den haag,1,,2016,may,apr,2014,20.0,0,1,[hasjiesj]
7965,ECLI-NL-RBNHO-2013-10924,['Strafrecht'],noord-holland,amsterdam,1,,2013,nov,mar,2012,,0,1,"[cocaine, amfetamines/metamfetamines, krokodil]"
7966,ECLI-NL-RBMNE-2021-221,['Strafrecht'],midden-nederland,,0,,2021,jan,dec,2019,,0,0,"[poppers, amfetamines/metamfetamines]"
7967,ECLI-NL-RBAMS-2013-BZ0392,['Strafrecht'],amsterdam,amsterdam,1,,2013,jan,feb,2009,,0,1,[amfetamines/metamfetamines]


### Explode dataframe

In [132]:
### Explode and check unique drug types

final_df_exploded = final_df.explode('drug_type').reset_index(drop=True)
print(final_df_exploded['drug_type'].unique())

['cocaine' 'heroine' 'krokodil' 'hennep' '2c-b' 'mdma'
 'amfetamines/metamfetamines' 'methadon' 'xtc' 'mefedron' 'aceton'
 'diacetylmorfine' 'crack' 'hasjiesj' 'tenamfetamine' 'mdea' 'fenacetine'
 'methanol' 'coffeïne' 'ketamine' 'ghb' 'gbl' 'fentanyl'
 'benzylmethylketon' 'bmk' 'mierenzuur' 'formamide' 'pmk' '4-fa'
 '4-fluoramfetamine' 'formic acid' 'manitol' 'isosafrol' 'diazepam' '2cb'
 'temazepam' 'mdma/xtc' 'ethylacetaat' 'ethylacetate'
 'piperonylmethylketon' 'lidocaïne' '4-mmc' 'piperonal' 'mdma-hcl'
 'codeine' 'oxycodon' 'dexamfetamine' 'zolpidem' 'psilocine' 'psilocybine'
 'poppers' 'captagon' 'barbituraten' 'zoplicon' 'diamorfine' '4-fmp'
 "paddo's"]


### Import juvenile cases

In [115]:
df_jeugdstrafrecht = pd.read_csv("final_df_unique.csv")
df_jeugdstrafrecht = df_jeugdstrafrecht[['id', 'jurisdiction_type']]
df_jeugdstrafrecht                 

Unnamed: 0,id,jurisdiction_type
0,ECLI-NL-RBDHA-2019-9384,['Strafrecht']
1,ECLI-NL-GHAMS-2020-1930,['Strafrecht']
2,ECLI-NL-GHAMS-2022-115,['Strafrecht']
3,ECLI-NL-RBNNE-2023-118,['Strafrecht']
4,ECLI-NL-RBROT-2019-4815,['Strafrecht']
...,...,...
156,ECLI-NL-RBMNE-2020-4495,['Strafrecht']
157,ECLI-NL-RBLIM-2017-3317,['Strafrecht']
158,ECLI-NL-RBOVE-2021-4198,['Strafrecht']
159,ECLI-NL-RBMNE-2020-4872,['Strafrecht']


In [116]:
df_jeugdstrafrecht['jurisdiction_type'].isnull().sum()

0

### Join dataframes

In [118]:
### Left join total cases with juvenile cases
### Empty values are not juvenile cases

df_merged = final_df.merge(df_jeugdstrafrecht, on="id", how="left")

In [119]:
df_merged['jeugdstrafrecht'] = df_merged.apply(lambda x: 1 if pd.notnull(x['jurisdiction_type_y']) else 0, axis=1)

In [120]:
df_merged

Unnamed: 0,id,jurisdiction_type_x,rechtbank,gerechtshof,appeal,crime_location,verdict_date_year,verdict_date_month,crime_date_month,crime_date_year,age,minor,criminal_organization,drug_type,jurisdiction_type_y,jeugdstrafrecht
0,ECLI-NL-RBDHA-2019-9384,['Strafrecht'],den haag,,0,,2019,jun,apr,2019,18.0,0,0,"[cocaine, heroine]",['Strafrecht'],1
1,ECLI-NL-RBMNE-2014-7289,['Strafrecht; Materieel strafrecht'],midden-nederland,,0,,2014,apr,jan,2012,,0,0,"[krokodil, hennep, 2c-b, mdma, amfetamines/met...",,0
2,ECLI-NL-RBUTR-2008-BG6691,['Strafrecht'],utrecht,,0,,2008,sep,oct,2007,,0,0,"[cocaine, mdma, xtc]",,0
3,ECLI-NL-RBNNE-2013-1693,['Strafrecht'],noord-nederland,,0,,2013,feb,sep,2012,,0,0,"[cocaine, aceton]",,0
4,ECLI-NL-RBAMS-2015-3865,['Strafrecht'],amsterdam,,0,,2015,may,dec,2012,,0,0,"[cocaine, mdma]",,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7964,ECLI-NL-RBDHA-2016-5209,['Strafrecht'],den haag,den haag,1,,2016,may,apr,2014,20.0,0,1,[hasjiesj],,0
7965,ECLI-NL-RBNHO-2013-10924,['Strafrecht'],noord-holland,amsterdam,1,,2013,nov,mar,2012,,0,1,"[cocaine, amfetamines/metamfetamines, krokodil]",,0
7966,ECLI-NL-RBMNE-2021-221,['Strafrecht'],midden-nederland,,0,,2021,jan,dec,2019,,0,0,"[poppers, amfetamines/metamfetamines]",,0
7967,ECLI-NL-RBAMS-2013-BZ0392,['Strafrecht'],amsterdam,amsterdam,1,,2013,jan,feb,2009,,0,1,[amfetamines/metamfetamines],,0


In [121]:
df_merged['jeugdstrafrecht'].value_counts()

0    7808
1     161
Name: jeugdstrafrecht, dtype: int64

In [122]:
df_merged.to_csv('df_merged.csv', index=False)