In [1]:
import pandas as pd
import re

In [2]:
# Load the raw scrape results into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="google_scraped.csv")

In [3]:
def _get_labelled_field(all_text, label):
    """
    Return the value that accompanies `label` in a block of scraped text.

    The text is split on newlines and the first line containing `label` is
    used: every occurrence of `label` is removed from that line and the
    remainder is returned with surrounding whitespace stripped.

    Returns None when no line contains `label`, or when `all_text` is not
    string-like (for example a missing value such as None or NaN).
    """
    try:
        parts = all_text.split("\n")
    except (AttributeError, TypeError):
        # Only the "not a string" failures are swallowed; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return None

    for part in parts:
        if label in part:
            return part.replace(label, "").strip()
    return None


def get_address(all_text):
    """Return the text on the "Address:" line of `all_text`, or None if absent."""
    return _get_labelled_field(all_text, "Address:")


def get_opened(all_text):
    """Return the text on the "Opened:" line of `all_text`, or None if absent."""
    return _get_labelled_field(all_text, "Opened:")


def get_capacity(all_text):
    """Return the text on the "Capacity:" line of `all_text`, or None if absent."""
    return _get_labelled_field(all_text, "Capacity:")

def get_male_female(all_text):
    """
    Classify a description as "male", "female" and/or "yoi".

    The text is lower-cased and searched for whole words, so "mental" or
    "mention" no longer count as "men", and "Male ..." at the very start of
    the text or "(male)" inside brackets is now recognised.

    Returns the matching labels joined with ", " in the fixed order
    male, female, yoi. Returns "" when nothing matches or when `all_text`
    is not string-like (for example None or NaN).
    """
    # (label, pattern) pairs, checked in output order. \b anchors stop the
    # keywords matching inside longer words ("female" must not imply "male",
    # "women" must not imply "men").
    patterns = (
        ("male", r"\b(?:males?|mens?)\b"),
        ("female", r"\b(?:females?|womens?)\b"),
        ("yoi", r"\byois?\b|young offender"),
    )
    try:
        lowered = all_text.lower()
        response = [label for label, pattern in patterns if re.search(pattern, lowered)]
    except (AttributeError, TypeError):
        # Non-string input: nothing to classify.
        return ""
    return ", ".join(response)

def get_category(all_text):
    """
    Extract the prison category mentioned in a description.

    The text is lower-cased. "open" is reported when it contains
    "open prison" or "open category". The first "category X" mention is
    also reported, where X is a single character, a pair written "x/y", or
    a pair written "x & y" (spaces around "&" optional).

    The captured category carries no trailing space, and a category that is
    followed by punctuation or the end of the text ("category d.") is
    recognised.

    Returns the findings joined with ", ". Returns "" when nothing is found
    or when `all_text` is not string-like (for example None or NaN).
    """
    try:
        lowered = all_text.lower()
        cats = []

        if "open prison" in lowered or "open category" in lowered:
            cats.append("open")

        # A trailing \b (instead of a literal space inside the group) keeps
        # "category and ..." from matching while leaving the capture clean.
        found = re.search(r"category (\w/\w|\w ?& ?\w|\w)\b", lowered)
        if found:
            cats.append(found.group(1))
    except (AttributeError, TypeError):
        # Non-string input: no category can be read from it.
        return ""
    return ", ".join(cats)
    
def get_postcode(address_string):
    """
    Takes an address and returns the postcode, or None if no postcode is found.

    The address is upper-cased before matching, so the returned postcode is
    upper case and keeps whatever spacing the address used between its two
    halves. The pattern appears to target UK postcodes (it includes the
    special case "GIR 0AA"). Only the first match in the address is returned.

    None is also returned when `address_string` is not string-like (for
    example None or NaN).
    """
    pc_regex = "([A-PR-UWYZ]([1-9]([0-9]|[A-HJKSTUW])?|[A-HK-Y][1-9]([0-9]|[ABEHMNPRVWXY])?) *[0-9][ABD-HJLNP-UW-Z]{2}|GIR *0AA)"
    try:
        matches = re.search(pc_regex, address_string.upper())
    except (AttributeError, TypeError):
        # Only "not a string" failures are treated as "no postcode"; other
        # exceptions are allowed to surface instead of being silently dropped.
        return None

    return matches.group(1) if matches else None

In [4]:
# Each extractor reads the raw scraped text in the "data" column and its result
# is stored in the paired "gscraped_*" column.
text_extractors = [
    ("gscraped_address", get_address),
    ("gscraped_opened", get_opened),
    ("gscraped_capacity", get_capacity),
    ("gscraped_type", get_male_female),
    ("gscraped_cat", get_category),
]
for column_name, extractor in text_extractors:
    df[column_name] = df["data"].apply(extractor)

# The postcode is pulled out of the extracted address, not the raw text.
df["gscraped_postcode"] = df["gscraped_address"].apply(get_postcode)

In [5]:
# Get postcode and geocode :-)
import psycopg2

# Connection to the local PostGIS database holding the address table.
# The connection and cursor are left open because later cells may reuse them.
con_string = "host='localhost' dbname='postgres' user='postgres' password=''"
conn = psycopg2.connect(con_string)
cursor = conn.cursor()

# The postcode is passed as a bound parameter (psycopg2 "pyformat" style)
# instead of being formatted into the SQL text, so quotes or other odd
# characters in scraped data cannot break or alter the query.
sql = """
select
ST_X(ST_TRANSFORM(geom, 4326)) as lng,
ST_Y(ST_TRANSFORM(geom,4326)) as lat,
postcode
from all_addresses
where postcode = %(postcode)s
"""

# Create both output columns up front so they exist (as NaN) even when no
# postcode can be geocoded; this keeps the saved CSV schema stable.
df["gscraped_lat"] = float("nan")
df["gscraped_lng"] = float("nan")

# Rows without a postcode are skipped: there is nothing to look up, and the
# old string formatting would have searched for the literal text 'None'.
for index, postcode in df["gscraped_postcode"].dropna().items():
    pc = pd.read_sql(sql, conn, params={"postcode": postcode})

    if len(pc) > 0:
        # Several addresses can share a postcode; the first row returned is used.
        df.loc[index, "gscraped_lat"] = pc.loc[0, "lat"]
        df.loc[index, "gscraped_lng"] = pc.loc[0, "lng"]
        
        

In [6]:
# Give the two original columns names that carry a source prefix
# (gscraped_ / moj_), matching the derived columns.
column_renames = {"data": "gscraped_desc", "prison_name": "moj_prison_name"}
df = df.rename(columns=column_renames)

In [7]:
# Write the processed table to disk as UTF-8, without the DataFrame index.
df.to_csv(
    path_or_buf="google_scraped_processed.csv",
    encoding="utf-8",
    index=False,
)

In [9]:
# Show the rows for which no postcode could be extracted.
missing_postcode = df["gscraped_postcode"].isnull()
df[missing_postcode]

Unnamed: 0,moj_prison_name,gscraped_desc,gscraped_address,gscraped_opened,gscraped_capacity,gscraped_type,gscraped_cat,gscraped_postcode,gscraped_lat,gscraped_lng
2,Ashfield,HMP Ashfield is now a Category C adult sex off...,Pucklechurch,1999,400.0,male,c,,,
3,Ashwell,HM Prison Ashwell was a Category C men's priso...,,1955,,male,c,,,
9,Blantyre House,HM Prison Blantyre House is a Category C/D res...,Goudhurst,1954,,male,c/d,,,
10,Blundeston,HM Prison Blundeston was a Category C men's pr...,,1963,,male,c,,,
12,Bristol,"HMP Bristol is a Category B men's prison, loca...",,,,male,b,,,
17,Bullwood Hall,HM Prison Bullwood Hall is a former Category C...,,,,"male, yoi",c,,,
22,Channings Wood,HM Prison Channings Wood is a Category C men's...,,July 1974,,male,c,,,
37,Edmunds Hill,HM Prison Highpoint North is a Category C men'...,,2003 (1977),,male,c,,,
53,Grendon,HM Prison Grendon is a Category B men's prison...,,1962,,male,b,,,
73,Kingston,HM Prison Kingston is a former Category B/C me...,,1877,,male,b/c,,,
