# Creating predictions for identifying restricted access properties in Zoopla data
### What are restricted access properties?
- Restricted access properties are properties such as secure access flats or gated communities
- These are either recorded inconsistently in other data sources or not recorded at all
- Identifying them will improve the Address Register
- It will also help to make field work more efficient if enumerators know they will have difficulty with access

#### Step 1: Import packages/ data

In [None]:
import pandas as pd
import numpy as np
import pickle

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm, grid_search
import time
import pickle
from bs4 import BeautifulSoup

Below we open the 'pickles' for the classifier we built earlier and for our training data, X and y.
- A pickle preserves data and objects from your code so that they can be used elsewhere, which is handy because we do not have to re-create the classifier, etc.

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can run arbitrary code, so only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The pickled version of our chosen classifier
trigram_LR = _load_pickle('trigram_LR.pickle')
# Our training data for X
X_train = _load_pickle('X.pickle')
# Our training data for y
y_train = _load_pickle('y.pickle')

Import sets of Zoopla data. We used WhenFresh data for training so we want to remove these records from the Urban Big Data Centre data we are using for predictions. Therefore we'll import both.

In [None]:
# Only the two identifier columns are read from the linked WhenFresh file
wf = pd.read_csv(
    'whenfresh_data_linked.csv',
    usecols=['udprn', 'id'],
    encoding='latin1',
)

Here we create a list of the unique ids in the WhenFresh data

In [None]:
# Distinct values of the 'id' column, as a plain Python list
uniq__value_list = wf['id'].unique().tolist()

Below, we use this list to remove any unique ids shared between UBDC and WF. We'll output to a file as this can be used in the caravan work strand.

In [None]:
# Split the UBDC file, chunk by chunk, into the rows whose listing_id appears
# in the WhenFresh id list (UBDC_and_wf.csv) and the rows whose listing_id
# does not (UBDC_not_wf.csv).
#
# NOTE: both output files are opened in append mode, so remove any output left
# over from a previous run before re-running this cell.
chunksize = 10**5
chunks = 0
start = time.time()
UBDC = "UBDC.csv"
cols = ['description', 'listing_id', 'unique_id', 'UPRN', 'LATITUDE', 'LONGITUDE']
# Ensures that only the header for the first chunk is kept (as all the chunks
# are appended to the same file)
writeHeader = True
# START: For each chunk in zoopla data:
for chunk in pd.read_csv(UBDC, encoding='latin1', chunksize=chunksize, usecols=cols):
    # increment count
    chunks += 1

    # Work out the membership mask once and use it for both halves of the split
    in_wf = chunk['listing_id'].isin(uniq__value_list)
    df_yes = chunk[in_wf]
    df_no = chunk[~in_wf]

    # Write to csv; the header goes out with the first chunk only, for BOTH
    # files (previously UBDC_and_wf.csv received a header row on every chunk)
    df_no.to_csv('UBDC_not_wf.csv', mode='a', header=writeHeader, index=False)
    df_yes.to_csv('UBDC_and_wf.csv', mode='a', header=writeHeader, index=False)
    writeHeader = False

print("\n ****** \n Chunks Processed: \t {}\n Time elapsed (Minutes):  \t{}\n ****** \n".format(chunks, (time.time() - start)//60))

### Define functions to get text in right format

In [None]:
# Define stemmer
porter_stemmer = PorterStemmer()

# Load the English stop words once, up front, rather than on every call.
# They are held in a set so that each "is this word a stop word?" check is a
# hash lookup instead of a scan through the whole list.
cachedStopWords = set(stopwords.words("english"))


def removeStopWords(input):
    """Drop stop words, then punctuation, from a piece of text.

    The text is split on whitespace and every token found in
    ``cachedStopWords`` is discarded (the comparison is exact, so it is
    case-sensitive and happens before punctuation is removed). The surviving
    tokens are re-joined with single spaces and every character in
    ``string.punctuation`` is then deleted.

    Returns a one-element ``pd.Series`` whose ``output`` entry holds the
    cleaned text.
    """
    kept_words = []
    for token in input.split():
        if token in cachedStopWords:
            continue
        kept_words.append(token)
    without_stop_words = ' '.join(kept_words)

    # Delete all punctuation characters in a single pass
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_stop_words.translate(punctuation_table)

    return pd.Series({'output': cleaned})

  
def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or above removed."""
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_chars)

def remove_non_ascii_df(text):
    """Strip characters with code point 128 or above from ``text``.

    Returns a one-element ``pd.Series`` whose ``output`` entry holds the
    remaining text.
    """
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return pd.Series({'output': ''.join(ascii_chars)})

def stripHTML(input):
    """Parse ``input`` with BeautifulSoup (lxml parser) and keep only its text.

    Returns a one-element ``pd.Series`` whose ``output`` entry holds the
    extracted text.
    """
    soup = BeautifulSoup(input, "lxml")
    return pd.Series({'output': soup.text})

### Perform machine learning predictions using our model

In [None]:
# First we fit our model
# (the unpickled classifier is fitted again here on the unpickled X_train / y_train)
trigram_LR.fit(X_train, y_train)

# Define sizes of chunks, iterate over the data to make predictions, structure data and output to csv 
chunksize = 5000
chunks = 0
start = time.time() 
ubdc = 'UBDC_not_wf.csv'
# 'UPRN' is read here but is not included in the output frame written below
cols = ['description', 'unique_id','UPRN', 'LATITUDE', 'LONGITUDE']
# Ensures that only the header for the first chunk is kept (as all the files will be appended)
writeHeader = True
# START: For each chunk in zoopla data:        
for chunk in pd.read_csv(ubdc, chunksize=chunksize, usecols=cols):
    #increment count
    chunks +=1
    
    print("chunk # ",chunks," minutes past since starting: ",(time.time() - start)//60)
    # Clean the description text. Lines starting with %time use the IPython
    # line magic that reports how long the statement took, so this cell only
    # runs under IPython/Jupyter.
    # Order of the steps: drop non-ASCII characters -> drop stop words and
    # punctuation -> strip HTML -> Porter-stem each whitespace-separated token.
    # NOTE(review): removeStopWords deletes punctuation (which includes '<' and
    # '>') before stripHTML runs, so presumably no tags are left for
    # BeautifulSoup to remove - confirm this matches how the training data was
    # prepared before changing the order.
    # NOTE(review): removeStopWords and stripHTML each return a one-element
    # pd.Series rather than a string - confirm that assigning the result of
    # .apply back to the column behaves as intended on the pandas version in use.
    %time chunk['description']  = chunk['description'].astype(str).apply(lambda x : remove_non_ascii(x))
    %time chunk['description']  = chunk['description'].apply(lambda x : removeStopWords(x))
    %time chunk['description']  = chunk['description'].apply(stripHTML)
    %time chunk['description']  = chunk['description'].apply(lambda x: ' '.join([porter_stemmer.stem(y) for y in x.split()]))
    
    print("Start predictions")
    # X is the cleaned description text as a unicode array
    %time X=chunk['description'].values.astype('U')
    %time predictions= trigram_LR.predict(X)
    # Keep column 1 of predict_proba as the reported probability
    %time y_pred_prob = trigram_LR.predict_proba(X)[:,1]
    # One output row per input row: cleaned text, prediction, probability, id and coordinates
    %time X_LR_output = pd.DataFrame({'X' : X, 'predictions':predictions, 'pred_prob' :y_pred_prob, 'unique_id' :chunk['unique_id'],'lat':chunk['LATITUDE'], 'long':chunk['LONGITUDE']})     
    
    # 3. Write to csv and tell whether or not to include the header
    # (the file is opened in append mode, so output from an earlier run is kept)
    if writeHeader is True:    
        X_LR_output.to_csv('UBDC_predictions.csv', mode='a', header=True, index=False)
        writeHeader = False
    else:
        X_LR_output.to_csv('UBDC_predictions.csv', mode='a', header=False, index=False)
