# Using textual descriptions, find restricted access properties within Zoopla data
#### What are restricted access properties?
- Restricted access properties are properties such as secure access flats or gated communities
- These are either inconsistently recorded in other data sources or not at all
- Identifying them will improve the Address Register
- It will also help to make field work more efficient if enumerators know they will have difficulty with access


### Import relevant packages

In [None]:
import pandas as pd
from pandas import Timestamp
import statsmodels
from operator import itemgetter
import string
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score,classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk import bigrams
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle

### Create pipelines for each classifier (to be used further down)
- Classifiers include: Logistic Regression, Support Vector Machines, Naive Bayes, Stochastic Gradient Descent, Random Forests and Neural Networks

In [None]:
def _ngram_counts(max_n=3):
    """Return a fresh word n-gram CountVectorizer (1..max_n grams, NLTK tokens).

    A new instance is built on every call so that no two pipelines share a
    fitted vocabulary.
    """
    return CountVectorizer(analyzer="word",
                           ngram_range=(1, max_n),
                           tokenizer=word_tokenize,
                           max_features=10000)


# 1) LOGISTIC REGRESSION: TF-IDF weighted 1-3 grams
LR_trigram_clf = Pipeline([
    ('vectorizer', _ngram_counts()),
    ('transformer', TfidfTransformer()),
    ('classifier', LogisticRegression(C=2.0, penalty='l2')),
])

# 2) SUPPORT VECTOR MACHINES: raw 1-3 gram counts
svm_trigram_clf = Pipeline([
    ('vectorizer', _ngram_counts()),
    ('classifier', LinearSVC()),
])

# 3) NAIVE BAYES: raw 1-3 gram counts
NB_trigram_clf = Pipeline([
    ('vectorizer', _ngram_counts()),
    ('classifier', MultinomialNB()),
])

# 4) STOCHASTIC GRADIENT DESCENT (linear classifier): raw 1-3 gram counts
SGD_trigram_clf = Pipeline([
    ('vectorizer', _ngram_counts()),
    ('classifier', SGDClassifier()),
])

# 5) RANDOM FORESTS: TF-IDF weighted 1-2 grams (note: bigrams, not trigrams)
random_clf = Pipeline([
    ('vectorizer', _ngram_counts(max_n=2)),
    ('transformer', TfidfTransformer()),
    ('classifier', RandomForestClassifier()),
])

# 6) NEURAL NETWORKS: TF-IDF weighted 1-3 grams into a single 10-unit hidden layer
_mlp = MLPClassifier(learning_rate_init=0.01,
                     hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,
                     early_stopping=True, validation_fraction=0.05, alpha=1e-10)
NN_trigram_clf = Pipeline([
    ('vectorizer', _ngram_counts()),
    ('transformer', TfidfTransformer()),
    ('classifier', _mlp),
])

### Create a function to show what the most informative features of restricted access properties are for each classifier
- This will display the top 20 words

In [None]:
def show_most_informative_features(model, text=None, n=20):
    """
    Describe the n highest- and n lowest-weighted features of a fitted model.

    Parameters
    ----------
    model : Pipeline-like
        Must expose ``named_steps['vectorizer']``, ``named_steps['classifier']``
        and ``steps``; ``predict`` is only needed when ``text`` is given.
    text : str, optional
        If given, the feature values of this single document (after every
        pipeline step except the final classifier) are ranked instead of the
        classifier weights, and the predicted class is included in the output.
    n : int
        Number of features to show in each column.

    Returns
    -------
    str
        One line per rank: highest-valued feature on the left, lowest on the
        right.

    Raises
    ------
    TypeError
        If the classifier exposes neither ``coef_`` nor ``feature_log_prob_``.
    """
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if hasattr(classifier, 'coef_'):
        weights = classifier.coef_
    elif hasattr(classifier, 'feature_log_prob_'):
        # Recent scikit-learn naive Bayes models no longer expose coef_;
        # rebuild the same view from the per-class feature log-probabilities
        # (positive-class row only for a binary problem).
        log_prob = classifier.feature_log_prob_
        weights = log_prob[1:] if len(log_prob) == 2 else log_prob
    else:
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # A pipeline ending in a classifier has no usable transform(), so run
        # the document through every step except the final estimator by hand.
        features = [text]
        for _, step in model.steps[:-1]:
            if step is None or isinstance(step, str):  # None / 'passthrough'
                continue
            features = step.transform(features)
        tvec = features.toarray() if hasattr(features, 'toarray') else features
    else:
        # Otherwise simply use the model weights
        tvec = weights

    # get_feature_names() was removed from scikit-learn; prefer its successor
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Zip the values with the feature names and sort, largest value first
    coefs = sorted(
        zip(tvec[0], feature_names),
        key=itemgetter(0), reverse=True
    )

    # Pair the i-th largest with the i-th smallest
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(model.predict([text])))
        output.append("")

    # Two columns: most positive features (left), most negative (right).
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(cp, fnp, cn, fnn)
        )

    return "\n".join(output)

### Read in the data and define labels
- Import csvs and clerical review data

In [None]:
# Load the cleaned listings extract (the file is latin-1 encoded).
wf = pd.read_csv('clean_wf.csv', encoding='latin1')

Get rid of listings with missing descriptions (cannot use these)

In [None]:
# Keep only rows whose parsed description is not the 'missing' placeholder.
_has_description = wf['description_parsed'].ne('missing')
wf = wf[_has_description]

Creating a subset of records that are not restricted access in order to even up the training set. To do this, we negated search terms you'd expect to generate restricted access and created a sample (which was then checked to see if there was anything unexpected).

In [None]:
# Phrases expected to signal restricted access; listings matching any of them
# are excluded so the remainder can serve as candidate negatives.
_RESTRICTED_TERMS = [
    'secure access',
    'gated community',
    'concierge',
    'development is gated',
    'gated access',
    'gated development',
    'gated cul-de-sac',
    'private gated mews',
]
_mentions_restricted = wf.description_parsed.str.contains('|'.join(_RESTRICTED_TERMS))
wflabelfalse = wf[~_mentions_restricted]

In [None]:
# Draw the negative examples with a fixed seed so that the training set (and
# every score reported below) is reproducible between notebook runs.
wfsamplefalse = wflabelfalse.sample(n=1200, random_state=0)

In [None]:
# Give these a label of 0 for false, we'll import true values further down.
# assign() returns a new frame, so the column is never written into an object
# derived from `wf` (avoids pandas' SettingWithCopy hazard).
wfsamplefalse = wfsamplefalse.assign(label=0)

In [None]:
# Keep just the text and its label for the negative examples.
_training_columns = ['description_parsed', 'label']
df1 = wfsamplefalse[_training_columns]

In [None]:
# Write the sampled negative examples (index included) to df1.csv.
df1.to_csv('df1.csv')

Spot checks on the descriptions

In [None]:
# Truncate displayed text columns at 50 characters, then show one sampled
# description (the row at position 111) as a spot check.
pd.set_option("display.max_colwidth", 50)
df1['description_parsed'].iloc[111:112]

Read in records from the wf data that have been clerically reviewed

In [None]:
# Clerically reviewed records (latin-1 encoded file).
df2 = pd.read_csv('clerical_restricted.csv', encoding='latin1')

Merge dataframes to give a balanced set ready for analysis

In [None]:
# Stack the sampled negatives on top of the clerically reviewed records.
# ignore_index=True gives the result a fresh 0..n-1 index; df1 keeps row labels
# from `wf` while df2 has its own, so keeping both would leave duplicate labels
# that make later label-based alignment ambiguous.
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)
df.shape

In [None]:
# Work on a copy of the description column so the original stays untouched.
df['text'] = df['description_parsed'].copy()

In [None]:
from nltk.corpus import stopwords

# Built once up front: the previous version called stopwords.words("english")
# for every single token, reloading the list each time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def removeStopWords(input):
    """Drop English stop words, then strip ASCII punctuation from a description.

    Matching against the stop-word list is case-sensitive (tokens are compared
    as-is). Stop words are removed before punctuation, so a token such as
    "the," is kept (minus the comma).

    Parameters
    ----------
    input : str
        Free-text description.

    Returns
    -------
    pandas.Series
        One-element Series with key ``'output'`` holding the cleaned text
        (return shape kept for backward compatibility).
    """
    kept = ' '.join(word for word in input.split()
                    if word not in _ENGLISH_STOPWORDS)
    output = kept.translate(_STRIP_PUNCTUATION)

    return pd.Series(dict(output=output))

df['ml_text'] = df['text'].apply(lambda x: removeStopWords(x))

In [None]:
def remove_non_ascii (text):
    """Return `text` with every character whose code point is 128 or above removed."""
    ascii_only = [ch for ch in text if ord(ch) < 128]
    return ''.join(ascii_only)
                   
def replacenon_ascii_w_space (text):
    """Return `text` with every character whose code point is 128 or above replaced by a space."""
    pieces = []
    for ch in text:
        pieces.append(ch if ord(ch) < 128 else ' ')
    return ''.join(pieces)

# Strip non-ASCII characters from every cleaned description.
df['ml_text'] = df['ml_text'].apply(remove_non_ascii)

### Stemming the textual descriptions
- We do this because we want to reduce each word to its stem, the shortest form that still carries its meaning.
- The classifiers can then treat different forms of the same word as a single feature, e.g. house, housing and houses all become hous

Import and create the stemmer

In [None]:
from nltk.stem import PorterStemmer

# One shared stemmer instance, reused for every description below.
porter_stemmer = PorterStemmer()

Run this for each description

In [None]:
# Stem every whitespace-separated token of the cleaned text.
df['stem_text'] = df['ml_text'].apply(
    lambda x: ' '.join(porter_stemmer.stem(y) for y in x.split()))

# Drop the intermediate columns. 'Unnamed: 0' and 'token_text' are not created
# anywhere in this notebook (presumably they come from the clerical CSV - TODO
# confirm), so errors='ignore' keeps the cell from failing with a KeyError when
# an input file lacks them.
df.drop(['Unnamed: 0', 'description_parsed', 'text', 'token_text', 'ml_text'],
        axis=1, inplace=True, errors='ignore')

In [None]:
# Preview the first rows of the prepared frame.
df.head()

### Preparation for Machine Learning
- Here we need to split the data into X and Y values
- The X being the textual descriptions, y being the 'truth' labels
- In training, the classifiers will use both to learn and make associations between what a restricted access property is and what it isn't

In [None]:
# Features: the stemmed descriptions as a unicode array.
X = df['stem_text'].values.astype('U')
# Targets: the restricted-access labels.
y = df['label'].values
# Record identifiers as unicode strings (not used further in this notebook).
z = df['id'].values.astype('U')

In [None]:
# Hold out 20% of the records for testing; the seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train the logistic-regression pipeline, then print the fitted pipeline
# (fit() returns the pipeline itself).
LR_trigram_clf.fit(train_X, train_y)
print(LR_trigram_clf)

In [None]:
# Mean accuracy of the fitted pipeline on the held-out test set.
print(LR_trigram_clf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation over the full data set (one score per fold)
scores = cross_val_score(estimator=LR_trigram_clf, X=X, y=y, cv=5)

In [None]:
# Per-fold scores followed by their average.
print("-----------------LR trigram pipeline------")
print(scores)
print(np.mean(scores))

In [None]:
# Highest- and lowest-weighted n-grams of the fitted logistic regression.
print(show_most_informative_features(LR_trigram_clf))

### SVM trigrams classifier

In [None]:
# Train the SVM pipeline, print the fitted pipeline, then its accuracy on the
# held-out test set.
svm_trigram_clf.fit(train_X, train_y)
print(svm_trigram_clf)
print(svm_trigram_clf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation of the SVM pipeline: per-fold scores, then the mean.
scores = cross_val_score(estimator=svm_trigram_clf, X=X, y=y, cv=5)

print("-----------------SVM trigram pipeline------")
print(scores)
print(np.mean(scores))

In [None]:
# Highest- and lowest-weighted n-grams of the fitted linear SVM.
print(show_most_informative_features(svm_trigram_clf))

In [None]:
# sklearn.learning_curve was removed from scikit-learn; the function lives in
# sklearn.model_selection, which this notebook already imports from.
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds. When omitted,
        5 folds are used (the value this function previously hard-coded).

    train_sizes : array-like
        Training-set sizes (fractions or absolute counts) passed through to
        sklearn's learning_curve.

    Returns
    -------
    None. The curve is drawn on a new matplotlib figure.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported matplotlib first.
    import matplotlib.pyplot as plt

    folds = 5 if cv is None else cv

    plt.figure()
    # Use the X passed by the caller (previously a global X_new was used and
    # the X and cv arguments were silently ignored).
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=folds, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Shaded bands: one standard deviation either side of the mean score.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid(True)
    if ylim:
        plt.ylim(ylim)
    plt.title(title)

In [None]:
import matplotlib.pyplot as plt
# sklearn.grid_search was removed from scikit-learn (GridSearchCV now lives in
# sklearn.model_selection); importing it aborted this cell. Nothing in the
# notebook references the grid_search name.
from sklearn import svm

# Vectorise the FULL corpus: X_new is paired with the full label vector y (see
# the shape checks below and plot_learning_curve), so it must have one row per
# record rather than one per test record.
count_vect = TfidfVectorizer()
X_new = count_vect.fit_transform(X)

# Row counts of all three should match.
print(X_new.shape)
print(y.shape)
print(X.shape)

### Naive Bayes trigrams

In [None]:
# Train the naive Bayes pipeline, print the fitted pipeline, then its accuracy
# on the held-out test set.
NB_trigram_clf.fit(train_X, train_y)
print(NB_trigram_clf)
print(NB_trigram_clf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation of the naive Bayes pipeline: per-fold scores, then the mean.
scores = cross_val_score(estimator=NB_trigram_clf, X=X, y=y, cv=5)

print("-----------------NB trigram pipeline------")
print(scores)
print(np.mean(scores))

In [None]:
# Highest- and lowest-weighted n-grams of the fitted naive Bayes model.
print(show_most_informative_features(NB_trigram_clf))

### Neural Networks

In [None]:
# Train the neural-network pipeline, print the fitted pipeline, then its
# accuracy on the held-out test set.
NN_trigram_clf.fit(train_X, train_y)
print(NN_trigram_clf)
print(NN_trigram_clf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation of the neural-network pipeline: per-fold scores, then the mean.
scores = cross_val_score(estimator=NN_trigram_clf, X=X, y=y, cv=5)

print("-----------------NN trigram pipeline------")
print(scores)
print(np.mean(scores))

### Stochastic Gradient Descent

In [None]:
# Train the SGD pipeline, print the fitted pipeline, then its accuracy on the
# held-out test set.
SGD_trigram_clf.fit(train_X, train_y)
print(SGD_trigram_clf)
print(SGD_trigram_clf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation of the SGD pipeline: per-fold scores, then the mean.
scores = cross_val_score(estimator=SGD_trigram_clf, X=X, y=y, cv=5)

print("-----------------SGD trigram pipeline------")
print(scores)
print(np.mean(scores))

In [None]:
# Highest- and lowest-weighted n-grams of the fitted SGD classifier.
print(show_most_informative_features(SGD_trigram_clf))

## Classification report

In [None]:
# SK Learn classification report
# sklearn.grid_search was removed from scikit-learn; importing it aborted this
# cell. Nothing in the notebook references the grid_search name.
from sklearn import svm

# Stand-alone TF-IDF + logistic regression model (separate from LR_trigram_clf).
count_vect = TfidfVectorizer()
X_transformed = count_vect.fit_transform(train_X)

# Vocabulary lookup; the result is discarded because it is not the last
# expression of the cell.
# NOTE(review): the text has been stemmed, so 'restricted' may never appear
# in this form - confirm the intended token.
count_vect.vocabulary_.get(u'restricted')

clf = LogisticRegression().fit(X_transformed, train_y)
X_new_counts = count_vect.transform(test_X)

# y_pred comes from the stand-alone model above ...
y_pred = clf.predict(X_new_counts)
# ... whereas the probabilities come from the trigram pipeline.
y_pred_prob = LR_trigram_clf.predict_proba(test_X)[:,1]
print(classification_report(test_y, y_pred))


In [None]:
# Persist the current `clf` object to clf.pickle.
with open('clf.pickle', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Dense view of the TF-IDF training matrix. toarray() is used because the `.A`
# shorthand is deprecated on SciPy sparse matrices and removed in recent
# releases.
X_transformed.toarray()

In [None]:
# Build the review table from ONE model. Previously Y_pred held the
# predictions of the stand-alone TF-IDF model while Y_pred_prob held the
# probabilities of the trigram pipeline, so a row's label and score could
# disagree. The module-level `y_pred` is left untouched for the later cells
# that use it.
y_pred_prob = LR_trigram_clf.predict_proba(test_X)[:,1]
_lr_pipeline_pred = LR_trigram_clf.predict(test_X)
LR_output = pd.DataFrame({'test_X':test_X,'Y_pred':_lr_pipeline_pred,'test_y':test_y,'Y_pred_prob':y_pred_prob})
# Most confident restricted-access predictions first.
sout = LR_output.sort_values(['Y_pred_prob', 'Y_pred'], ascending=[False, False])
sout.to_html("LRCLFtest.htm")

In [None]:
# Preview the first rows of the prediction table.
LR_output.head()

## Create plots
- ROC curve
- Calibration plot
- Confusion matrix

ROC curve - This is useful for understanding the trade-off between the true positive rate and the false positive rate

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc,brier_score_loss
from ggplot import *

# Probability of the positive class from the LR pipeline, computed once and
# shared by the ROC curve and the Brier score (it used to be computed twice).
prob_pos = LR_trigram_clf.predict_proba(test_X)[:, 1]
preds = prob_pos
fpr, tpr, _ = metrics.roc_curve(test_y, preds)

# Points of the ROC curve, for plotting in the next cell.
df_pred = pd.DataFrame(dict(fpr=fpr, tpr=tpr))

# Stored as `roc_auc` so the `auc` function imported above is not overwritten
# by a float.
roc_auc = metrics.auc(fpr,tpr)
br=brier_score_loss(test_y, prob_pos)
print("Area under Curve: ",roc_auc,"   Brier Score (the lower the better) ",br)

In [None]:
# ROC curve (solid) with a dashed reference line drawn by geom_abline.
(ggplot(df_pred, aes(x='fpr', y='tpr'))
 + geom_line()
 + geom_abline(linetype='dashed'))

Calibration curve - Useful for showing how reliable the classifier's predicted probabilities are compared with those of a perfectly calibrated model

In [None]:
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
%matplotlib inline

In [None]:
# Top panel: reliability curve; bottom panel: histogram of predicted scores.
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
# The model plotted is the logistic-regression pipeline, so it is labelled as
# such (the legend used to read 'trigram SVM '). The loop variable is named
# `model` so the module-level `clf` is not overwritten.
for model, name in [(LR_trigram_clf, 'trigram LR')]:
    model.fit(train_X, train_y)
    if hasattr(model, "predict_proba"):
        prob_pos = model.predict_proba(test_X)[:, 1]
    else:  # use decision function, min-max scaled into [0, 1]
        prob_pos = model.decision_function(test_X)
        prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    fraction_of_positives, mean_predicted_value = \
        calibration_curve(test_y, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s" % (name, ))

    ax2.hist(prob_pos, range=(0, 1), bins=20, label=name,
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()
plt.show()

Confusion matrix to show True Positives/ False Positives/ False Negatives/ True Negatives

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools

In [None]:
# Raw confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(test_y, y_pred)
np.set_printoptions(precision=2)
print(cm)
# confusion_matrix orders classes by sorted label value, i.e. 0 then 1. Label 0
# is "not restricted access" in this notebook, so the negative class must be
# named first (the list used to be reversed, mislabelling both axes).
class_names = ['Negative (not Restricted Access)', 'Positive (Restricted Access)']


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : ndarray, shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool
        If True, each row is divided by its sum before printing and plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None. The matrix is printed and the figure is shown with plt.show().
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Use the labels and title supplied by the caller (they used to be
    # overridden by a global and a hard-coded string respectively).
    tick_marks = np.arange(len(classes))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cell annotations: white text on dark cells, black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center", fontsize= 20,
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Build the confusion matrix from the test labels and predictions
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(test_y, y_pred)

# Draw the row-normalised matrix on a fresh figure
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    normalize=True,
    title='Normalized confusion matrix',
)

In [None]:
# Show every parameter of the LR pipeline and its steps.
LR_trigram_clf.get_params()

### Performing grid search
- This will try different parameters for the model given specified metrics and return which parameters work the best.
- We can use this to tune the model in order to attain optimum performance

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from scipy.stats import randint as sp_randint
from time import time

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked hyper-parameter candidates of a search.

    `results` is indexed by 'rank_test_score', 'mean_test_score',
    'std_test_score' and 'params'. For each rank from 1 to `n_top`, every
    candidate holding that rank is printed with its mean score, standard
    deviation and parameters, followed by a blank line.
    """
    for rank in range(1, n_top + 1):
        matches = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matches:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            block = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    mean_score, std_score),
                "Parameters: {0}".format(results['params'][idx]),
                "",
            ]
            print("\n".join(block))

In [None]:
# Search grid for the LR pipeline. Parameters of a pipeline step are addressed
# as '<step name>__<parameter>'. The previous grid used bare 'C' plus the SVC
# parameters 'gamma' and 'kernel', none of which are parameters of this
# pipeline, so fitting the search would fail with an invalid-parameter error.
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [None]:
# Grid search over the LR pipeline using param_grid. Note that this rebinds
# `clf`; the search is only constructed here, not fitted.
clf = GridSearchCV(LR_trigram_clf, param_grid)

In [None]:
# 5-fold grid search over the regularisation strength C of a plain logistic
# regression.
_base_lr = LogisticRegression(C=1.0, intercept_scaling=1, dual=False,
                              fit_intercept=True, penalty='l2', tol=0.0001)
_c_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
clf_GS = GridSearchCV(estimator=_base_lr, param_grid=_c_grid, cv=5)

In [None]:
# Run the search on the TF-IDF training matrix.
clf_GS.fit(X_transformed, train_y)

Inspect the best parameters, estimator and score found by the grid search, to help tune the earlier models

In [None]:
# Parameter setting that achieved the best mean cross-validated score.
clf_GS.best_params_

In [None]:
# Estimator configured with the best parameters found.
clf_GS.best_estimator_

In [None]:
# Mean cross-validated score of the best parameter setting.
clf_GS.best_score_

In [None]:
# Names of the tunable parameters of the LR pipeline.
LR_trigram_clf.get_params().keys()

In [None]:
# Full per-candidate search results as a table.
cv_results = pd.DataFrame(clf_GS.cv_results_)
cv_results

In [None]:
# Names of the tunable parameters of the LR pipeline (same listing as shown earlier).
LR_trigram_clf.get_params().keys()

The next step is to open these in a new notebook, make predictions there and check that the results are the same
- To make that possible, we pickle the data and the classifier

In [None]:
# Persist the LR pipeline together with the full feature and label arrays.
_to_pickle = (
    ('trigram_LR.pickle', LR_trigram_clf),
    ('X.pickle', X),
    ('y.pickle', y),
)
for _path, _obj in _to_pickle:
    with open(_path, 'wb') as f:
        pickle.dump(_obj, f, pickle.HIGHEST_PROTOCOL)