# Introduction

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import nltk
import string
import spacy
en_core = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.tokenize.regexp import regexp_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion


In [12]:
train_df=pd.read_csv('./dataset/train_file.csv')
test_df=pd.read_csv('./dataset/test_file.csv')

In [42]:
train_df['Action Type'].unique()

array(['ADD/ALT', 'DEMOLITION', 'ALTER', 'NEW',
       'TREE/VEGETATION MAINT/RESTORE', 'NO CONSTRUCTION', nan, 'TEMP',
       'CURB CUT', 'GRADING', 'SHORELINE EXEMPTION ONLY',
       'SITE MONITORING ONLY', 'DECONSTRUCTION', 'RELOCATION',
       'FLOODPLAIN LICENSE ONLY', 'TREE PROTECTION EXEMPTION',
       'DRAINAGE APPROVAL'], dtype=object)

In [14]:
train_df.dtypes

Application/Permit Number            int64
Permit Type                         object
Address                             object
Description                         object
Action Type                         object
Work Type                           object
Applicant Name                      object
Application Date                    object
Issue Date                          object
Final Date                          object
Expiration Date                     object
Status                              object
Contractor                          object
Permit and Complaint Status URL     object
Master Use Permit                  float64
Latitude                           float64
Longitude                          float64
Location                            object
Category                            object
dtype: object

In [144]:
#### train
train_df['Description'] = train_df['Description'].fillna('')
train_df['clean_description']=[' '.join([w for w in x.lower().split() if w not in stop]) for x in train_df['Description'].tolist()]
##### test
test_df['Description'] = test_df['Description'].fillna('')
test_df['clean_description']=[' '.join([w for w in x.lower().split() if w not in stop]) for x in test_df['Description'].tolist()]


In [145]:

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '',string.punctuation))

def lemmatize(text):
    tokens=regexp_tokenize(text,pattern='\s+',gaps=True) #https://towardsdatascience.com/benchmarking-python-nlp-tokenizers-3ac4735100c5
    return ' '.join([lemmatizer.lemmatize(w) for w in tokens])
###### Train
train_df['clean_description']=train_df['clean_description'].apply(lemmatize)
train_df['clean_description']=train_df['clean_description'].apply(remove_punctuation)
print(train_df['clean_description'])

#### Test
test_df['clean_description']=test_df['clean_description'].apply(lemmatize)
test_df['clean_description']=test_df['clean_description'].apply(remove_punctuation)
print(train_df['clean_description'])
    

0        alteration single family residence portions ma...
1        change use permit restaurant residential const...
2        construct interior alteration existing office ...
3        need remove fire escape part building fire saf...
4        channel modification thornton creek stream res...
                               ...                        
33534       interior repair modification existing building
33535    construct south duplex units cd one surface pa...
33536    establish use construction single family resid...
33537    construction new 1923 sf single story wood fra...
33538    construct repair existing garage accessory exi...
Name: clean_description, Length: 33539, dtype: object
0        alteration single family residence portions ma...
1        change use permit restaurant residential const...
2        construct interior alteration existing office ...
3        need remove fire escape part building fire saf...
4        channel modification thornton creek stream res...
  

### Deep Learning for Training 
<!--  -->

In [149]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.layers import Flatten,LSTM,Bidirectional
from keras.layers import Embedding,SpatialDropout1D
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.callbacks import EarlyStopping

In [159]:
t = Tokenizer()
t.fit_on_texts(train_df['clean_description'])
vocab_size = len(t.word_index) + 1
print(vocab_size)
# integer encode the documents
encoded_docs = t.texts_to_sequences(train_df['clean_description'])
# print(encoded_docs)
# pad documents to a max length of 25 words
max_length = 25
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs.shape)

17118
(33539, 25)


In [165]:

############test Dta
test_t = Tokenizer()
test_t.fit_on_texts(test_df['clean_description'])
vocab_size_test = len(test_t.word_index) + 1
# integer encode the documents
encoded_test_docs = test_t.texts_to_sequences(test_df['clean_description'])
# print(encoded_docs)
# pad documents to a max length of 25 words
max_length = 25
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')
print(padded_test_docs.shape)

(22360, 25)


In [166]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('./dataset/glove.6B.100d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))


Loaded 400000 word vectors.


In [167]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [168]:
Y = pd.get_dummies(train_df['Category']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
Y[0]

Shape of label tensor: (33539, 5)
(30185, 10) (30185, 5)
(3354, 10) (3354, 5)


array([0, 0, 0, 0, 1], dtype=uint8)

In [172]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=25, trainable=False)
model.add(e)
# model.add(Flatten())
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(5, activation='relu'))
model.add(Activation('softmax'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, Y, epochs=100,validation_split=0.1)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, Y, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 100)           1711800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 1285      
_________________________________________________________________
activation_3 (Activation)    (None, 5)                 0         
Total params: 1,947,581
Trainable params: 235,781
Non-trainable params: 1,711,800
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 

Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Accuracy: 91.878110


In [None]:
pred=model.predict_classes(padded_test_docs)

In [68]:
LE_target=LabelEncoder()
y=train_df['Category']
y=LE_target.fit_transform(y)
labels=pred.flatten()
test_df['Category']=LE_target.inverse_transform(labels)

In [69]:
test_df

Unnamed: 0,Application/Permit Number,Permit Type,Address,Description,Action Type,Work Type,Applicant Name,Application Date,Issue Date,Final Date,Expiration Date,Status,Contractor,Permit and Complaint Status URL,Master Use Permit,Latitude,Longitude,Location,clean_description,Category
0,6425384,Construction,8348 18TH AVE NW,"Construct 6' retaining wall to create walkway,...",ADD/ALT,No plan review,"JOSSART, STEVE",2014-07-02T00:00:00,2014-07-02T00:00:00,2015-06-29T00:00:00,2016-01-02T00:00:00,Permit Closed,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.690123,-122.379795,"{'latitude': '47.69012273', 'human_address': '...",construct 6 retaining wall create walkway per ...,COMMERCIAL
1,6496502,Construction,3120 S FRONTENAC ST,Like for like repair of framing at porches of ...,ADD/ALT,No plan review,"RUNDLE, DANIEL",2015-11-12T00:00:00,2015-11-12T00:00:00,2016-02-10T00:00:00,2017-05-12T00:00:00,Permit Closed,SCHULTIS SUNDBERG INC.,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.540111,-122.290535,"{'latitude': '47.54011095', 'human_address': '...",like like repair framing porch existing 2 fami...,SINGLE FAMILY / DUPLEX
2,6622347,Demolition,4031 S WILLOW ST,Demolish existing single family dwelling. Subj...,DEMOLITION,No plan review,"WEBER, JULIAN",2017-10-04T00:00:00,,,,Reviews Completed,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,3024280.0,47.540470,-122.281410,"{'latitude': '47.54046991', 'human_address': '...",demolish existing single family dwelling subje...,SINGLE FAMILY / DUPLEX
3,6565685,Construction,3940 1ST AVE NE,Establish use as rowhouse and construct new du...,NEW,Plan Review,"NOVION, SHAUN",2017-02-14T00:00:00,2017-10-20T00:00:00,,2019-04-20T00:00:00,Permit Issued,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,3025762.0,47.654986,-122.327467,"{'latitude': '47.65498619', 'human_address': '...",establish use rowhouse construct new duplex su...,COMMERCIAL
4,6487370,Construction,5317 BALLARD AVE NW,Tenant improvement to existing women's boutiqu...,ADD/ALT,No plan review,DYNA CONTRACTING INC,2017-07-19T00:00:00,2017-07-19T00:00:00,2018-01-17T00:00:00,2019-01-19T00:00:00,Permit Closed,DYNA CONTRACTING INC,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.666714,-122.383900,"{'latitude': '47.66671437', 'human_address': '...",tenant improvement existing womens boutique su...,COMMERCIAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22355,6284796,Construction,6701 6TH AVE S,TI - add additional locker room,ADD/ALT,Plan Review,"FADDEN, BOB",,,,,,VIOX CORPORATION,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.543799,-122.326393,"{'latitude': '47.54379887', 'human_address': '...",ti add additional locker room,SINGLE FAMILY / DUPLEX
22356,6242102,Construction,2001 8TH AVE,Non-structural building improvement on floor 3...,ADD/ALT,Plan Review,"TULLY, TRICIA",,,,,Initial Information Collected,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.615789,-122.337271,"{'latitude': '47.61578904', 'human_address': '...",nonstructural building improvement floor 3r ad...,COMMERCIAL
22357,6593264,Construction,3200 SW 97TH ST,Interior alterations to existing single family...,ADD/ALT,No plan review,"VEIT, TYLER",2017-04-24T00:00:00,2017-04-24T00:00:00,,2018-10-24T00:00:00,Permit Issued,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.516830,-122.373745,"{'latitude': '47.51683006', 'human_address': '...",interior alteration existing single family res...,SINGLE FAMILY / DUPLEX
22358,6440664,Construction,4140 ROOSEVELT WAY NE,"Re-roof as overlay to commercial structure, su...",ADD/ALT,No plan review,"NELSON, LANE",2014-10-09T00:00:00,2014-10-09T00:00:00,2016-04-04T00:00:00,2016-04-09T00:00:00,Permit Closed,,{'url': 'http://web6.seattle.gov/dpd/PermitSta...,,47.657730,-122.317570,"{'latitude': '47.65773041', 'human_address': '...",reroof overlay commercial structure subject fi...,COMMERCIAL


In [70]:
NLP_model=pd.DataFrame(test_df,columns=['Application/Permit Number','Category'])
NLP_model.to_csv('submission1.csv')

In [171]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=25, trainable=False)
model.add(e)
# model.add(Flatten())
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='relu'))
model.add(Activation('softmax'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, Y, epochs=5,validation_split=0.1)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, Y, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 100)           1711800   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
_________________________________________________________________
activation_2 (Activation)    (None, 5)                 0         
Total params: 1,829,693
Trainable params: 117,893
Non-trainable params: 1,711,800
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 29.154119
