In [1]:
import numpy as np
import pandas as pd
import itertools
from pandas import DataFrame
import re
import os
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, GRU, Embedding, Dropout, MaxPooling1D, Conv1D, AveragePooling1D, Flatten,Bidirectional,BatchNormalization, LSTM, SpatialDropout1D
from keras.initializers import TruncatedNormal
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource the preprocessing below relies on, not just WordNet:
#   wordnet   -> WordNetLemmatizer
#   stopwords -> stopwords.words('english')
#   punkt     -> nltk.word_tokenize
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sebas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:

# Stop-word set and lemmatizer are identical for every call, so they are built
# once (lazily, on first use) instead of once per row.
_NLTK_RESOURCE_CACHE = {}


def _get_stopset_and_lemmatizer():
    """Return the cached English stop-word set and WordNet lemmatizer, creating them on first use."""
    if not _NLTK_RESOURCE_CACHE:
        stopset = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both only after both were created, so a failure leaves the cache empty.
        _NLTK_RESOURCE_CACHE.update(stopset=stopset, lemmatizer=lemmatizer)
    return _NLTK_RESOURCE_CACHE["stopset"], _NLTK_RESOURCE_CACHE["lemmatizer"]


def removeStopWords_Lemmatize(s):
    """Lower-case, filter and lemmatize the tokens of a text.

    The text is tokenized with ``nltk.word_tokenize``. A token is kept only if
    it is longer than two characters and its lower-cased form is not an
    English stop word. Kept tokens are lower-cased and lemmatized with
    ``WordNetLemmatizer`` (default part of speech).

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (``""`` if none survive).
    """
    stopset, lemmatizer = _get_stopset_and_lemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(s):
        lowered = token.lower()
        # Length is checked on the original token, as before.
        if len(token) > 2 and lowered not in stopset:
            lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)


In [3]:

# A run of three or more identical characters, excluding digits (so numbers such
# as "1000" are left intact) and newlines (which the original "." never matched).
_LENGTHENING_PATTERN = re.compile(r"([^\d\n])\1{2,}")


def reduce_lengthening(text):
    """Collapse exaggerated character repetitions down to two characters.

    Any non-digit character repeated three or more times in a row is reduced
    to exactly two occurrences, e.g. ``"Yesssss"`` -> ``"Yess"``. Digit runs
    are deliberately not touched: collapsing them would silently change
    numeric values such as prices (``"1000"`` would become ``"100"``).

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    str
        The text with stretched character runs shortened.
    """
    return _LENGTHENING_PATTERN.sub(r"\1\1", text)


In [4]:
# Load the scraped Gumtree listings and preview the first two rows.
SCRAPED_ADS_CSV = "gumtree Scrapping data.csv"
data = pd.read_csv(SCRAPED_ADS_CSV)
data.head(n=2)

Unnamed: 0.1,Unnamed: 0,Title,Description,Price,Location,Ad Link,Category,Latitude,Longitude
0,0,Maple dining table with chairs (matching Side ...,Beautiful Solid Queensland fiddleback maple di...,$800 negotiable,"Bayside Area, Brighton",/s-ad/brighton/antiques/maple-dining-table-wit...,antiques-art-collectables,-37.9081962,144.9957991
1,1,Art Deco painting (from original).,Art Deco painting on board with frame from ori...,$600,"Glen Eira Area, Bentleigh",/s-ad/bentleigh/art/art-deco-painting-from-ori...,antiques-art-collectables,-37.9185101,145.0409246


In [5]:
# Fill null values with self-explanatory placeholder text. Title is included as
# well: the regex clean-up below needs strings and would raise a TypeError on NaN.
TEXT_COLUMN_PLACEHOLDERS = {
    "Title": "Untitled",
    "Description": "Undescribed",
    "Price": "Unpriced",
}

# Replace every run of non-word characters with a single space.
_NON_WORD_RUN = re.compile(r'\W+')

for _column, _placeholder in TEXT_COLUMN_PLACEHOLDERS.items():
    data[_column] = (
        data[_column]
        .fillna(_placeholder)
        # str() guards against stray non-string cells (e.g. a purely numeric price).
        .map(lambda value: _NON_WORD_RUN.sub(' ', str(value)))
    )

In [6]:
# Domain-specific filler words stripped from the ad text before NLTK stop-word
# removal. Matching is case-sensitive and happens before lower-casing, which is
# why several words appear in more than one casing.
# Fix: 'an' and 'very' were previously written as 'an''very' (missing comma),
# which Python concatenates into the single, never-matching entry 'anvery'.
# NOTE(review): the last entry looks like a mis-decoded "I’m" (UTF-8 read as
# cp1252); it is kept verbatim in case the scraped text contains the same bytes.
banned = ['and', 'to', 'the', 'a', 'for','with', 'in','is', 'of', 'or', 'on', 'you', 'are', 'from', 'up', 'available'
               ,'The', 'Have', 'all', 'at', 'as', 'condition' , 's', 'I', 'your', 'can',  'our', 'new',
                'it', 'be', 'We','x', 'This','has','will','only','this', 'New', 'pick','my','an', 'very', 'if',
               'Please', 'also','more','no', 'but', 'Size', 'Melbourne', 'been', 'A', '7', 'store', 'Brand', 'sale', 'any'
               ,'great','It', 'well', 'price','All', 'us', 'me', 'so', 'just', 'Australia', 't', 'Hey', "i'm", 'hey','hi'
         , 'Hi',"Iâ€™m"]

In [8]:
def f(x):
    """Return `x` with every whitespace-separated token that appears in `banned` removed."""
    kept_tokens = []
    for token in x.split():
        if token not in banned:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [11]:
# Concatenate the Title, Description and Price columns into one text field.
combined_text = (
    data["Title"].apply(str)
    + " " + data["Description"].apply(str)
    + " " + data["Price"].apply(str)
)

# Clean the combined text in three passes:
#   1. drop the banned filler words,
#   2. drop stop words and lemmatize,
#   3. shorten stretched words such as "Yesssss".
data["X"] = (
    combined_text
    .astype(str)
    .apply(f)
    .apply(removeStopWords_Lemmatize)
    .apply(reduce_lengthening)
)



0       Maple dining table chairs matching Side Board ...
1       Art Deco painting original Art Deco painting b...
2       Art Deco painting canvas Art Deco painting can...
3       Horse Racing Memorabilia Black Caviar Black ca...
4       Pokemon Cards Wanted Cash Paid Same Day Pickup...
                              ...                        
2402    BMX STYLE BIKE got BMX style bike 20 Weels col...
2403    Everlast Precision Leather Double Ended Strike...
2404    Mag Bike Dual Purpose Bodyworx ADPE 8 Levels R...
2405    UNDER ARMOUR mens compression shorts M NEW WIT...
2406    Everlast Adjustable Speedball Platform In Box ...
Name: X, Length: 2407, dtype: object

In [None]:
# Model inputs (cleaned text) and raw category labels.
x = data["X"]
targets = data["Category"].values

# Characters stripped by the tokenizer before splitting on whitespace.
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n'

# The vocabulary is built from the full corpus (this cell runs before the
# train/test split).
tokenizer = Tokenizer(lower=True, filters=TOKEN_FILTERS)
tokenizer.fit_on_texts(x)

# One extra slot because word indices start at 1.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Hold out 20% of the ads for evaluation (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = tts(x, targets, random_state=4, test_size=0.2)

# Every ad is converted to word indices and padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 250

x_tr = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_SEQUENCE_LENGTH)
x_ts = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_SEQUENCE_LENGTH)



In [None]:
def prepare_targets(y_train, y_test):
    """Label-encode and one-hot encode the train and test targets.

    The ``LabelEncoder`` is fitted on ``y_train`` only, so a label that occurs
    solely in ``y_test`` makes ``le.transform`` raise ``ValueError``.

    Both one-hot matrices are built with the same, explicit number of classes
    (the classes seen in ``y_train``). Without it ``to_categorical`` infers the
    width from the largest index present, so a test split missing the
    highest-indexed class would come out narrower than the training matrix.

    Parameters
    ----------
    y_train, y_test : array-like of labels
        Raw class labels for the training and test samples.

    Returns
    -------
    (y_train_enc, y_test_enc)
        One-hot encoded targets, each with ``len(le.classes_)`` columns.
    """
    le = LabelEncoder()
    le.fit(y_train)
    n_classes = len(le.classes_)
    y_train_enc = np_utils.to_categorical(le.transform(y_train), num_classes=n_classes)
    y_test_enc = np_utils.to_categorical(le.transform(y_test), num_classes=n_classes)
    return y_train_enc, y_test_enc



In [None]:
# One-hot encode both label arrays. This rebinds y_train / y_test, so the cell
# is not idempotent: re-run the train/test split cell before running it again.
encoded_train, encoded_test = prepare_targets(y_train, y_test)
y_train, y_test = encoded_train, encoded_test

In [None]:


embedding_dim = 100

# Derive the model's input/output sizes from the prepared arrays instead of
# hard-coding them, so the network always matches the padded sequence length
# and the number of one-hot target columns.
sequence_length = x_tr.shape[1]
n_classes = y_train.shape[1]

# Embedding -> 1D convolution -> average + max pooling -> GRU -> softmax.
model2 = Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim= embedding_dim, input_length=sequence_length, embeddings_initializer='TruncatedNormal'))
model2.add(Conv1D(200, kernel_size=10,strides=1,activation='relu' ))
model2.add(AveragePooling1D(pool_size=8, strides=None, padding='same'))
model2.add(MaxPooling1D(pool_size=5, strides=None, padding='same'))
model2.add(GRU(256, recurrent_dropout=0.3))
#model2.add(Dropout(0.2))
model2.add(Dense(n_classes, activation='softmax'))
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model2.summary()



model2.fit(x_tr, y_train,epochs=5,verbose=1,batch_size=32)

# Accuracy on the held-out split (x_ts / y_test); it is not used during training.
_, val_acc2 = model2.evaluate(x_ts, y_test, verbose=0)
print('validation_accuracy is: ', val_acc2)

