In [1]:
import numpy as np
import pandas as pd
import itertools
from pandas import DataFrame
import re
import os
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, GRU, Embedding, Dropout, MaxPooling1D, Conv1D, AveragePooling1D, Flatten,Bidirectional,BatchNormalization, LSTM, SpatialDropout1D
from keras.initializers import TruncatedNormal
from collections import Counter
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the cleaning step below:
#   wordnet   -> WordNetLemmatizer
#   stopwords -> stopwords.words('english')
#   punkt     -> nltk.word_tokenize (tokenizer models; without them a fresh
#                environment raises LookupError on the first tokenize call)
# NOTE(review): newer NLTK releases may expect 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sebas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sebas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#cleaning words
def removeStopWords_Lemmatize(s):
    """Lower-case, filter and lemmatize the words of ``s``.

    The text is split with ``nltk.word_tokenize``. A token is kept only if
    it is longer than two characters and its lower-cased form is not an
    English stop word. Kept tokens are lower-cased, lemmatized with
    ``WordNetLemmatizer`` and joined back with single spaces.

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated lemmas of the surviving tokens.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for raw_token in nltk.word_tokenize(s):
        lowered = raw_token.lower()
        # Skip stop words and very short tokens (length is measured on the raw token)
        if lowered in english_stopwords or len(raw_token) <= 2:
            continue
        lemmas.append(wordnet.lemmatize(lowered))
    return " ".join(lemmas)


In [3]:

def reduce_lengthening(text):
    """Shorten runs of a repeated character to exactly two occurrences.

    Any character repeated three or more times in a row (e.g. the "s" in
    "Yesssss") is collapsed to two copies of it; runs of one or two are
    left unchanged.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with elongated character runs trimmed.
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)


In [4]:
# Load the scraped listings and preview the first two rows
csv_path = "gumtree Scrapping data.csv"
data = pd.read_csv(csv_path)
data.head(2)

Unnamed: 0.1,Unnamed: 0,Title,Description,Price,Location,Ad Link,Category,Latitude,Longitude
0,0,Maple dining table with chairs (matching Side ...,Beautiful Solid Queensland fiddleback maple di...,$800 negotiable,"Bayside Area, Brighton",/s-ad/brighton/antiques/maple-dining-table-wit...,antiques-art-collectables,-37.9081962,144.9957991
1,1,Art Deco painting (from original).,Art Deco painting on board with frame from ori...,$600,"Glen Eira Area, Bentleigh",/s-ad/bentleigh/art/art-deco-painting-from-ori...,antiques-art-collectables,-37.9185101,145.0409246


In [5]:
# Fill the null values with some self-explanatory placeholder text
missing_placeholders = {"Description": "Undescribed", "Price": "Unpriced"}
for column, placeholder in missing_placeholders.items():
    data[column] = data[column].fillna(placeholder)

# Replace every run of non-word characters with a single space
for column in ("Title", "Description", "Price"):
    data[column] = data[column].apply(lambda text: re.sub(r'\W+', ' ', text))

In [6]:
# Words to strip from the listing text before modelling.
# Matching is exact and case-sensitive, hence the separate upper/lower-case
# variants of some words.
# Fix: 'an' and 'very' were previously written as 'an''very' (missing comma),
# which Python concatenates into the single string 'anvery', so neither word
# was actually filtered.
banned = [
    'and', 'to', 'the', 'a', 'for', 'with', 'in', 'is', 'of', 'or', 'on',
    'you', 'are', 'from', 'up', 'available',
    'The', 'Have', 'all', 'at', 'as', 'condition', 's', 'I', 'your', 'can',
    'our', 'new',
    'it', 'be', 'We', 'x', 'This', 'has', 'will', 'only', 'this', 'New',
    'pick', 'my', 'an', 'very', 'if',
    'Please', 'also', 'more', 'no', 'but', 'Size', 'Melbourne', 'been', 'A',
    'store', 'Brand', 'sale', 'any',
    'great', 'It', 'well', 'price', 'All', 'us', 'me', 'so', 'just',
    'Australia', 't', 'Hey', "i'm", 'hey', 'hi',
    'Hi', "I’m", 'negotiable',
]

In [7]:
def f(x):
    """Return ``x`` with every whitespace-separated token found in ``banned`` removed.

    The remaining tokens are joined back with single spaces.
    """
    kept_tokens = [item for item in x.split() if item not in banned]
    return ' '.join(kept_tokens)

In [8]:
# Concatenate the Title, Description and Price columns (space-separated)
# into a single text feature column "X"
title_text = data["Title"].apply(str)
description_text = data["Description"].apply(str)
price_text = data["Price"].apply(str)
data["X"] = title_text + " " + description_text + " " + price_text


In [9]:
# One big string containing the text of every row, used for the word count below
out = ' '.join(data["X"].astype(str).tolist())

In [10]:
# Word count to check which words are irrelevant for the model.
# The pattern is a raw string: '\w' inside a plain literal is an invalid
# escape sequence and triggers a DeprecationWarning/SyntaxWarning.
words = re.findall(r'\w+', out)
print(Counter(words).most_common(100))

[('and', 3152), ('to', 2012), ('for', 1794), ('the', 1724), ('a', 1691), ('with', 1688), ('in', 1599), ('is', 1297), ('of', 1192), ('2', 769), ('or', 705), ('on', 702), ('you', 647), ('are', 617), ('from', 611), ('1', 594), ('up', 585), ('5', 563), ('not', 528), ('available', 518), ('4', 495), ('The', 489), ('3', 483), ('s', 483), ('all', 452), ('condition', 447), ('as', 440), ('at', 439), ('have', 436), ('I', 419), ('your', 405), ('negotiable', 395), ('new', 391), ('can', 387), ('6', 380), ('our', 358), ('x', 357), ('it', 338), ('be', 330), ('We', 330), ('listed', 310), ('10', 309), ('New', 291), ('8', 289), ('This', 264), ('has', 258), ('only', 250), ('will', 249), ('by', 237), ('this', 235), ('9', 229), ('Brand', 223), ('Pick', 222), ('7', 222), ('12', 219), ('good', 217), ('we', 215), ('one', 214), ('Size', 210), ('that', 207), ('pick', 199), ('Melbourne', 196), ('sale', 195), ('50', 195), ('Box', 194), ('15', 193), ('an', 192), ('very', 190), ('my', 189), ('20', 188), ('no', 186),

In [11]:
# Clean the combined text column in three passes:
#   1. drop the banned words (f)
#   2. remove stop words / short tokens and lemmatize
#   3. fix the extra letters like Yesssss
data["X"] = (
    data["X"]
    .astype(str)
    .apply(f)
    .apply(removeStopWords_Lemmatize)
    .apply(reduce_lengthening)
)

In [12]:
# Features are the cleaned text; targets are the category labels
x = data["X"]
targets = data["Category"].values

# NOTE(review): the vocabulary is fitted on all rows, including those that are
# later held out as the test split.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n',
    lower=True,
)
tokenizer.fit_on_texts(x)

# One more than the number of distinct words; used as the Embedding input_dim
vocab_size = len(tokenizer.word_index) + 1

In [13]:
# Tokenize words for the model.
# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = tts(x, targets, test_size=0.2, random_state=4)

# Convert each text to a sequence of word indices, then pad to a fixed length
max_len = 250
x_tr, x_ts = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
    for texts in (x_train, x_test)
)

y_train.shape

(1925,)

In [14]:
# set up transformations
def prepare_targets(y_train, y_test):
    """One-hot encode the class labels of the train and test splits.

    A ``LabelEncoder`` is fitted on ``y_train`` only and then applied to both
    splits. Both encoded vectors are expanded to one-hot matrices with the
    same number of columns (one per class seen in ``y_train``), so the two
    matrices always agree in width even when a class is absent from one of
    the splits.

    Parameters
    ----------
    y_train : array-like
        Training labels.
    y_test : array-like
        Test labels. Every label must also occur in ``y_train``; otherwise
        ``LabelEncoder.transform`` raises ``ValueError``.

    Returns
    -------
    tuple
        ``(y_train_enc, y_test_enc)`` one-hot encoded label matrices.
    """
    le = LabelEncoder()
    le.fit(y_train)
    # Pin the matrix width; by default to_categorical infers it from the
    # largest index present, which can differ between the two splits.
    num_classes = len(le.classes_)
    y_train_enc = np_utils.to_categorical(le.transform(y_train), num_classes=num_classes)
    y_test_enc = np_utils.to_categorical(le.transform(y_test), num_classes=num_classes)
    return y_train_enc, y_test_enc



In [15]:
# Replace the string labels with their one-hot encoded matrices.
# From here on y_train / y_test hold the encoded targets used by the network.
y_train, y_test = prepare_targets(y_train, y_test)
y_train.shape

(1925, 12)

In [16]:
# Neural network: Embedding -> Conv1D -> average/max pooling -> GRU -> softmax

embedding_dim = 100
# Take the input length and number of classes from the prepared arrays rather
# than repeating literals, so the model stays in sync with the padding length
# and the label encoding.
sequence_length = x_tr.shape[1]
num_classes = y_train.shape[1]

model2 = Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length, embeddings_initializer='TruncatedNormal'))
model2.add(Conv1D(200, kernel_size=10, strides=1, activation='relu'))
model2.add(AveragePooling1D(pool_size=8, strides=None, padding='same'))
model2.add(MaxPooling1D(pool_size=5, strides=None, padding='same'))
model2.add(GRU(256, recurrent_dropout=0.3))
model2.add(Dense(num_classes, activation='softmax'))
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model2.summary()

# Train on the padded training sequences
model2.fit(x_tr, y_train, epochs=5, verbose=1, batch_size=32)

# Accuracy on the held-out split (x_ts / y_test)
_, val_acc2 = model2.evaluate(x_ts, y_test, verbose=0)
print('validation_accuracy is: ', val_acc2)



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          1169100   
_________________________________________________________________
conv1d (Conv1D)              (None, 241, 200)          200200    
_________________________________________________________________
average_pooling1d (AveragePo (None, 31, 200)           0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 7, 200)            0         
_________________________________________________________________
gru (GRU)                    (None, 256)               351744    
_________________________________________________________________
dense (Dense)                (None, 12)                3084      
Total params: 1,724,128
Trainable params: 1,724,128
Non-trainable params: 0
______________________________________________

In [18]:
#import packages for Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [19]:
# List of distinct categories, in order of first appearance in `targets`
# (dict keys keep insertion order, so this de-duplicates without sorting)
singletargets = list(dict.fromkeys(targets))
singletargets


['antiques-art-collectables',
 'baby-children',
 'boats-jet-skis',
 'books-music-games',
 'automotive',
 'clothing-jewellery',
 'electronics-computer',
 'home-garden',
 'pets',
 'real-estate',
 'services-for-hire',
 'sport-fitness']

In [20]:
# Split train and test again (same seed as before) so that y_train / y_test
# hold the original category labels rather than the one-hot matrices.
x_train, x_test, y_train, y_test = tts(x, targets, test_size=0.2, random_state=4)

# NOTE(review): x_tr / x_ts are recomputed here but the Naive Bayes cells below
# work on the raw text in x_train / x_test.
max_len = 250
x_tr, x_ts = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
    for texts in (x_train, x_test)
)

y_train.shape

(1925,)

In [21]:
# Build the document-term matrices: the vocabulary is learned from the
# training texts only and then reused to transform the test texts
cv = CountVectorizer(stop_words='english')
train_dtm = cv.fit_transform(x_train)
test_dtm = cv.transform(x_test)

# Fit the multinomial Naive Bayes classifier on the training counts
nb = MultinomialNB().fit(train_dtm, y_train)
nb

MultinomialNB()

In [22]:
# Predict the test split and report the accuracy as a percentage
predicted = nb.predict(test_dtm)
score = nb.score(test_dtm, y_test)
print('Accuracy of Naive Bayes :', score*100.0, sep='\n')

Accuracy of Naive Bayes :
85.68464730290457


In [23]:
# Per-category precision, recall, F1-score and support on the test split
from sklearn import metrics
report = metrics.classification_report(y_true=y_test, y_pred=predicted)
print(report)


                           precision    recall  f1-score   support

antiques-art-collectables       0.82      0.75      0.78        36
               automotive       0.90      0.98      0.94        46
            baby-children       0.80      0.81      0.80        43
           boats-jet-skis       0.90      0.97      0.94        38
        books-music-games       0.93      0.70      0.80        37
       clothing-jewellery       0.88      0.86      0.87        35
     electronics-computer       0.90      0.94      0.92        49
              home-garden       0.76      0.78      0.77        32
                     pets       0.82      0.91      0.86        34
              real-estate       0.90      0.94      0.92        47
        services-for-hire       0.89      0.80      0.85        41
            sport-fitness       0.77      0.77      0.77        44

                 accuracy                           0.86       482
                macro avg       0.86      0.85      0.85    

In [24]:
# Save the fitted Naive Bayes model as a pickle object.
# The context manager closes the file even if dump() raises, and the handle
# no longer rebinds `f`, which is the banned-word filter defined earlier.
# NOTE(review): only the classifier is saved here; the fitted CountVectorizer
# `cv` is not written out — confirm the prediction program has a matching one.
import pickle
with open('my_classifier.pickle', 'wb') as model_file:
    pickle.dump(nb, model_file)

In [25]:
# Load the model to use in the prediction program.
# The context manager closes the file even if load() raises, and the handle
# no longer rebinds `f`, which is the banned-word filter defined earlier.
# WARNING: pickle.load can execute arbitrary code; only load files you created.
with open('my_classifier.pickle', 'rb') as model_file:
    nb = pickle.load(model_file)