In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import string
import gensim
import operator
import re

In [3]:
data = pd.read_csv('sem.csv')


In [4]:
data.head()

Unnamed: 0,No,Sentence,Buy/Sell,Condition,Brand,Model,Type,Color,Year
0,1,I want to buy brandnew toyota corolla black ca...,buy,brandnew,toyota,corolla,car,black,2008
1,2,I want to buy brandnew bmw x1 white suv which ...,buy,brandnew,bmw,x1,suv,white,2007
2,3,I want to buy recondition benz c200 green car ...,buy,recondition,benz,c200,car,green,2017
3,4,I want to buy brandnew audi a1 ash car which m...,buy,brandnew,audi,a1,car,ash,2007
4,5,I want to sell brandnew bmw x3 black suv which...,sell,brandnew,bmw,x3,suv,black,2007


In [5]:
'''
Load the dataset from the CSV and save it to 'data_text'
'''
import pandas as pd

data = pd.read_csv('sem.csv', error_bad_lines=False);

# We only need the Headlines text column from the data
data_text = data[:300000][['Sentence']];

data_text['index'] = data_text.index

documents = data_text

In [6]:
'''
Get the total number of documents
'''
print(len(documents))

500


In [7]:
documents[:500]

Unnamed: 0,Sentence,index
0,I want to buy brandnew toyota corolla black ca...,0
1,I want to buy brandnew bmw x1 white suv which ...,1
2,I want to buy recondition benz c200 green car ...,2
3,I want to buy brandnew audi a1 ash car which m...,3
4,I want to sell brandnew bmw x3 black suv which...,4
...,...,...
495,I want to sell recondition toyota KDH blue car...,495
496,I want to buy brandnew benz c180 ash car which...,496
497,I want to sell brandnew bmw x3 black suv which...,497
498,I want to buy brandnew audi a3 white car which...,498


In [8]:
'''
Loading Gensim and nltk libraries
'''
# !pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [8]:
print(WordNetLemmatizer().lemmatize('bought', pos = 'v'))

buy


In [9]:
stemmer = SnowballStemmer("english")

original_words = ['want','buy','corolla','toyota','bmw','audi','benz','black','green','i','sell','KDH','2008','2009','2007','2010','2011','2012','2103','2014','2015','2016','2017','2018','2019','2020','2021','brandnew','recondition','red','white','x1','c200','c180']

singles = [stemmer.stem(plural) for plural in original_words]

pd.DataFrame(data={'original word':original_words, 'stemmed':singles })

Unnamed: 0,original word,stemmed
0,want,want
1,buy,buy
2,corolla,corolla
3,toyota,toyota
4,bmw,bmw
5,audi,audi
6,benz,benz
7,black,black
8,green,green
9,i,i


In [10]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    
    result = []
    
    for token in gensim.utils.simple_preprocess(text) :
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            
            # TODO: Apply lemmatize_stemming() on the token, then add to the results list
            result.append(lemmatize_stemming(token))
    
    return result

In [11]:
'''
Preview a document after preprocessing
'''
document_num = 20
doc_sample = documents[documents['index'] == document_num].values[0][0]

print("Original document: ")

words = []

for word in doc_sample.split(' '):
    words.append(word)
    
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['I', 'want', 'to', 'buy', 'brandnew', 'toyota', 'KDH', 'black', 'car', 'which', 'made', 'in', '2008']


Tokenized and lemmatized document: 
['want', 'brandnew', 'toyota', 'black']


In [12]:

# TODO: preprocess all the headlines, saving the list of results as 'processed_docs'
processed_docs = documents['Sentence'].map(preprocess)

In [13]:
'''
Preview 'processed_docs'
'''
processed_docs[:500]

0      [want, brandnew, toyota, corolla, black]
1                       [want, brandnew, white]
2                 [want, recondit, benz, green]
3                        [want, brandnew, audi]
4                 [want, sell, brandnew, black]
                         ...                   
495        [want, sell, recondit, toyota, blue]
496                      [want, brandnew, benz]
497               [want, sell, brandnew, black]
498               [want, brandnew, audi, white]
499        [want, sell, brandnew, toyota, yari]
Name: Sentence, Length: 500, dtype: object

In [14]:

'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [15]:
count = 0

for k, v in dictionary.iteritems():
    
    print(k, v)
    
    count += 1
    
    if count > 500:
        break

0 black
1 brandnew
2 corolla
3 toyota
4 want
5 white
6 benz
7 green
8 recondit
9 audi
10 sell
11 allion
12 carb
13 yari
14 blue


In [16]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 5 times
- words appearing in more than 50% of all documents
'''
# TODO: apply dictionary.filter_extremes() with the parameters mentioned above
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [18]:
bow_corpus[document_num]

[(0, 1), (2, 1)]

In [23]:
'''
Preview BOW for our sample preprocessed document
'''
# Here document_num is document number 4310 which we have checked in Step 2
bow_doc_500 = bow_corpus[document_num]

for i in range(len(bow_doc_500)):
    
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_500[i][0], 
                                                     dictionary[bow_doc_500[i][0]], 
                                                     bow_doc_500[i][0]))

Word 0 ("black") appears 0 time.
Word 2 ("toyota") appears 2 time.


In [24]:
'''
Create tf-idf model object using models.TfidfModel on 'bow_corpus' and save it to 'tfidf'
'''
from gensim import corpora, models

# TODO
tfidf = models.TfidfModel(bow_corpus)
print(tfidf)

TfidfModel(num_docs=500, num_nnz=1201)


In [22]:
'''
Apply transformation to the entire corpus and call it 'corpus_tfidf'
'''
# TODO
corpus_tfidf = tfidf[bow_corpus]
print(corpus_tfidf[1])

[(3, 1.0)]


In [23]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''
from pprint import pprint

for doc in corpus_tfidf:
    
    pprint(doc)
    
    break

[(0, 0.44891263932132824), (1, 0.7726285553324562), (2, 0.44891263932132824)]


In [24]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=10, 
                                       id2word = dictionary, 
                                       passes = 2, 
                                       workers=2)

In [25]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.230*"white" + 0.191*"sell" + 0.176*"recondit" + 0.150*"toyota" + 0.148*"allion" + 0.058*"audi" + 0.033*"green" + 0.006*"black" + 0.003*"yari" + 0.003*"carb"


Topic: 1 
Words: 0.291*"sell" + 0.260*"toyota" + 0.206*"yari" + 0.057*"recondit" + 0.030*"benz" + 0.030*"corolla" + 0.027*"green" + 0.024*"audi" + 0.024*"black" + 0.022*"allion"


Topic: 2 
Words: 0.395*"recondit" + 0.262*"green" + 0.161*"benz" + 0.121*"audi" + 0.025*"sell" + 0.011*"toyota" + 0.009*"white" + 0.007*"corolla" + 0.005*"black" + 0.003*"yari"


Topic: 3 
Words: 0.241*"sell" + 0.234*"toyota" + 0.224*"recondit" + 0.211*"corolla" + 0.018*"audi" + 0.018*"green" + 0.011*"black" + 0.011*"carb" + 0.011*"yari" + 0.008*"white"


Topic: 4 
Words: 0.530*"white" + 0.182*"audi" + 0.139*"benz" + 0.129*"recondit" + 0.010*"black" + 0.003*"toyota" + 0.003*"corolla" + 0.001*"sell" + 0.001*"green" + 0.001*"yari"


Topic: 5 
Words: 0.469*"audi" + 0.199*"sell" + 0.197*"green" + 0.100*"black" + 0.016*"recondit" + 0.012*"

In [26]:
'''
Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
'''
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=10, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)

In [27]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.585*"white" + 0.153*"benz" + 0.060*"audi" + 0.048*"black" + 0.044*"sell" + 0.035*"recondit" + 0.025*"yari" + 0.021*"toyota" + 0.019*"carb" + 0.009*"allion"


Topic: 1 Word: 0.191*"white" + 0.187*"black" + 0.167*"audi" + 0.084*"recondit" + 0.082*"benz" + 0.082*"sell" + 0.077*"corolla" + 0.067*"toyota" + 0.033*"green" + 0.020*"allion"


Topic: 2 Word: 0.333*"audi" + 0.296*"green" + 0.152*"recondit" + 0.092*"white" + 0.084*"sell" + 0.008*"carb" + 0.008*"toyota" + 0.008*"benz" + 0.006*"yari" + 0.006*"allion"


Topic: 3 Word: 0.347*"black" + 0.239*"sell" + 0.101*"white" + 0.086*"audi" + 0.059*"benz" + 0.052*"recondit" + 0.040*"green" + 0.027*"toyota" + 0.024*"corolla" + 0.020*"allion"


Topic: 4 Word: 0.399*"recondit" + 0.261*"benz" + 0.160*"green" + 0.105*"black" + 0.020*"white" + 0.015*"audi" + 0.013*"corolla" + 0.011*"toyota" + 0.008*"allion" + 0.006*"sell"


Topic: 5 Word: 0.345*"benz" + 0.186*"sell" + 0.166*"allion" + 0.094*"toyota" + 0.088*"white" + 0.068*"recondit" +

In [29]:
'''
Text of sample document 20
'''
processed_docs[495]

['want', 'sell', 'recondit', 'toyota', 'blue']

In [30]:
'''
Check which topic our test document belongs to using the LDA Bag of Words model.
'''
document_num = 15
# Our test document is document number 4310

for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.819974422454834	 
Topic: 0.241*"sell" + 0.234*"toyota" + 0.224*"recondit" + 0.211*"corolla" + 0.018*"audi" + 0.018*"green" + 0.011*"black" + 0.011*"carb" + 0.011*"yari" + 0.008*"white"

Score: 0.02000446245074272	 
Topic: 0.291*"sell" + 0.260*"toyota" + 0.206*"yari" + 0.057*"recondit" + 0.030*"benz" + 0.030*"corolla" + 0.027*"green" + 0.024*"audi" + 0.024*"black" + 0.022*"allion"

Score: 0.020004216581583023	 
Topic: 0.290*"black" + 0.286*"toyota" + 0.228*"corolla" + 0.051*"sell" + 0.049*"benz" + 0.028*"white" + 0.026*"yari" + 0.022*"carb" + 0.018*"recondit" + 0.001*"audi"

Score: 0.0200036633759737	 
Topic: 0.230*"white" + 0.191*"sell" + 0.176*"recondit" + 0.150*"toyota" + 0.148*"allion" + 0.058*"audi" + 0.033*"green" + 0.006*"black" + 0.003*"yari" + 0.003*"carb"

Score: 0.020003195852041245	 
Topic: 0.395*"recondit" + 0.262*"green" + 0.161*"benz" + 0.121*"audi" + 0.025*"sell" + 0.011*"toyota" + 0.009*"white" + 0.007*"corolla" + 0.005*"black" + 0.003*"yari"

Score: 0.0200026

In [35]:
'''
Check which topic our test document belongs to using the LDA TF-IDF model.
'''
# Our test document is document number 4310
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5984252691268921	 
Topic: 0.343*"corolla" + 0.217*"toyota" + 0.109*"sell" + 0.109*"recondit" + 0.098*"white" + 0.025*"black" + 0.025*"benz" + 0.025*"audi" + 0.025*"yari" + 0.025*"green"

Score: 0.24154122173786163	 
Topic: 0.351*"recondit" + 0.226*"white" + 0.127*"audi" + 0.116*"toyota" + 0.092*"sell" + 0.018*"benz" + 0.018*"black" + 0.018*"green" + 0.018*"yari" + 0.018*"corolla"

Score: 0.020018702372908592	 
Topic: 0.457*"black" + 0.361*"sell" + 0.023*"white" + 0.023*"benz" + 0.023*"audi" + 0.023*"toyota" + 0.023*"recondit" + 0.023*"corolla" + 0.023*"green" + 0.023*"yari"

Score: 0.02000558376312256	 
Topic: 0.424*"benz" + 0.144*"sell" + 0.126*"black" + 0.100*"recondit" + 0.055*"green" + 0.054*"white" + 0.045*"audi" + 0.017*"yari" + 0.017*"toyota" + 0.017*"corolla"

Score: 0.02000495232641697	 
Topic: 0.340*"yari" + 0.220*"toyota" + 0.176*"sell" + 0.038*"white" + 0.038*"black" + 0.038*"benz" + 0.038*"audi" + 0.038*"recondit" + 0.038*"green" + 0.038*"corolla"

Score: 0.02000

In [31]:
unseen_document = "I want to buy a toyota corolla car in 2008"

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6999733448028564	 Topic: 0.290*"black" + 0.286*"toyota" + 0.228*"corolla" + 0.051*"sell" + 0.049*"benz"
Score: 0.033344052731990814	 Topic: 0.241*"sell" + 0.234*"toyota" + 0.224*"recondit" + 0.211*"corolla" + 0.018*"audi"
Score: 0.03333962708711624	 Topic: 0.291*"sell" + 0.260*"toyota" + 0.206*"yari" + 0.057*"recondit" + 0.030*"benz"
Score: 0.033338434994220734	 Topic: 0.237*"toyota" + 0.220*"yari" + 0.195*"carb" + 0.104*"recondit" + 0.073*"audi"
Score: 0.033336542546749115	 Topic: 0.230*"white" + 0.191*"sell" + 0.176*"recondit" + 0.150*"toyota" + 0.148*"allion"
Score: 0.03333407640457153	 Topic: 0.423*"black" + 0.321*"sell" + 0.115*"audi" + 0.049*"white" + 0.027*"recondit"
Score: 0.03333362936973572	 Topic: 0.395*"recondit" + 0.262*"green" + 0.161*"benz" + 0.121*"audi" + 0.025*"sell"
Score: 0.033333566039800644	 Topic: 0.420*"benz" + 0.169*"black" + 0.165*"sell" + 0.147*"recondit" + 0.028*"white"
Score: 0.03333338722586632	 Topic: 0.530*"white" + 0.182*"audi" + 0.139*"benz" +

In [5]:
from PyDictionary import PyDictionary

def get_synonyms(word):
    
    synonyms = set()
    
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            
            dictionary=PyDictionaryset('sem.csv')

#print(dictionary.printMeanings()) 

#print(dictionary.getMeanings()) 
print (dictionary.getSynonyms())
     

Brandnew has no Synonyms in the API
which has no Synonyms in the API
[{'want': ['thirst', 'yearn', 'begrudge', 'hunger', 'ambition', 'starve', 'take to', 'lust after', 'long', 'fancy', 'feel like', 'care', 'hope', 'crave', 'like', 'wish', 'lust', 'go for', 'itch', 'miss', 'wish well', 'seek', 'spoil', 'hanker', 'desire', 'envy', 'lech after']}, {'buy': ['impulse-buy', 'choose', 'pick out', 'take', 'subscribe to', 'buy out', 'take out', 'subscribe', 'buy up', 'get', 'pick up', 'acquire', 'buy food', 'repurchase', 'pay', 'take over', 'select', 'purchase', 'buy back']}, None, {'recondition': ['condition']}, {'sell': ['deaccession', 'foist off', 'sell off', 'syndicate', 'undercut', 'wholesale', 'realize', 'underprice', 'palm off', 'auction off', 'scalp', 'auctioneer', 'resell', 'move', 'sell short', 'negociate', 'dump', 'sacrifice', 'auction', 'prostitute', 'clear', 'bootleg', 'dispose', 'remainder', 'give', 'fob off', 'change', 'undersell', 'interchange', 'exchange', 'realise', 'retail']}

In [37]:
#describe the unique values in categorical columns
categorical_col = []
for col in data.columns:
    if data[col].dtype == object:
        categorical_col.append(col)
        print(f"{col} => {data[col].unique()}")

Sentence => ['I want to buy brandnew toyota corolla black car which made in 2008'
 'I want to buy brandnew bmw x1 white suv which made in 2007'
 'I want to buy recondition benz c200 green car which made in 2017'
 'I want to buy brandnew audi a1 ash car which made in 2007'
 'I want to sell brandnew bmw x3 black suv which made in 2007'
 'I want to buy recondition bmw x2 red suv which made in 2007'
 'I want to sell recondition toyota allion white car which made in 2007'
 'I want to sell brandnew benz c180 ash car which made in 2014'
 'I want to buy brandnew audi a3 black car which made in 2007'
 'I want to buy brandnew toyota yaris red carb which made in 2018'
 'I want to sell brandnew audi a1 green car which made in 2007'
 'I want to sell recondition benz c180 black car which made in 2012'
 'I want to buy brandnew benz c200 white car which made in 2007'
 'I want to buy recondition audi a3 green car which made in 2008'
 'I want to sell recondition toyota corolla red car which made in 2008

In [39]:
from sklearn.preprocessing import LabelEncoder
#handling categorical variable through label encoding
label_encoder = LabelEncoder()
for col in categorical_col:
    data[col] = label_encoder.fit_transform(data[col])

In [40]:
data.head()

Unnamed: 0,No,Sentence,Buy/Sell,Condition,Brand,Model,Type,Color,Year
0,1,47,0,0,3,6,0,1,2008
1,2,34,0,0,2,7,1,5,2007
2,3,80,0,1,1,5,0,3,2017
3,4,0,0,0,0,1,0,0,2007
4,5,108,1,0,2,9,1,1,2007


In [41]:
#splitting dataset into dependent and independent variables
X = data.drop(['No','Sentence'],axis=1)
y = data.Sentence



In [55]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)#,random_state=42)

model= DecisionTreeClassifier()
model.fit(X_train,y_train)
predictions = model.predict(X_test)

score = accuracy_score(y_test,predictions)
score

0.75

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

key = ['LogisticRegression','KNeighborsClassifier','SVC','DecisionTreeClassifier','RandomForestClassifier']
value = [LogisticRegression(), KNeighborsClassifier(n_neighbors=2, weights = 'uniform'),SVC(kernel = "rbf", random_state=42),DecisionTreeClassifier(random_state=10), RandomForestClassifier(n_estimators=60, random_state=0)]
models = dict(zip(key,value))
models

{'LogisticRegression': LogisticRegression(),
 'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=2),
 'SVC': SVC(random_state=42),
 'DecisionTreeClassifier': DecisionTreeClassifier(random_state=10),
 'RandomForestClassifier': RandomForestClassifier(n_estimators=60, random_state=0)}

In [57]:
predicted =[]
for name, algo in models.items():
    model=algo
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    acc = accuracy_score(y_test, predict)
    predicted.append(acc)
    print(name,acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression 0.52
KNeighborsClassifier 0.73
SVC 0.02
DecisionTreeClassifier 0.75
RandomForestClassifier 0.75
