In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd #read files
from bs4 import BeautifulSoup
import requests

In [2]:
df = pd.read_csv('/Users/Tico/Documents/Masters/LaMona/ModelTraining/combinedElements.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,html,name,price,category
0,0,"<li class=""mr-3 float-left"">\n<a class=""text-b...",,,li
1,1,"<li class=""mr-3 float-left"">\n<a class=""text-b...",,,li
2,2,"<li class=""mr-3 float-left"">\n<a class=""text-b...",,,li
3,3,"<li class=""breadcrumb-item"">\n<a class=""btn bt...",,,li
4,4,"<li class=""breadcrumb-item"">\n<a class=""btn bt...",,,li
...,...,...,...,...,...
5226,5226,"<li class=""grid__item"">\n<link href=""//thecabi...",2 Drawer TV Stand M1,£239.59 GBP,Product
5227,5227,"<li class=""grid__item"">\n<link href=""//thecabi...",Mid Quad Bookcase M1,£0.00 GBP,Product
5228,5228,"<li class=""grid__item"">\n<link href=""//thecabi...",Low Quad Bookcase M1,£181.76 GBP,Product
5229,5229,"<li class=""grid__item"">\n<link href=""//thecabi...",Low 6 Drawer Chest M2,£364.62 GBP,Product


In [5]:
X = df['html']
y = df['category']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=22)

In [11]:
#tdidf turns textual data into numerical format
pipeMNB = Pipeline([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())])
pipeCNB = Pipeline([('tfidf',TfidfVectorizer()),('clf',ComplementNB())])
pipeSVC = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])



In [13]:
#training mnb
pipeMNB.fit(X_train, y_train)
predictMNB = pipeMNB.predict(X_test)
print(f"MNB score is: "+str(accuracy_score(y_test,predictMNB)))

MNB score is: 0.9121298949379179


In [14]:
#training cnb
pipeCNB.fit(X_train, y_train)
predictCNB = pipeCNB.predict(X_test)
print(f"CNB score is: "+str(accuracy_score(y_test,predictCNB)))

CNB score is: 0.9140401146131805


In [15]:
#training svc
pipeSVC.fit(X_train, y_train)
predictSVC = pipeSVC.predict(X_test)
print(f"SVC score is: "+str(accuracy_score(y_test,predictSVC)))



SVC score is: 0.9598853868194842


In [19]:
#check accuracy of models
print(classification_report(y_test,predictSVC))

              precision    recall  f1-score   support

     Product       1.00      1.00      1.00       398
         div       0.96      0.92      0.94       354
          li       0.91      0.96      0.93       295

    accuracy                           0.96      1047
   macro avg       0.96      0.96      0.96      1047
weighted avg       0.96      0.96      0.96      1047



In [21]:
test = """<li class="grid__item">
<link href="//thecabinetshop.co.uk/cdn/shop/t/18/assets/component-rating.css?v=24573085263941240431657786494" media="all" rel="stylesheet" type="text/css">


</div>
</link></li>
"""

In [23]:
pipeSVC.predict([test])


array(['li'], dtype=object)

In [25]:
pipeMNB.predict([test])

array(['Product'], dtype='<U7')

In [27]:
pipeCNB.predict([test])

array(['Product'], dtype='<U7')

In [29]:
url = "https://www.tekshop.co.uk/laptops/home-laptops"
response = requests.get(url)
soup = BeautifulSoup(response.content,'html.parser')
products = []
productGrid = soup.find("ol",class_="products list items product-items") 
prs = productGrid.contents
test = prs[0].next_sibling
len(prs)

25

In [31]:
products=[]
for child in productGrid:
    if pipeMNB.predict([str(child)]) == ['Product']:
        products.append(child)

In [33]:
len(products)

25

In [35]:
products=[]
for child in productGrid:
    products.append(pipeCNB.predict([str(child)]))
len(products)

25

In [37]:
counter=0
identifiedProducts = []
for child in prs:
    if pipeSVC.predict([str(child)]) == ['Product']:
        identifiedProducts.append(child)
print(len(identifiedProducts))


12


In [149]:
identifiedProducts

[<div class="associated-product associated-product--default js-quickbuy-trigger" data-pid="78648" data-plp="true">
 <div class="associated-product__tile rounded-0">
 <span class="roundel product-overlay-bottom-right">
 <img alt="Stretch Fabric" src="https://cdn.yoursclothing.com/Images/ProductImages/c1dd9526-0d2a-45_YC_stretch-fabric-roundel_50x37.png"/>
 </span>
 <span class="roundel product-overlay-top">
 <img alt="2 Pack Sets" src="https://cdn.yoursclothing.com/Images/ProductImages/e71ff09e-0da4-45_THE-SET-roundels_50x50_regular_01.png"/>
 </span>
 <div class="associated-product__quickbuy" data-quickbuy-url="/product/api/getproductstocksforquickbuy?productid=78648&amp;productcode=144661">
 <div class="associated-product-quickbuy-content js-quickbuy-content" data-pid="78648">
 </div>
 </div>
 <div class="quickview-bar quickview-bar-mobile justify-content-end d-flex d-lg-none">
 <div alt="Quick Buy" class="quickview-bar__element p-0 d-flex" data-size="quickview">
 <div class="quickvie