In [120]:
import os

import pandas as pd #read files
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [122]:
# Location of the labelled training CSV. The original absolute path stays as the
# default so existing runs are unchanged; set COMBINED_ELEMENTS_CSV to point the
# notebook at a copy of the file on another machine.
DATA_CSV = os.environ.get(
    'COMBINED_ELEMENTS_CSV',
    '/Users/Tico/Documents/Masters/LaMona/ModelTraining/combinedElements.csv',
)
df = pd.read_csv(DATA_CSV)

In [124]:
# Show the loaded frame as the cell's output to sanity-check its columns and labels.
df

Unnamed: 0.1,Unnamed: 0,html,name,price,category
0,0,"<div class=""shopify-section"" id=""shopify-secti...",,,Not Product
1,1,"<div data-section-type=""cart-drawer""><div clas...",,,Not Product
2,2,"<div class=""cart-summary cart-summary--empty c...",,,Not Product
3,3,"<div aria-live=""polite"" class=""cart-summary__i...",,,Not Product
4,4,"<div class=""cart-summary__header cart-summary_...",,,Not Product
...,...,...,...,...,...
2713,2713,"<li class=""grid__item"">\n<link href=""//thecabi...",2 Drawer TV Stand M1,£239.59 GBP,Product
2714,2714,"<li class=""grid__item"">\n<link href=""//thecabi...",Mid Quad Bookcase M1,£0.00 GBP,Product
2715,2715,"<li class=""grid__item"">\n<link href=""//thecabi...",Low Quad Bookcase M1,£181.76 GBP,Product
2716,2716,"<li class=""grid__item"">\n<link href=""//thecabi...",Low 6 Drawer Chest M2,£364.62 GBP,Product


In [126]:
# Features are the raw HTML snippets; the target is the 'category' label column.
X, y = df['html'], df['category']

In [128]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

In [130]:
def _tfidf_pipeline(classifier):
    """Return a two-step pipeline: TF-IDF vectorisation ('tfidf') then `classifier` ('clf')."""
    return Pipeline([('tfidf', TfidfVectorizer()), ('clf', classifier)])


# TF-IDF turns the textual data into a numerical format; each pipeline pairs
# its own vectoriser with a different classifier.
pipeMNB = _tfidf_pipeline(MultinomialNB())
pipeCNB = _tfidf_pipeline(ComplementNB())
pipeSVC = _tfidf_pipeline(LinearSVC())

In [132]:
# Fit the MultinomialNB pipeline on the training split and report held-out accuracy.
pipeMNB.fit(X_train, y_train)
predictMNB = pipeMNB.predict(X_test)
mnb_accuracy = accuracy_score(y_test, predictMNB)
print("MNB score is: " + str(mnb_accuracy))

MNB score is: 0.9852941176470589


In [134]:
# Fit the ComplementNB pipeline on the training split and report held-out accuracy.
pipeCNB.fit(X_train, y_train)
predictCNB = pipeCNB.predict(X_test)
cnb_accuracy = accuracy_score(y_test, predictCNB)
print("CNB score is: " + str(cnb_accuracy))

CNB score is: 0.9908088235294118


In [136]:
# Fit the LinearSVC pipeline on the training split and report held-out accuracy.
# NOTE(review): a perfect score on a random row split may mean near-identical
# markup landed in both train and test — confirm with a per-site holdout.
pipeSVC.fit(X_train, y_train)
predictSVC = pipeSVC.predict(X_test)
svc_accuracy = accuracy_score(y_test, predictSVC)
print("SVC score is: " + str(svc_accuracy))



SVC score is: 1.0


In [138]:
# Per-class precision / recall / F1 for the LinearSVC predictions on the test split.
svc_report = classification_report(y_test, predictSVC)
print(svc_report)

              precision    recall  f1-score   support

 Not Product       1.00      1.00      1.00       235
     Product       1.00      1.00      1.00       309

    accuracy                           1.00       544
   macro avg       1.00      1.00      1.00       544
weighted avg       1.00      1.00      1.00       544



In [140]:
# Hand-copied product card (same `grid__item` markup and host as the 'Product'
# rows in the training frame) used to smoke-test the fitted pipelines.
# The price and non-breaking-space characters had been pasted as mojibake
# (UTF-8 bytes decoded as Mac Roman); they are restored here so the sample
# matches the pound-sign text seen in the training data. The non-breaking
# spaces are written as \xa0 escapes to keep them visible in the source.
test = """<li class="grid__item">
<link href="//thecabinetshop.co.uk/cdn/shop/t/18/assets/component-rating.css?v=24573085263941240431657786494" media="all" rel="stylesheet" type="text/css">
<div class="card-wrapper underline-links-hover">
<div class="card card--standard card--media" style="--ratio-percent: 100.0%;">
<div class="card__inner color-background-2 gradient ratio" style="--ratio-percent: 100.0%;"><div class="card__media">
<div class="media media--transparent media--hover-effect">
<img alt="Made to Measure Door" class="motion-reduce" height="1600" sizes="(min-width: 1600px) 367px, (min-width: 990px) calc((100vw - 130px) / 4), (min-width: 750px) calc((100vw - 120px) / 3), calc((100vw - 35px) / 2)" src="//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=533" srcset="//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=165 165w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=360 360w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=533 533w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=720 720w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=940 940w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469&amp;width=1066 1066w,//thecabinetshop.co.uk/cdn/shop/products/SHAKERDOORS1.png?v=1659956469 1600w" width="1600"/>
</div>
</div><div class="card__content">
<div class="card__information">
<h3 class="card__heading">
<a aria-labelledby="StandardCardNoMediaLink-template--14560278282294__product-grid-4784571809846 NoMediaStandardBadge-template--14560278282294__product-grid-4784571809846" class="full-unstyled-link" href="/products/bespoke-doors" id="StandardCardNoMediaLink-template--14560278282294__product-grid-4784571809846">
                Made to Measure Door
              </a>
</h3>
</div>
<div class="card__badge bottom left"></div>
</div>
</div>
<div class="card__content">
<div class="card__information">
<h3 class="card__heading h5" id="title-template--14560278282294__product-grid-4784571809846">
<a aria-labelledby="CardLink-template--14560278282294__product-grid-4784571809846 Badge-template--14560278282294__product-grid-4784571809846" class="full-unstyled-link" href="/products/bespoke-doors" id="CardLink-template--14560278282294__product-grid-4784571809846">
              Made to Measure Door
            </a>
</h3>
<div class="card-information"><span class="caption-large light"></span>
<div class="price">
<div class="price__container"><div class="price__regular">
<span class="visually-hidden visually-hidden--inline">Regular price</span>
<span class="price-item price-item--regular">
        From £11.52 GBP
      </span>
</div>
<div class="price__sale">
<span class="visually-hidden visually-hidden--inline">Regular price</span>
<span>
<s class="price-item price-item--regular">
</s>
</span><span class="visually-hidden visually-hidden--inline">Sale price</span>
<span class="price-item price-item--sale price-item--last">
        From £11.52 GBP
      </span>
</div>
<small class="unit-price caption hidden">
<span class="visually-hidden">Unit price</span>
<span class="price-item price-item--last">
<span></span>
<span aria-hidden="true">/</span>
<span class="visually-hidden">\xa0per\xa0</span>
<span>
</span>
</span>
</small>
</div></div>
</div>
</div><div class="card__badge bottom left"></div>
</div>
</div>
</div>
</link></li>
"""

In [142]:
# Smoke-test the LinearSVC pipeline on the hand-copied sample card.
svc_sample_label = pipeSVC.predict([test])
svc_sample_label


array(['Product'], dtype=object)

In [144]:
# Smoke-test the MultinomialNB pipeline on the hand-copied sample card.
mnb_sample_label = pipeMNB.predict([test])
mnb_sample_label

array(['Product'], dtype='<U11')

In [146]:
# Smoke-test the ComplementNB pipeline on the hand-copied sample card.
cnb_sample_label = pipeCNB.predict([test])
cnb_sample_label

array(['Product'], dtype='<U11')

In [148]:
# Fetch one collection page and gather its <div> elements as classifier candidates.
url = "https://www.yoursclothing.co.uk/collections/crops-shorts"
# The timeout stops the cell hanging on an unresponsive server, and
# raise_for_status() keeps an HTTP error page from being parsed and classified.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
products = []

# Strip non-content elements first. Each tag name gets its own fresh search so
# that tags destroyed by an earlier pass are never visited again.
for tag_name in ("script", "noscript", "style", "svg", "meta"):
    for tag in soup(tag_name):
        tag.decompose()

# Collect the candidates only AFTER cleaning: decompose() destroys a tag and
# everything inside it, so a list built beforehand could still hold references
# to <div>s nested in removed elements, which no longer serialise meaningfully.
divs = soup.find_all('div')

In [150]:
# Keep the <div>s that the MultinomialNB pipeline labels 'Product'.
# All candidates are classified in one batched predict() call instead of one
# call per element; each document is vectorised and scored independently, so
# the labels are the same while the per-call overhead is paid only once.
div_markup = [str(child) for child in divs]
# predict() rejects an empty batch, so fall back to "no labels" when no divs were found.
div_labels = pipeMNB.predict(div_markup) if div_markup else []
products = [child for child, label in zip(divs, div_labels) if label == 'Product']

In [151]:
# Number of <div> candidates the MultinomialNB pipeline labelled 'Product' in the preceding cell.
len(products)

1620

In [156]:
# Keep the <div>s that the ComplementNB pipeline labels 'Product'.
# All candidates are classified in one batched predict() call instead of one
# call per element; each document is vectorised and scored independently, so
# the labels are the same while the per-call overhead is paid only once.
div_markup = [str(child) for child in divs]
# predict() rejects an empty batch, so fall back to "no labels" when no divs were found.
div_labels = pipeCNB.predict(div_markup) if div_markup else []
products = [child for child, label in zip(divs, div_labels) if label == 'Product']
len(products)

4806

In [158]:
# Keep the <div>s that the LinearSVC pipeline labels 'Product'.
# All candidates are classified in one batched predict() call instead of one
# call per element; each document is vectorised and scored independently, so
# the labels are the same while the per-call overhead is paid only once.
div_markup = [str(child) for child in divs]
# predict() rejects an empty batch, so fall back to "no labels" when no divs were found.
div_labels = pipeSVC.predict(div_markup) if div_markup else []
products = [child for child, label in zip(divs, div_labels) if label == 'Product']
len(products)

5946