In [None]:
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


In [None]:
def click_btn(clickable):
    """Click every element in ``clickable``, skipping the ones that fail.

    This is a best-effort helper: an element whose ``click()`` raises is
    ignored and the remaining elements are still clicked.

    Args:
        clickable: Iterable of objects exposing a ``click()`` method.

    Returns:
        None.
    """
    for btn in clickable:
        try:
            btn.click()
        except Exception:
            # Deliberately tolerant of per-element failures, but catch only
            # ``Exception`` so that KeyboardInterrupt / SystemExit still stop
            # a long-running scrape.
            continue

### Step 1:

How to get the details from all these diaries? <br>
I started with a naive approach: trying to load 10K diaries through the website UI, which wasn't successful. So I carefully examined the query that was sent whenever more diaries were loaded. Using Postman (an API request tool), I discovered that the query takes a starting point and a count of how many rows the server should return, with a limit of 2K diaries per request. On top of that, the website could block me at any time.

With those conclusions, I decided to download the HTML pages that the query returned in 2K chunks and scrape them, collecting 10K links to grow diaries into my first database.

In [None]:
# Listing endpoint and paging window: the scraping cell requests `count` rows
# per query, beginning at offset `startpoint`.
prefix_url = "https://growdiaries.com/explore"
startpoint, count = 0, 2000

# Chrome is driven headless through the chromedriver binary at this path
# (relative to the working directory).
op = webdriver.ChromeOptions()
op.add_argument('--headless')
service = Service('./chromedriver.exe')

# Start with empty containers for the link table and the scraped elements.
url_df = pd.DataFrame()
elements = []

In [None]:
# Fetch the explore listing one page at a time and collect every diary link.
driver = webdriver.Chrome(service=service, options=op)
link_rows = []
try:
    # Walk the offsets with a local loop variable instead of advancing the
    # global `startpoint`, so re-running this cell repeats the scrape rather
    # than silently doing nothing.
    for offset in range(startpoint, 10000, count):
        query = f"?action=loadpage&category=all&tags=harvested&start={offset}&count={count}"
        final_url = prefix_url + query
        driver.get(final_url)
        elements = driver.find_elements(By.XPATH, "//a[@class='name']")
        link_rows.extend(
            {"DiaryName": element.text, "Url": element.get_attribute('href')}
            for element in elements
        )
finally:
    # Always release the browser process, even when a page load raises.
    driver.quit()

# Build the table once from the collected records; growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
url_df = pd.DataFrame(link_rows, columns=["DiaryName", "Url"])

In [None]:
# Persist the collected diary names and URLs; Step 2 reloads this file.
url_df.to_csv("Diary Links.csv", index=False)

### Step 2:

Once I have the links for my desired database, I need to start collecting raw data from each diary. As the first step toward real data, my decision was to include the following as features: <br>
diary_name<br> strain<br> strains_company<br> light_watt<br> nutrients<br> watering<br> soil<br> germination<br> grow_techniques<br>grow_room_size<br> weeks_to_harvest<br> num_of_plants<br> likes<br>comments<br> views<br> bud_dry_weight<br> bud_wet_weight

Using likes, comments, and views may increase/decrease the reliability of the data point.<br>
The data is scraped using XPath. It supports conditional filtering, and I found it the perfect way to reach every selector I needed.

Highly recommend: https://devhints.io/xpath for understanding and creating XPath queries

<b>Note: To make it easier, I saved the data to a separate file after every 1K diaries.</b>


In [None]:
# Reload the diary links saved at the end of Step 1 (expected to hold the
# DiaryName and Url columns written there).
url_df = pd.read_csv("Diary Links.csv")

In [None]:
def _blank_record(diary_name):
    """Return a new record holding every output column, defaulted to ""."""
    return {
        "diary_name": diary_name,
        "strain": "",
        "strains_company": "",
        "light_watt": "",
        "nutrients": "",
        "watering": "",
        "soil": "",
        "germination": "",
        "grow_techniques": "",
        "weeks_to_harvest": "",
        "num_of_plants": "",
        "likes": "",
        "comments": "",
        "views": "",
        "bud_dry_weight": "",
        "bud_wet_weight": "",
    }


def _stat_text(driver, icon_class):
    """Return the text of the first statistic box containing ``icon_class``, or "" if none."""
    boxes = driver.find_elements(
        By.XPATH, f"//div[@class='report_statistic']//div[.//@class='{icon_class}']"
    )
    # find_elements yields an empty list instead of raising, so a diary
    # without this box does not abort the whole chunk.
    return boxes[0].text if boxes else ""


def _fill_setup_fields(record, report_items):
    """Copy strain, equipment and technique details from the report items into ``record``."""
    for i, item in enumerate(report_items):
        try:
            text = item.text
            texts = text.split("\n")
            if i == 0:
                # The first item carries the strain and, on its second line, the company.
                record["strain"] = texts[0]
                record["strains_company"] = texts[1]
            elif "LED" in text:
                record["light_watt"] += text.replace("\n", " ") + ","
            elif "Nutrients" in text:
                record["nutrients"] += texts[0] + ","
            elif "Watering" in text:
                record["watering"] = texts[0]
            elif "Soil" in text or "Grow" in text:
                record["soil"] += text.replace("\n", " ") + ","
            elif "Germination" in text:
                record["germination"] = texts[0]
            elif text.find("Week") > 0 and len(texts[0]) > 1:
                record["grow_techniques"] += texts[0] + ","
        except Exception:
            # Best-effort: an item that cannot be read or parsed leaves its
            # field at the current value and parsing moves on to the next item.
            continue


def _fill_harvest_fields(record, outcome):
    """Copy harvest results from the parameter items into ``record``."""
    for i, details in enumerate(outcome):
        try:
            text = details.text
            texts = text.split("\n")
            if i == 0:
                record["weeks_to_harvest"] = texts[1]
            elif "BUD WET WEIGHT" in text:
                record["bud_wet_weight"] = texts[1]
            elif "BUD DRY WEIGHT" in text:
                record["bud_dry_weight"] = texts[1]
            elif "NUMBER OF PLANTS HARVESTED" in text:
                record["num_of_plants"] = texts[1].split(" ")[0]
            elif "TOTAL LIGHT POWER" in text:
                # Overwrites any per-lamp text gathered from the report items.
                record["light_watt"] = texts[1]
        except Exception:
            # Best-effort, same as for the report items.
            continue


def _scrape_diary(driver, url, diary_name):
    """Load one diary page and return its record as a dict of strings."""
    driver.get(url)
    click_btn(driver.find_elements(By.XPATH, "//div[@class='btn_less']"))
    report_items = driver.find_elements(By.XPATH, "//div[contains(@class,'report_items')]//*[@class='info']")
    outcome = driver.find_elements(By.XPATH, "//div[contains(@class,'parameters_item')]")

    record = _blank_record(diary_name)
    record["likes"] = _stat_text(driver, "icon-leaf-like")
    record["comments"] = _stat_text(driver, "icon comment")
    record["views"] = _stat_text(driver, "icon eye")
    _fill_setup_fields(record, report_items)
    _fill_harvest_fields(record, outcome)
    return record


# Scrape the diaries in chunks of `const` links and write one CSV per chunk.
# Start at 0 so that every file the merge cell reads is produced; a chunk whose
# file already exists is skipped, which lets an interrupted run be resumed
# (delete a chunk file to scrape that chunk again).
count = 0
const = 1000
driver = webdriver.Chrome(service=service, options=op)
try:
    # The window size does not depend on the page, so set it once up front.
    driver.maximize_window()
    while count < 10000:
        chunk_path = f"Data_{count}_{count+const}.csv"
        if os.path.exists(chunk_path):
            count += const
            continue
        chunk = url_df.iloc[count:count+const]
        rows = [
            _scrape_diary(driver, url, diary_name)
            for diary_name, url in zip(chunk["DiaryName"], chunk["Url"])
        ]
        # Build the frame once per chunk; the explicit column list keeps the
        # header present even when the chunk holds no links.
        det_df = pd.DataFrame(rows, columns=list(_blank_record("")))
        det_df.to_csv(chunk_path, index=False)
        count += const
finally:
    # Always release the browser process, even when a page load raises.
    driver.quit()

Combine all the files into one CSV file:

In [None]:
# Merge the per-chunk files written in Step 2 (Data_<start>_<end>.csv) into a
# single raw dataset.
count = 0
const = 1000
chunk_frames = []
while count < 10000:
    chunk_frames.append(pd.read_csv(f"Data_{count}_{count+const}.csv"))
    count += const

# Concatenate once instead of re-copying the accumulated frame on every
# iteration; ignore_index gives full_df unique row labels instead of each
# chunk's own labels repeated.
full_df = pd.concat(chunk_frames, ignore_index=True)
full_df.to_csv("GrowDiariesRowData.csv", index=False)