# General Web Scraping

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

html_text = requests.get('https://community.infineon.com/?profile.language=en').text
soup = BeautifulSoup(html_text, 'lxml')

product_types = soup.find_all('a', class_='board-link')

data = []
        
for index, product_type in enumerate(product_types):
    # Find the title, body and time elements within the specific product type
    title = product_type.find_next('div', class_='subject').text.strip()
    body = product_type.find_next('div', class_='full-body body').text.strip()
    time = product_type.find_next('span', class_='time').text.strip().replace(' ', '')
    
    # link
    raw_link = product_type["href"]
    domain_link = 'https://community.infineon.com'
    link = domain_link + raw_link
    
    data.append({
        'product_types': product_type.text.strip(),
        'Title': title,
        'Body': body,
        'Time': time,
        'forum link': link
    })

df = pd.DataFrame(data)
df

Unnamed: 0,product_types,Title,Body,Time,forum link
0,IGBT,驱动芯片,"Hello , i need to know which tool can flash re...","May5,2024",https://community.infineon.com/t5/IGBT/bd-p/IGBT
1,TRAVEO™ T2G,S6J32HEL,"Hello , i need to know which tool can flash re...","May5,2024",https://community.infineon.com/t5/TRAVEO-T2G/b...
2,AURIX™,Performance comparison between TC399XX and TRA...,"Hello,\nDo you have any benchmark that compare...","May4,2024",https://community.infineon.com/t5/AURIX/bd-p/A...
3,Gate Driver ICs,Controlling AC load with two mosfets and 1ED31...,"Hi,I am investigating the possibility of contr...","May4,2024",https://community.infineon.com/t5/Gate-Driver-...
4,Intelligent Power Modules (IPM),Three Phase Sine Wave Power Supply using CIPOS...,"Hi all,I am an Embedded Engineer and I am look...","May4,2024",https://community.infineon.com/t5/Intelligent-...
5,Battery Management ICs,hello nice to see you all today eggsoil\n\n\n ...,hello nice to see you all today eggsoil\nhttps...,"May4,2024",https://community.infineon.com/t5/Battery-Mana...
6,PSoC™ 6,Encountering Issues with Programming and Debug...,Translated Content:\nBoard Model: Psoc6-evalua...,"May4,2024",https://community.infineon.com/t5/PSoC-6/bd-p/...
7,PSoC™ 6,使用PSoc 62系列板卡时遇到了无法烧录和调试的问题，似乎是flash的问题\n\n\n ...,我的板卡型号是Psoc6-evaluationkit-062S2，在我按下板卡上的MODE按...,"May4,2024",https://community.infineon.com/t5/PSoC-6/bd-p/...
8,MOTIX™ MCU,Question of Angel PLL observer (MOTIX FOC),"Hi all, I met some questions when I worked wit...","May3,2024",https://community.infineon.com/t5/MOTIX-MCU/bd...
9,USB superspeed peripherals,我有一些问题请教你,"Hello,I have a CYUSBKIT-003 board and I am stu...","May3,2024",https://community.infineon.com/t5/USB-superspe...


In [3]:
options = soup.find_all('li', class_='options')

print(f'Number of Types: {len(options)} \n')

data2 = []

for option in options:
    temp = option.text.strip()
    data2.append({'product_types': temp})
    
df_product = pd.DataFrame(data2)
df_product

Number of Types: 72 



Unnamed: 0,product_types
0,PSoC™ 6
1,Wi-Fi Combo
2,Nor Flash
3,USB low-full-high speed peripherals
4,FIRST Robotics Competition (FRC)
...,...
67,Power Management ICs
68,MOTIX™ MCU
69,Legacy microcontrollers
70,Battery Management ICs


# Extend our web scraping according to the product
For example, IGBT...

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the base URL for the first page
base_url = 'https://community.infineon.com/t5/IGBT/bd-p/IGBT'

data3 = []

# Define the number of pages you want to scrape
num_pages = 3  # Modify this according to your requirements

for page in range(num_pages):
    if page == 0:
        url = base_url
    else:
        # Construct the URL for the current page
        url = f'{base_url}/page/{page+1}'
    
    # Request HTML content
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'lxml')
    
    # Find all product types on the current page
    product_types = soup.find_all('a', class_='board-link')
    
    for product_type in product_types:
        # Find the title, body, and time elements within the specific product type
        title = product_type.find_next('div', class_='subject').text.strip()
        body = product_type.find_next('div', class_='full-body body').text.strip()
        time = product_type.find_next('span', class_='time').text.strip().replace(' ', '')
        
        data3.append({
            'product_types': product_type.text.strip(),
            'Title': title,
            'Body': body,
            'Time': time
        })

# Create DataFrame
df_IGBT = pd.DataFrame(data3)
df_IGBT


Unnamed: 0,product_types,Title,Body,Time
0,IGBT,驱动芯片,"Hello, \nI am learning about all the infineon'...","May5,2024"
1,IGBT,Differents housing for igbt modules,"Hello, \nI am learning about all the infineon'...","Apr30,2024"
2,IGBT,Question about Infineon’s Simulation Models,Hello! I am trying to use the PLECS thermal mo...,"May2,2024"
3,IGBT,Eval-M1-CM610N3,Regarding the principle provided in the thread...,"Apr30,2024"
4,IGBT,"IKCM30F60GD , what is the max output current @...","IFCM20U65GD IPM module has normal VFO pin, nor...","Apr20,2024"
5,IGBT,IFCM20U65GD IPM module inexplicably shuts down...,"IFCM20U65GD IPM module has normal VFO pin, nor...","Apr30,2024"
6,IGBT,Automotive Grade IGBT Module Reliability,"Hello, I didn't find any quality report about ...","Apr26,2024"
7,IGBT,IGBT model\n\n\n Solved,您好，各位工程师，我们在测试3KW功率板的时候出现炸机问题，我们猜测是过电压或过电流导致，如...,"Apr27,2024"
8,IGBT,3KW炸机问题,您好，各位工程师，我们在测试3KW功率板的时候出现炸机问题，我们猜测是过电压或过电流导致，如...,"Apr24,2024"
9,IGBT,Eval-M1-CM610N3 评估板电源\n\n\n Solved,"hi\n工程师\nU2 D 和S直接之间的线宽我用的120mil, 四个S我也是用120mi...","Apr17,2024"


# Explore Input Data

According to different product, we have different input data. To simplify the process, we could use "Instant Data Scraper" chrome extension to do data scraping from the community website. (https://chromewebstore.google.com/detail/instant-data-scraper/ofaokhiedipichpaobibbnahnkdoiiah)

In [5]:
df1 = pd.read_csv('data/PSoC6.csv')
df2 = pd.read_csv('data/Wi-Fi Combo.csv')
df3 = pd.read_csv('data/Nor Flash.csv')
df4 = pd.read_csv('data/USB low-full-high speed peripherals.csv')
df5 = pd.read_csv('data/MOSFET.csv')

In [6]:
df1.loc[:,['board-link','subject-link','truncated-body']].head() #PSoC6

Unnamed: 0,board-link,subject-link,truncated-body
0,PSoC™ 6,使用PSoc 62系列板卡时遇到了无法烧录和调试的问题，似乎是flash的问题,我的板卡型号是Psoc6-evaluationkit-062S2，在我按下板卡上的MODE按...
1,PSoC™ 6,PSoC6 CY8CPROTO-063,"Hello, good afternoon, I am going to work on a..."
2,PSoC™ 6,Encountering Issues with Programming and Debug...,Translated Content:\nBoard Model: Psoc6-evalua...
3,PSoC™ 6,SCB Managing Slave Select Peripheral Lines,- Device Configurator 3.10.0.6117\n- 7e6892ee1...
4,PSoC™ 6,How to properly use dma and spi together with ...,We are having a problem communicating with two...


In [7]:
df2.loc[:,['board-link','subject-link','truncated-body']].head() #Wi-Fi Combo

Unnamed: 0,board-link,subject-link,truncated-body
0,Wi-Fi Combo,Malloc is thread safe ?,"Hello,\nI'm using WICED STUDIO 6.6. and CYW943..."
1,Wi-Fi Combo,CYW943907 - Amazon FreeRTOS OTA support,"Hi, Per Amazon doc, CYW943907AEVAL1F is not su..."
2,Wi-Fi Combo,how to enable or disable the save restore feat...,when I read the register of CHIPCOMMON_SR_CONT...
3,Wi-Fi Combo,HTTPS speed problem,Good afternoon. I am facing HTTPS speed issue ...
4,Wi-Fi Combo,Disable dns server.,"Hi, I have a Laird Sterling EWB. What I try to..."


In [8]:
df3.loc[:,['board-link','subject-link','truncated-body']].head() #Nor Flash

Unnamed: 0,board-link,subject-link,truncated-body
0,Nor Flash,how to use or unprotect the highest address se...,"Hi,\nIam using a S70GL02GT11FHA010 NOR Flash...."
1,Nor Flash,S25FL256LAGNFM010 material specifications,I would like to know what the potting compound...
2,Nor Flash,S29AL016J70TFN020 Thermal Data?,"Hello,\nWe would like to use this memory: S29A..."
3,Nor Flash,Where is the file ---- slld_fll_256l.h,A member from Infineon posted me a zip file re...
4,Nor Flash,S29JL032J60TFI010 of Product Status,How is the production of S29JL032J60TFI010 pro...


In [9]:
df4.loc[:,['board-link','subject-link','truncated-body']].head() #USB low-full-high speed peripherals

Unnamed: 0,board-link,subject-link,truncated-body
0,USB low-full-high speed peripherals,"CY7C65215, configuration image CRC","From a linux system, I want to read back the c..."
1,USB low-full-high speed peripherals,Arming of bulk/interrupt out endpoint in fx2lp,"Hello,I am stuck at a point,in my application ..."
2,USB low-full-high speed peripherals,Pinout error on documentation of CY7C6514D,"Hi Infineon Community,\nI am reviewing the sec..."
3,USB low-full-high speed peripherals,CY7C65210 TID Number,"Hi,\nWhat is CY7C65210 TID number?\nBR\nEason"
4,USB low-full-high speed peripherals,Setting up isochronous out endpoint in fx2lp,"Hello,Can someone point me to examples codes f..."


In [10]:
df5.loc[:,['subject-link','truncated-body']].head() # MOSFET # No feature 'board-link'

Unnamed: 0,subject-link,truncated-body
0,Double Pulse Test,"In the datasheet, a dual pulse test circuit wa..."
1,TCC data,"Hello,\nI am reaching out to request TCC data ..."
2,Cu Clip in Automotive Mosfet,Hi\n \nDoes infineon has Cu Clip technology in...
3,SiC IMBG120R350M1HXTMA1 sense pin unconnected,"Hello together,\nI'm planning to use a SiC IMB..."
4,EVAL Inverter,I'm playing around with the EVAL_3K3W_TP_PFC_S...
