In [1]:
import requests
from bs4 import BeautifulSoup
import os
import csv


In [39]:
# Function to scrape website and extract tags and content
def scrape_website(url, csv_filename='dataset.csv', timeout=30):
    """Append the paragraph text of one article page to a CSV file.

    Fetches ``url``, looks up the first ``<div class="text">`` element and
    writes the text of each of its direct ``<p>`` children as one row of a
    single-column (``tag_content``) CSV. Each paragraph is stripped and its
    non-breaking spaces are replaced by regular spaces. Rows are appended, so
    repeated calls accumulate into the same file; the header is written only
    when the file is missing or empty.

    Args:
        url: Address of the page to scrape.
        csv_filename: CSV file the rows are appended to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A message is printed when the page cannot be fetched or does
        not contain the expected container.
    """
    fieldnames = ['tag_content']

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable page should not abort a loop over many URLs.
        print(f"Failed to retrieve webpage: {error}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve webpage")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.find returns None when nothing matches; check before touching the
    # CSV so a page with a different layout neither crashes nor creates a
    # header-only file.
    div_tag = soup.find('div', class_='text')
    if div_tag is None:
        print(f"No <div class='text'> container found at {url}")
        return None

    # A zero-byte file (e.g. left over from an interrupted run) still needs
    # its header row.
    needs_header = (
        not os.path.isfile(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        # Only direct <p> children are kept; nested markup is ignored.
        for tag in div_tag.find_all('p', recursive=False):
            tag_content = tag.text.strip().replace(u'\xa0', u' ')
            writer.writerow({'tag_content': tag_content})

    return None

### Getting the URLs of the individual pages

In [40]:
import requests
from bs4 import BeautifulSoup

# URL of the index page that links to the individual articles
url = "https://www.geeksforgeeks.org/computer-network-tutorials/"

# Defined up front so that later cells never hit a NameError (or silently
# reuse links left over from an earlier run) when the request fails.
href_values = []

# Without a timeout a stalled connection would hang the notebook indefinitely.
response = requests.get(url, timeout=30)

# Check if request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the non-empty href of every <a> found inside an <li>
    li_tags = soup.find_all('li')
    href_values = [
        anchor_tag.get('href')
        for li_tag in li_tags
        for anchor_tag in li_tag.find_all('a')
        if anchor_tag.get('href')
    ]

    # Manually selecting required tags.
    # NOTE(review): these positions are hand-picked and presumably only valid
    # for the page layout at the time of scraping -- re-check before re-running.
    href_values = href_values[1909:2080]

    # Print the href values
    print("List of href values:")
    for href_value in href_values:
        print(href_value)
else:
    print("Failed to fetch the webpage.")

List of href values:
https://www.geeksforgeeks.org/basics-computer-networking/
https://www.geeksforgeeks.org/the-internet-and-the-web/
https://www.geeksforgeeks.org/internet-and-web-programming-behind-the-scenes/
https://www.geeksforgeeks.org/the-new-internet-internet-of-everything/
https://www.geeksforgeeks.org/unknown-facts-of-networking/
https://www.geeksforgeeks.org/computer-network-network-goals/
https://www.geeksforgeeks.org/line-configuration-computer-networks/
https://www.geeksforgeeks.org/transmission-modes-computer-networks/
https://www.geeksforgeeks.org/types-transmission-media/
https://www.geeksforgeeks.org/computer-network-difference-unicast-broadcast-multicast/
https://www.geeksforgeeks.org/introduction-to-basic-networking-terminology/
https://www.geeksforgeeks.org/network-topologies-computer-networks/
https://www.geeksforgeeks.org/computer-network-types-area-networks-lan-man-wan/
https://www.geeksforgeeks.org/telecom-networks/
https://www.geeksforgeeks.org/computer-netwo

In [41]:
# href_values.index("https://www.geeksforgeeks.org/type-c-port-in-computer-network/")

In [42]:
# Remove duplicate links while keeping their first-seen order. dict keys are
# unique and insertion-ordered, whereas the iteration order of a set of
# strings can differ between kernel sessions, which made the scraping order
# (and therefore the row order of the CSV) non-reproducible.
href_values = list(dict.fromkeys(href_values))
len(href_values)

70

In [43]:
from tqdm import tqdm

# Scrape every collected page, reporting progress as a bar
progress_bar = tqdm(href_values, desc="Scraping Websites", unit=" website")
for url in progress_bar:
    scrape_website(url)

Scraping Websites: 100%|██████████| 70/70 [00:31<00:00,  2.21 website/s]


### Data Cleaning

In [44]:
import csv

def clean_csv(input_file, output_file, min_words=10, keep_header=True):
    """Copy a CSV file, dropping rows whose text is too short.

    A data row is kept when its first non-blank cell contains at least
    ``min_words`` whitespace-separated words; rows made only of blank cells
    are dropped. Kept rows are written unchanged.

    The header row is copied through without being filtered. It is a single
    word, so applying the word-count filter to it would remove it and leave
    the output without column names.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the CSV file to write; overwritten if present.
        min_words: Minimum number of words a row needs to be kept.
        keep_header: Treat the first row as a header and always copy it.
            Pass False for files that have no header row.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)

        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)

            if keep_header:
                # next() with a default copes with a completely empty input.
                header = next(reader, None)
                if header:
                    writer.writerow(header)

            for row in reader:
                non_blank_cells = [cell.strip() for cell in row if cell.strip()]
                if non_blank_cells and len(non_blank_cells[0].split()) >= min_words:
                    writer.writerow(row)

# Filter the scraped GeeksforGeeks rows into a separate cleaned file
input_file, output_file = 'dataset.csv', 'dataset_cleaned.csv'
clean_csv(input_file, output_file)

## Javatpoint

In [17]:
# Fetch the tutorial landing page; the timeout keeps a stalled connection
# from hanging the notebook.
response = requests.get("https://www.javatpoint.com/computer-network-tutorial", timeout=30)

href_values = []

# Check if request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <div id="menu"> element; soup.find returns None when the page
    # has no such element, so guard before searching inside it.
    div_tag = soup.find('div', id = 'menu')
    if div_tag is None:
        print("No <div id='menu'> element found on the page.")
    else:
        # Collect the non-empty href of every <a> inside each
        # <div class="leftmenu"> block of the menu.
        tags = div_tag.find_all('div', class_='leftmenu')
        href_values = [
            anchor_tag.get('href')
            for menu_block in tags
            for anchor_tag in menu_block.find_all('a')
            if anchor_tag.get('href')
        ]
else:
    print("Failed to fetch the webpage.")

In [14]:
# Display the collected href values for inspection
href_values

['computer-network-tutorial',
 'computer-network-introduction',
 'computer-network-features',
 'computer-network-architecture',
 'computer-network-components',
 'types-of-computer-network',
 'computer-network-topologies',
 'computer-network-transmission-modes',
 'computer-network-models',
 'osi-model',
 'computer-network-tcp-ip-model',
 'computer-network-digital-transmission',
 'transmission-media',
 'guided-transmission-media',
 'unguided-transmission-media',
 'multiplexing-in-computer-network',
 'computer-network-switching',
 'computer-network-switching-modes',
 'computer-network-switching-techniques',
 'data-link-layer',
 'computer-network-error-detection',
 'computer-network-error-correction',
 'data-link-controls',
 'network-layer',
 'network-addressing',
 'computer-network-routing',
 'network-layer-protocols',
 'computer-network-routing-algorithm',
 'distance-vector-routing-algorithm',
 'link-state-routing-algorithm',
 'computer-network-transport-layer',
 'computer-network-transp

In [19]:
from urllib.parse import urljoin

response =

BeautifulSoup(response.text, 'html.parser')

 <!DOCTYPE html>
<html lang="en"><head>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-BMVLE5WY82"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-BMVLE5WY82');
</script>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>Computer Network Topology: What It is and Types - javatpoint</title><link href="https://static.javatpoint.com/images/favicon2.png" rel="SHORTCUT ICON"/>
<link async="" href="https://static.javatpoint.com/link.css?v=6.0" rel="stylesheet" type="text/css"/><link href="https://clients1.google.com" rel="dns-prefetch"/><link href="https://static.javatpoint.com" rel="dns-prefetch"/><link href="https://googleads.g.doubleclick.net" rel="dns-prefetch"/><link href="https://www.google.com" rel="dns-prefetch"/><link href="https://feedify.net" rel="dns-prefetch"/><meta content="#4CAF50" name="theme-color"><meta content="Computer N

In [36]:
def scrape_website(url, base_url="https://www.javatpoint.com/computer-network-tutorial",
                   csv_filename='dataset_javapoint.csv', timeout=30):
    """Append the paragraph and list-item text of one tutorial page to a CSV.

    ``url`` is resolved against ``base_url`` with ``urljoin``, so relative
    links can be passed directly. The text of every ``<p>`` and ``<li>``
    found anywhere inside the first ``<div id="city">`` element is written as
    one row of a single-column (``tag_content``) CSV, stripped and with
    non-breaking spaces replaced by regular spaces. Rows are appended; the
    header is written only when the file is missing or empty.

    Note: defining this function replaces any earlier ``scrape_website`` in
    the same kernel session.

    Args:
        url: Absolute or relative address of the page to scrape.
        base_url: Address that relative ``url`` values are resolved against.
        csv_filename: CSV file the rows are appended to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A message is printed when the page cannot be fetched or does
        not contain the expected container.
    """
    fieldnames = ['tag_content']
    page_url = urljoin(base_url, url)

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(page_url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable page should not abort a loop over many URLs.
        print(f"Failed to retrieve webpage: {error}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve webpage")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.find returns None when nothing matches; check before touching the
    # CSV so a page with a different layout neither crashes nor creates a
    # header-only file.
    div_tag = soup.find('div', id='city')
    if div_tag is None:
        print(f"No <div id='city'> container found at {page_url}")
        return None

    # A zero-byte file (e.g. left over from an interrupted run) still needs
    # its header row.
    needs_header = (
        not os.path.isfile(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        # NOTE(review): the search is recursive, so a <p> nested inside an
        # <li> is written twice (once as part of the <li> text, once on its
        # own). Kept as-is; de-duplicate downstream if that is unwanted.
        for tag in div_tag.find_all(['p', 'li']):
            tag_content = tag.text.strip().replace(u'\xa0', u' ')
            writer.writerow({'tag_content': tag_content})

    return None

In [37]:
# Scrape each collected page in turn
for url in href_values:
    scrape_website(url)

In [38]:
import csv

def clean_csv(input_file, output_file, min_words=10, keep_header=True):
    """Copy a CSV file, dropping rows whose text is too short.

    A data row is kept when its first non-blank cell contains at least
    ``min_words`` whitespace-separated words; rows made only of blank cells
    are dropped. Kept rows are written unchanged.

    The header row is copied through without being filtered. It is a single
    word, so applying the word-count filter to it would remove it and leave
    the output without column names.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the CSV file to write; overwritten if present.
        min_words: Minimum number of words a row needs to be kept.
        keep_header: Treat the first row as a header and always copy it.
            Pass False for files that have no header row.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)

        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)

            if keep_header:
                # next() with a default copes with a completely empty input.
                header = next(reader, None)
                if header:
                    writer.writerow(header)

            for row in reader:
                non_blank_cells = [cell.strip() for cell in row if cell.strip()]
                if non_blank_cells and len(non_blank_cells[0].split()) >= min_words:
                    writer.writerow(row)

# Filter the scraped Javatpoint rows into a separate cleaned file
input_file, output_file = 'dataset_javapoint.csv', 'dataset_javapoint_cleaned.csv'
clean_csv(input_file, output_file)

## Merging two files

In [45]:
import pandas as pd

def _read_paragraphs(path):
    """Load a single-column paragraph CSV as a DataFrame with a 'tag_content' column.

    The file is read without assuming a header row, because the word-count
    cleaning step can strip the one-word header; letting pandas infer the
    header would then turn the first paragraph into a column name. A literal
    header row, if present, is removed afterwards so both cases load alike.
    """
    frame = pd.read_csv(path, header=None, names=["tag_content"])
    return frame[frame["tag_content"] != "tag_content"]


# Read the CSV files
df1 = _read_paragraphs("dataset_cleaned.csv")
df2 = _read_paragraphs("dataset_javapoint_cleaned.csv")

# Merge the two dataframes; both share the same single column, so the rows
# are stacked rather than spread over mismatched columns.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Write the merged dataframe to a new CSV file
merged_df.to_csv("merged_dataset.csv", index=False)

print("Merged dataset saved successfully!")

Merged dataset saved successfully!
