In [1]:
import requests
import os
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd

Getting Relevant Information from a single Wikihow page

In [2]:
# Function to obtain relevant data from wikihow pages...
def get_info(url, main_cat):

    print(url)
    response = requests.get(url)
    # Code to Parse the HTML code with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

  
    main_heading = soup.find('h1').text

    # Obtaining all the subheadings from the page
    sub_headings = []
    for h3 in soup.find_all('h3'):
        sub_heading = h3.find('span', class_='mw-headline')
        if sub_heading:
            sub_headings.append(sub_heading.text.split('.')[0].strip())
    sub_headings = ', '.join(sub_headings)

    # Obtaining sub-categories
    category = []
    for ul in soup.find_all('ul', class_='breadcrumbs'):
        li_elements = ul.find_all('li')
        for li in li_elements:
            a_tag = li.find('a')
            if a_tag:
                title_attribute = a_tag.text
                if title_attribute != "Categories" and title_attribute != main_cat:
                    category.append(title_attribute)
    category = ', '.join(np.unique(category))

    # Obtaining other useful info such as ratings, Number of authors, etc.
    other_info = []
    ratings = ''
    for div in soup.find_all('div'):
        info = div.find('span', class_="sp_text_data")
        helpful_ratings = div.find('div', class_='sp_helpful_rating_count')
        if info:
            other_info.append(div.find('span', class_="sp_text_data").text)
        if helpful_ratings:
            ratings = helpful_ratings.text

    other_info = np.unique(np.array(other_info))
    no_of_coauthors = other_info[0]
    no_of_views = other_info[1]
    last_updated = other_info[2]

    if len(ratings) != 0:
        no_of_votes = int(ratings.split('-')[0].strip().split(' ')[0])
        rating = ratings.split('-')[1].strip() 
    else:
        no_of_votes = 0
        rating = ''

    return [main_heading, sub_headings, main_cat, category, no_of_coauthors, no_of_views, last_updated, no_of_votes, rating] 

Applying this function to a main page in wikihow where manyother similar urls are found

In [4]:
with open('unwanted_urls.txt', 'r') as f:
    unwanted_urls = [line.rstrip() for line in f]

unwanted_urls

['https://www.wikihow.com/wikiHow:Jobs',
 'https://www.wikihow.com/wikiHow:Contact-Us',
 'https://www.wikihow.com/wikiHow:Terms-of-Use',
 'https://www.wikihow.com/wikiHow:Privacy-Policy',
 'https://www.wikihow.com/wikiHow:Contribute',
 'https://www.wikihow.com/Newsletters',
 'https://www.wikihow.com/Course/Explore?utm_source=wikihow&utm_medium=banner&utm_campaign=coursebanner',
 'https://www.wikihow.com/Course/Explore',
 'https://www.wikihow.com/Guides',
 'https://www.wikihow.com/Tech-Help-Pro',
 'https://www.wikihow.com/wikiHow:Videos-wikiHowPro',
 'https://www.wikihow.com/Pro',
 'https://www.wikihow.com/#wh-dialog-pro',
 'https://www.wikihow.com/#wh-dialog-login',
 'https://www.wikihow.com/Tech-Help-Pro',
 'https://www.wikihow.com/wikiHow:About-wikiHow',
 'https://www.wikihow.com/Category',
 'https://www.wikihow.com/Quizzes',
 'https://www.wikihow.com/Main-Page',
 'https://www.wikihow.com/Log-In',
 'https://www.wikihow.com/About-wikiHow',
 'https://www.wikihow.com/Experts',
 'https:/

In [5]:
from urllib.parse import urljoin

sub_cats = ['Computers and Electronics', 'Audio', 'Basic Computer Skills', 'Blogs', 'Buying Consumer Electronics', 'Calculators', 'Computer Networking', 'Computer Peripherals',
            'Computers', 'Consumer Electronics', 'Data Recovery', 'Global Positioning System (GPS)', 'Hacks', 'Hardware', 'Information Technology Careers', 'Internet', 'Operating Systems',
            'Maintenance and Repair', 'Portable Media Players', 'Robots', 'Software', 'Technology and Privacy', 'Technology Hacks', 'Telephones', 'Television', 'Video', 'Wireless Technology']

wikihow_data = []
for sub_cat in sub_cats:
    cat = "-".join(sub_cat.split(' '))
    url = f"https://www.wikihow.com/Category:{cat}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        full_url = urljoin('https://www.wikihow.com/', href)

        if full_url == "https://www.wikihow.com/":
            continue
        elif any(full_url.startswith(pattern) for pattern in unwanted_urls):
            continue
        elif full_url.startswith('https://www.wikihow.com/'):
            urls.append(full_url)

    urls = np.unique(urls)

    for url in urls:
        wikihow_data.append(get_info(url, sub_cat))

https://www.wikihow.com/Add-Rows-in-Excel-with-a-Formula
https://www.wikihow.com/Att-Account-Number
https://www.wikihow.com/Blur-Zoom-Background
https://www.wikihow.com/Buy-Bitcoins
https://www.wikihow.com/Bypass-Screen-Time-Passcode
https://www.wikihow.com/Change-Caller-ID-on-iPhone
https://www.wikihow.com/Change-Email-Password-on-iPhone
https://www.wikihow.com/Change-Hotspot-Name
https://www.wikihow.com/Change-Mouse-Polling-Rate
https://www.wikihow.com/Change-the-Language-on-Your-Computer
https://www.wikihow.com/Charge-Your-Device-Faster
https://www.wikihow.com/Charge-a-Battery-Without-a-Charger
https://www.wikihow.com/Charge-a-Fitpro-Watch
https://www.wikihow.com/Charge-a-Kindle-Paperwhite
https://www.wikihow.com/Chatgpt-Without-Phone-Number
https://www.wikihow.com/Cheat-a-Step-Counter-on-a-Phone
https://www.wikihow.com/Check-Browsing-History-on-a-WiFi-Router
https://www.wikihow.com/Check-Your-Computer%27s-System-Information
https://www.wikihow.com/Check-Your-Hard-Disk-Space
https:/

In [6]:
wikihowdf = pd.DataFrame(wikihow_data, columns=['Main Heading', 'Sub Headings', 'Main category',
                                                 'Sub-category','Number of Co-Authors', 'Number of Views', 
                                                 "Last Updated date", "Number of Votes", "Rating"])

In [7]:
wikihowdf.head()

Unnamed: 0,Main Heading,Sub Headings,Main category,Sub-category,Number of Co-Authors,Number of Views,Last Updated date,Number of Votes,Rating
0,How to Add Rows in Excel with a Formula,"Adding Rows to a Sheet with a Macro, Adding Fo...",Computers and Electronics,,4,57896,"May 13, 2022",0,
1,Locate Your AT&T Account Number for Prepaid & ...,"Finding a Postpaid AT&T Account Number, Findin...",Computers and Electronics,"Phone Plans, Telephones",181,3,"February 16, 2024",0,
2,Blurring Your Background in Zoom: Easy Steps f...,"Using a PC or Mac, Using the Zoom Mobile App, ...",Computers and Electronics,,21373,4,"September 28, 2023",0,
3,How to Buy Bitcoins: Easy & Trustworthy Option...,"Using Payment Apps, Using Stock Trading Apps, ...",Computers and Electronics,,594097,87,"February 3, 2024",0,
4,How to Bypass Your Screen Time Passcode: Reset...,"Reset Your Own Passcode, Reset Your Child’s Pa...",Computers and Electronics,"IPhone, Smartphones, Telephones",6,80210,"February 7, 2024",0,


In [8]:
#code Save the DataFrame to a CSV file
csv_filename = 'wikihow_data.csv'
wikihowdf.to_csv(csv_filename, index=False)  # Set index=False to exclude the index column in the CSV file
print(f'DataFrame has been saved to {csv_filename}')

DataFrame has been saved to wikihow_data.csv


In [9]:
import networkx as nx

In [10]:

csv_filename = 'wikihow_data.csv'
wikihowdf = pd.read_csv(csv_filename)

print(wikihowdf.head())

G_wikihow = nx.Graph()

nodes_column = 'Main Heading' 
G_wikihow.add_nodes_from(wikihowdf[nodes_column])

#Code to Calculate the total number of nodes in the graph
total_nodes = nx.number_of_nodes(G_wikihow)

print(f'Total number of nodes in the graph: {total_nodes}')

                                        Main Heading  \
0            How to Add Rows in Excel with a Formula   
1  Locate Your AT&T Account Number for Prepaid & ...   
2  Blurring Your Background in Zoom: Easy Steps f...   
3  How to Buy Bitcoins: Easy & Trustworthy Option...   
4  How to Bypass Your Screen Time Passcode: Reset...   

                                        Sub Headings  \
0  Adding Rows to a Sheet with a Macro, Adding Fo...   
1  Finding a Postpaid AT&T Account Number, Findin...   
2  Using a PC or Mac, Using the Zoom Mobile App, ...   
3  Using Payment Apps, Using Stock Trading Apps, ...   
4  Reset Your Own Passcode, Reset Your Child’s Pa...   

               Main category                     Sub-category  \
0  Computers and Electronics                              NaN   
1  Computers and Electronics          Phone Plans, Telephones   
2  Computers and Electronics                              NaN   
3  Computers and Electronics                              NaN   
4