In [58]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup


In [60]:
linksdataset=pd.read_csv(r"C:\Users\Otman-INFO\Downloads\linksdataset.xls")
linksdataset.head()

Unnamed: 0,Title,Link
0,Cardboard and Waste Recycling: 3 Genius Recycl...,https://www.instructables.com/Cardboard-and-Wa...
1,Recycling PET Plastic to Filament,https://www.instructables.com/Recycling-PET-Pl...
2,Recycling Bin for Kids Made From Recycled Card...,https://www.instructables.com/Recycling-Bin-fo...
3,Cardboard to Plant Pots by Recycling Waste Pap...,https://www.instructables.com/Cardboard-to-Pla...
4,System of Recycling Trash (SORT),https://www.instructables.com/System-of-Recycl...


## Extract content of each Article

#### Option 1 : you can use this code in jupyterNotbook

In [10]:
def extract_content(url):
    
    try:
        # Step 2: Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)

        # Step 3: Parse the content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Step 4: Extract all text from the page
        text = soup.get_text(separator='\n')  # Use a separator to keep readability
        
        return text

    except requests.exceptions.RequestException as e:
        return print(f"An error occurred: {e}")

#### Option 2 : you can use this code in Google Colab

In [None]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup

# Assuming your full DataFrame looks like this (with more rows in the 'Link' column)

data = pd.read_csv('/content/drive/MyDrive/linksdata/linksdataset.xls')

# Get all the links from the 'Link' column
urls_to_scrape = data['Link'].tolist()

async def fetch_url(session, url):
    try:
        async with session.get(url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            text = soup.get_text(separator='\n')
            return text
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url(session, url) for url in urls]
        return await asyncio.gather(*tasks)


text_results = await fetch_all(urls_to_scrape)

# Add the scraped content back to the DataFrame
data['Scraped_Text'] = text_results

In [63]:
data=pd.read_csv(r"C:\Users\Otman-INFO\Downloads\content.csv")
data.head()

Unnamed: 0,Title,Link,Scraped_Text
0,Cardboard and Waste Recycling: 3 Genius Recycl...,https://www.instructables.com/Cardboard-and-Wa...,Cardboard and Waste Recycling: 3 Genius Recycl...
1,Recycling PET Plastic to Filament,https://www.instructables.com/Recycling-PET-Pl...,Recycling PET Plastic to Filament : 7 Steps (w...
2,Recycling Bin for Kids Made From Recycled Card...,https://www.instructables.com/Recycling-Bin-fo...,Recycling Bin for Kids Made From Recycled Card...
3,Cardboard to Plant Pots by Recycling Waste Pap...,https://www.instructables.com/Cardboard-to-Pla...,Cardboard to Plant Pots by Recycling Waste Pap...
4,System of Recycling Trash (SORT),https://www.instructables.com/System-of-Recycl...,System of Recycling Trash (SORT) : 5 Steps - I...


## Data Preprocessing

In [64]:
def process_text(text):
    # Check if the input is a string, then process it
    if isinstance(text, str):
        # Step 1: Remove everything after the specified string ("\nCategories\nCircuits\n")
        text = text.split("\nCategories\nCircuits\n")[0]
        
        # Step 2: Replace newlines with spaces
        text = text.replace("\n", " ")
        
        # Step 3: Remove specific unwanted string
        text = text.replace("- Instructables                  Projects                               Contests                               Teachers", "")
        
        # Step 4: Replace multiple spaces with a single space
        text = re.sub(r'\s{2,}', ' ', text)
        
        # Step 5: Remove emojis
        emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
        
    return text

In [65]:
data["Scraped_Text"]=data["Scraped_Text"].apply(process_text)

In [66]:
data.head()

Unnamed: 0,Title,Link,Scraped_Text
0,Cardboard and Waste Recycling: 3 Genius Recycl...,https://www.instructables.com/Cardboard-and-Wa...,Cardboard and Waste Recycling: 3 Genius Recycl...
1,Recycling PET Plastic to Filament,https://www.instructables.com/Recycling-PET-Pl...,Recycling PET Plastic to Filament : 7 Steps (w...
2,Recycling Bin for Kids Made From Recycled Card...,https://www.instructables.com/Recycling-Bin-fo...,Recycling Bin for Kids Made From Recycled Card...
3,Cardboard to Plant Pots by Recycling Waste Pap...,https://www.instructables.com/Cardboard-to-Pla...,Cardboard to Plant Pots by Recycling Waste Pap...
4,System of Recycling Trash (SORT),https://www.instructables.com/System-of-Recycl...,System of Recycling Trash (SORT) : 5 Steps Sys...


In [67]:
data.isnull().sum()

Title           0
Link            0
Scraped_Text    0
dtype: int64

In [68]:
data.shape

(19868, 3)

In [69]:
data['Scraped_Text'].duplicated().sum()

4

In [70]:
data[data['Scraped_Text'].duplicated()]

Unnamed: 0,Title,Link,Scraped_Text
9483,Leather-Bound Floating Book Shelf,https://www.instructables.com/Leather-Bound-Fl...,"Page Not Found 404: We're sorry, things break ..."
9568,Make a Four-Tone Junk Trumpet,https://www.instructables.com/Make-a-Four-Tone...,"Page Not Found 404: We're sorry, things break ..."
9694,Keep Talking and Nobody Explodes Bomb,https://www.instructables.com/Keep-Talking-and...,"Page Not Found 404: We're sorry, things break ..."
13377,How To: Make Curved-Fold Origami Sculptures,https://www.instructables.com/How-To-Curved-Fo...,"Page Not Found 404: We're sorry, things break ..."


In [71]:
data = data[data['Scraped_Text'] != data['Scraped_Text'][9483]]

In [72]:
data.shape

(19863, 3)

In [73]:
data['Scraped_Text'].duplicated().sum()

0

### Extract Introduction , Supplies and Steps from Articles

In [74]:
import requests
import pandas as pd
import re
import json
import time


def custom_text_splitter(text):
    # Define regex to match section headers including "Supplies" and steps
    section_pattern = re.compile(r"(Supplies|Step \d+:|Step \d+|Introduction):", re.IGNORECASE)

    # Split the text at each match and keep the delimiter as part of the split
    splits = section_pattern.split(text)

    # Combine splits into header-content pairs
    result = []
    for i in range(1, len(splits), 2):  # Start at 1 to skip any pre-header content
        header = splits[i].strip()
        content = splits[i + 1].strip() if i + 1 < len(splits) else ""

        if header == 'Introduction':
            title = content.split('\n')[0]
            result.append({"header": 'Title', "content": title})
            supplies = content.split('Supplies')
            if len(supplies) > 1:
                supplies = supplies[1]
                result.append({"header": 'Supplies', "content": supplies})

        elif i == len(splits)-1 or i == len(splits)-2:
            split_words = ['Categories', 'Recommendations', 'Be the First to Share',
            'Did you make this project?', 'I Made It', 'Comment',
            'Comments', ]
            for word in split_words:
                if word in content:
                    # Split text on the first occurrence of the word
                    content = re.split(f"({re.escape(word)})", content, maxsplit=1)
                    content = content[0]

            result.append({"header": header, "content": content})

        else:
            result.append({"header": header, "content": content})

    return result

In [76]:
data['extract_steps...']=data['Scraped_Text'].apply(custom_text_splitter)

In [77]:
data['extract_steps...'][0]

[{'header': 'Title',
  'content': "Cardboard and Waste Recycling: 3 Genius Recycling Ideas That Will Surprise You! By MORENA DIY Follow More by the author: I excited to show you three genius recycling ideas that will inspire you to think differently about cardboard and other waste materials! ♻️✨ You won't believe how simple and enjoyable it is to turn everyday items, like used cardboard and other materials, into useful and creative projects. These eco-friendly DIY hacks are not only practical but also a great way to reduce waste and give new life to materials that would otherwise end up in the trash."},
 {'header': 'Step 1',
  'content': "DIY Flower Pot Stand or Storage Organizer In this project, we take simple cardboard and bamboo skewers and turn them into a stylish stand that’s perfect for flower pots or as a small storage organizer! - Measuring and Marking the Cardboard You are measuring the cardboard using a ruler and a pencil. To begin the project, place your cardboard on a flat 

### Save Finale Dataset

In [None]:
data.to_csv('finale_data_extracted.csv', index=False, encoding='utf-8')