In [None]:
#import necessary libraries

import re
import csv
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup


import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to the specific warning that is noisy.
warnings.filterwarnings('ignore')

In [None]:

# Map each article category to its paginated listing URL. Every URL ends in
# "?page=" so that a page number can be appended when the listings are scraped.
# NOTE(review): the first URL uses the host "articlebiz.com" while all the others
# use "articlebiz.com." (trailing dot) — confirm whether that difference is intended.
NLP_PROJECT = {
    'arts_and_entertainment': 'https://articlebiz.com/category/arts-entertainment?page=',
    'foods_and_drinks': 'https://articlebiz.com./category/foods-drinks?page=',
    'family': 'https://articlebiz.com./category/family?page=',
    'business': 'https://articlebiz.com./category/business?page=',
    'home': 'https://articlebiz.com./category/home?page=',
    'pets': 'https://articlebiz.com./category/pets?page=',
    'shopping': 'https://articlebiz.com./category/shopping?page=',
    'news_and_society': 'https://articlebiz.com./category/news-society?page=',
    'self_improvement': 'https://articlebiz.com./category/self-improvement?page=',
    'computers_and_technology': 'https://articlebiz.com./category/computers-technology?page=',
    'autos_and_trucks': 'https://articlebiz.com./category/autos-trucks?page=',
    'sports_and_recreations': 'https://articlebiz.com./category/sports-recreation?page=',
    'finance': 'https://articlebiz.com./category/finance?page=',
    'social_issue': 'https://articlebiz.com./category/social-issue?page=',
    'reference_and_education': 'https://articlebiz.com./category/reference-education?page=',
    'health_and_fitness': 'https://articlebiz.com./category/health-fitness?page=',
    'travel_and_leisure': 'https://articlebiz.com./category/travel-leisure?page='
}

# Build the category/url table in one step. Row-by-row DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
df = pd.DataFrame(list(NLP_PROJECT.items()), columns=['category', 'url'])

# Save the table to a CSV file without the index column
df.to_csv('NLP_PROJECT.csv', index=False)



In [None]:
# Read the table of source URLs back with pandas. The file is written to the
# working directory as 'NLP_PROJECT.csv', so it is read from that same location
# to keep the notebook runnable from top to bottom on a fresh kernel.
df = pd.read_csv('NLP_PROJECT.csv')
df

Unnamed: 0,category,url
0,arts_and_entertainment,https://articlebiz.com/category/arts-entertain...
1,foods_and_drinks,https://articlebiz.com./category/foods-drinks?...
2,family,https://articlebiz.com./category/family?page=
3,business,https://articlebiz.com./category/business?page=
4,home,https://articlebiz.com./category/home?page=
5,pets,https://articlebiz.com./category/pets?page=
6,shopping,https://articlebiz.com./category/shopping?page=
7,news_and_society,https://articlebiz.com./category/news-society?...
8,self_improvement,https://articlebiz.com./category/self-improvem...
9,computers_and_technology,https://articlebiz.com./category/computers-tec...


In [None]:
# Collect one {'category', 'url'} record per article link found on the listing pages
data = []

# Listing pages FIRST_PAGE..LAST_PAGE (inclusive) are scraped for every category
FIRST_PAGE, LAST_PAGE = 1, 39
# Seconds to wait for the server before a request is treated as failed
REQUEST_TIMEOUT = 30
# Only links whose href contains this substring are kept.
# NOTE(review): presumably this targets the numeric id at the start of article
# slugs — confirm; matching on '/article/' would be less brittle.
ARTICLE_HREF_MARKER = '105'

# Loop through each category and its listing URL
for index, row in df.iterrows():
    category = row['category']
    url = row['url']
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        try:
            # The stored URL already ends in "?page=", so only the page number is
            # appended (the previous trailing '=' produced "?page=1=").
            response = requests.get(url + str(page), timeout=REQUEST_TIMEOUT)
            # Treat HTTP error statuses as failures instead of parsing an error page
            response.raise_for_status()

            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Keep every anchor whose href looks like an article URL
            for link in soup.find_all('a'):
                href = link.get('href')
                if href and ARTICLE_HREF_MARKER in href:
                    data.append({'category': category, 'url': href})

        except requests.exceptions.RequestException as e:
            # Report the failed page and carry on with the next one
            print(f"Error scraping URL {url} on page {page}: {e}")

        finally:
            # Wait 1 second before the next request, also after a failed one
            time.sleep(1)

# Create a pandas DataFrame to store the scraped data
new_df = pd.DataFrame(data, columns=["category", "url"])


In [None]:
# Display the scraped category / article-URL pairs
new_df

Unnamed: 0,category,url
0,arts_and_entertainment,https://articlebiz.com/article/1052085015-the-...
1,arts_and_entertainment,https://articlebiz.com/article/1052084278-how-...
2,arts_and_entertainment,https://articlebiz.com/article/1052083736-a-be...
3,arts_and_entertainment,https://articlebiz.com/article/1052076339-the-...
4,arts_and_entertainment,https://articlebiz.com/article/1052071386-acou...
...,...,...
11695,travel_and_leisure,https://articlebiz.com/article/1052050315-10-o...
11696,travel_and_leisure,https://articlebiz.com/article/1052048720-clea...
11697,travel_and_leisure,https://articlebiz.com/article/1052048506-hot-...
11698,travel_and_leisure,https://articlebiz.com/article/1052048496-what...


In [None]:
# Save the scraped article links as a CSV file, without the index column
new_df.to_csv('ARTICLEBIZ.csv', index = False)

In [None]:
# Google Colab helper module for transferring files to and from the runtime
from google.colab import files

# Upload file(s) into the session; the result is indexed by file name in a
# later cell to obtain the uploaded file's contents
uploaded = files.upload()


Saving ARTICLEBIZ.csv to ARTICLEBIZ.csv


In [None]:
import io
# Wrap the uploaded ARTICLEBIZ.csv contents in an in-memory binary stream so
# that it can be handed to pandas like a file
data = io.BytesIO(uploaded['ARTICLEBIZ.csv'])

In [None]:
# Load the uploaded article links into a DataFrame and preview the first rows
views_df = pd.read_csv(data)
views_df.head()

Unnamed: 0,category,url
0,arts_and_entertainment,https://articlebiz.com/article/1052085015-the-...
1,arts_and_entertainment,https://articlebiz.com/article/1052084278-how-...
2,arts_and_entertainment,https://articlebiz.com/article/1052083736-a-be...
3,arts_and_entertainment,https://articlebiz.com/article/1052076339-the-...
4,arts_and_entertainment,https://articlebiz.com/article/1052071386-acou...


In [None]:

# Function that scrapes the author, title, content and view count of each article URL

def scrape_articles(column):
    """Scrape article details for every URL in ``column``.

    Each URL is fetched and parsed for the author, the title, the paragraph
    tags and the view-count sentence. Every input URL yields exactly one output
    row; when a page cannot be fetched or parsed the row holds missing values,
    so the scraped fields always stay aligned with the URL they belong to.

    Parameters
    ----------
    column : iterable of str
        Article URLs, e.g. ``views_df['url']``.

    Returns
    -------
    pandas.DataFrame
        The module-level ``views_df`` with the columns ``author``, ``title``,
        ``content`` and ``views`` concatenated alongside it (``axis=1``).
    """
    fields = ['author', 'title', 'content', 'views']
    article_data = []
    for url in column:
        # Start from an all-missing row; it is only replaced when the whole
        # page was scraped successfully.
        record = dict.fromkeys(fields)
        try:
            # Send a request to the article URL and get its HTML content
            response = requests.get(url, timeout=30)
            # Treat HTTP error statuses as failures instead of parsing an error page
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            record = {
                # Author's name
                'author': soup.find('span', class_='text-muted').text,
                # Article title
                'title': soup.find('h3', class_='mb-5').text.strip(),
                # Article content: the <p> tag objects themselves, not plain text
                'content': soup.find_all('p'),
                # Sentence that contains the number of views
                'views': soup.find('div', class_='text-muted').text.strip(),
            }
        except Exception as e:
            # Best effort: report the failure and keep the placeholder row.
            # Dropping the row instead would shift every later article onto
            # the wrong URL when the frames are concatenated below.
            print(f"Error scraping article {url}: {e}")
        article_data.append(record)

    # Reuse the input's index when it is a Series so rows line up by label
    row_index = column.index if isinstance(column, pd.Series) else None
    article_df = pd.DataFrame(article_data, columns=fields, index=row_index)

    # Concatenate the scraped data with the 'views_df' dataframe
    scrapped_df = pd.concat([views_df, article_df], axis=1)
    return scrapped_df



In [None]:
# Scrape every article URL; the result replaces the earlier `df` that held the
# category listing URLs
df = scrape_articles(views_df['url'])

In [None]:
# Display the scraped articles table
df

Unnamed: 0,category,url,author,title,content,views
0,arts_and_entertainment,https://articlebiz.com/article/1052085015-the-...,Leon Colbert,The Marvelous World of Cigar Box Guitars: A Te...,"[[\n, [Arts & Entertainment], \n → ...",This article has been viewed 405 times.
1,arts_and_entertainment,https://articlebiz.com/article/1052084278-how-...,Ann Liu,How to Shoot Better Product Photography?,"[[\n, [Arts & Entertainment], \n], [When it co...",This article has been viewed 381 times.
2,arts_and_entertainment,https://articlebiz.com/article/1052083736-a-be...,Ann Liu,A Beginner's Guide to Pottery Clay: Getting St...,"[[\n, [Arts & Entertainment], \n], [Welcome to...",This article has been viewed 319 times.
3,arts_and_entertainment,https://articlebiz.com/article/1052076339-the-...,Cornelius Hart,The Evolution of Taylor Swifts Fashion: 8 Icon...,"[[\n, [Arts & Entertainment], \n], [Taylor Swi...",This article has been viewed 266 times.
4,arts_and_entertainment,https://articlebiz.com/article/1052071386-acou...,Spencer Miles,Acoustics Doesn't Matter?,"[[\n, [Arts & Entertainment], \n], [Lancaster ...",This article has been viewed 320 times.
...,...,...,...,...,...,...
11695,travel_and_leisure,https://articlebiz.com/article/1052050315-10-o...,Alex Belsey,10 Of The Best Places To Visit On A European R...,"[[\n, [Travel & Leisure], \n → , [V...",This article has been viewed 271 times.
11696,travel_and_leisure,https://articlebiz.com/article/1052048720-clea...,Rick,"""Clear Vision in Liverpool-Knowsley-Merseyside...","[[\n, [Travel & Leisure], \n → , [T...",This article has been viewed 257 times.
11697,travel_and_leisure,https://articlebiz.com/article/1052048506-hot-...,Andrew Lang,"Hot Dog, Jumping Frog: Things to See and Do in...","[[\n, [Travel & Leisure], \n → , [T...",This article has been viewed 223 times.
11698,travel_and_leisure,https://articlebiz.com/article/1052048496-what...,Andrew Lang,What are the More Obscure Places to Visit in S...,"[[\n, [Travel & Leisure], \n → , [T...",This article has been viewed 270 times.


In [None]:


# Extract the view count from each "This article has been viewed N times." sentence.
# - `\d+` (instead of `\d{1,3}`) keeps counts of 1000 or more intact when they are
#   written without thousands separators; comma-grouped counts are still supported.
# - Values that contain no number (e.g. missing rows) become <NA> instead of raising.
# - Values go through str(), so re-running the cell on already-numeric data is safe.
views_pattern = re.compile(r'\d+(?:,\d{3})*')


def _parse_view_count(value):
    """Return the first number found in ``value`` as an int, or None if there is none."""
    match = views_pattern.search(str(value))
    return int(match.group().replace(',', '')) if match else None


numbers = [_parse_view_count(sentence) for sentence in df['views']]

# Nullable integer dtype keeps the counts as integers even when some are missing
df['views'] = pd.array(numbers, dtype='Int64')
df

Unnamed: 0,category,url,author,title,content,views
0,arts_and_entertainment,https://articlebiz.com/article/1052085015-the-...,Leon Colbert,The Marvelous World of Cigar Box Guitars: A Te...,"[[\n, [Arts & Entertainment], \n → ...",405
1,arts_and_entertainment,https://articlebiz.com/article/1052084278-how-...,Ann Liu,How to Shoot Better Product Photography?,"[[\n, [Arts & Entertainment], \n], [When it co...",381
2,arts_and_entertainment,https://articlebiz.com/article/1052083736-a-be...,Ann Liu,A Beginner's Guide to Pottery Clay: Getting St...,"[[\n, [Arts & Entertainment], \n], [Welcome to...",319
3,arts_and_entertainment,https://articlebiz.com/article/1052076339-the-...,Cornelius Hart,The Evolution of Taylor Swifts Fashion: 8 Icon...,"[[\n, [Arts & Entertainment], \n], [Taylor Swi...",266
4,arts_and_entertainment,https://articlebiz.com/article/1052071386-acou...,Spencer Miles,Acoustics Doesn't Matter?,"[[\n, [Arts & Entertainment], \n], [Lancaster ...",320
...,...,...,...,...,...,...
11695,travel_and_leisure,https://articlebiz.com/article/1052050315-10-o...,Alex Belsey,10 Of The Best Places To Visit On A European R...,"[[\n, [Travel & Leisure], \n → , [V...",271
11696,travel_and_leisure,https://articlebiz.com/article/1052048720-clea...,Rick,"""Clear Vision in Liverpool-Knowsley-Merseyside...","[[\n, [Travel & Leisure], \n → , [T...",257
11697,travel_and_leisure,https://articlebiz.com/article/1052048506-hot-...,Andrew Lang,"Hot Dog, Jumping Frog: Things to See and Do in...","[[\n, [Travel & Leisure], \n → , [T...",223
11698,travel_and_leisure,https://articlebiz.com/article/1052048496-what...,Andrew Lang,What are the More Obscure Places to Visit in S...,"[[\n, [Travel & Leisure], \n → , [T...",270


In [None]:
# Inspect the raw scraped content of the article in the row labelled 1
df.loc[1, 'content']

[<p>
 <a href="https://articlebiz.com/category/arts-entertainment">Arts &amp; Entertainment</a>
 </p>,
 <p>When it comes to shooting better product photography, attention to detail and a keen eye for composition are essential. Every element, from lighting to background, plays a crucial role in capturing the essence and allure of the product. By carefully selecting the right equipment, utilising proper lighting techniques, and composing the shots with precision, one can transform ordinary products into captivating visual stories. It's about highlighting the product's unique features, showcasing its quality and craftsmanship, and creating an emotional connection with the viewer. Whether it's through capturing intricate details or presenting products in a lifestyle context, mastering the art of product photography opens doors to unlocking the true potential of each item, leaving a lasting impression on customers and elevating the brand to new heights.</p>,
 <p>Background</p>,
 <p>Choosing

In [None]:
# Google Colab helper module for transferring files to and from the runtime
from google.colab import files

# Write the final table without the index column, matching the other CSV exports
# in this notebook, so that reloading the file does not create an extra unnamed
# column. The 'utf-8-sig' encoding writes a UTF-8 byte-order mark at the start.
df.to_csv('Article_views.csv', encoding = 'utf-8-sig', index = False)
files.download('Article_views.csv')