<a href="https://colab.research.google.com/github/Samar-mami/NewsWebSite/blob/main/NewsWebsiteCrawl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web Scraping with Python

In [None]:
!python -m pip install pymongo

In [6]:
import os
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [32]:
#Get the author of the article
def get_author_article(url, timeout=10):
  """Return the author name shown on an article page, or '' if none is found.

  Args:
    url: Address of the article page to download.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    The text of the first ``<a>`` element whose ``rel`` is ``author`` and
    whose ``data-link-name`` is ``auto tag link``; an empty string when the
    page contains no such element.

  Raises:
    requests.exceptions.RequestException: if the page cannot be downloaded,
      including when the timeout expires.
  """
  # Without a timeout a single stalled connection blocks the scrape forever.
  res = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(res.text, 'html.parser')
  author_tag = soup.find('a', {'data-link-name': 'auto tag link', 'rel': 'author'})
  return author_tag.get_text() if author_tag else ''

In [34]:
#Get the text of the article
def get_article_text(url, timeout=10):
  """Return the text of every ``<p>`` element on a page as one string.

  Args:
    url: Address of the article page to download.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    The whitespace-stripped text of each ``<p>`` element, joined with single
    spaces; an empty string when the page has no ``<p>`` elements.

  Raises:
    requests.exceptions.RequestException: if the page cannot be downloaded,
      including when the timeout expires.
  """
  # Without a timeout a single stalled connection blocks the scrape forever.
  res = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(res.text, 'html.parser')
  return ' '.join(p.get_text(strip=True) for p in soup.find_all('p'))

In [33]:
#Create a function to get the publication time of the article
def get_date_article(url, timeout=10):
  """Return the publication timestamp shown on an article page.

  The timestamp is read from the first ``<span class="dcr-u0h1qy">`` element
  and parsed with the format ``%a %d %b %Y %H.%M``.

  Args:
    url: Address of the article page to download.
    timeout: Seconds to wait for the server before ``requests`` gives up.

  Returns:
    A naive ``datetime``. When the span is missing, or its text does not match
    the expected format, the placeholder ``datetime(9999, 9, 9, 9, 9)`` is
    returned instead.

  Raises:
    requests.exceptions.RequestException: if the page cannot be downloaded,
      including when the timeout expires.
  """
  # Placeholder for "no usable date". NOTE(review): year 9999 is outside the
  # range pandas can store as datetime64[ns], so a column containing it stays
  # `object` dtype -- confirm this is acceptable downstream.
  missing_date = datetime(9999, 9, 9, 9, 9)
  date_format = "%a %d %b %Y %H.%M"

  res = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(res.text, "html.parser")
  date_tag = soup.find('span', class_="dcr-u0h1qy")
  if not date_tag:
    return missing_date

  # Keep only the five fields the format describes (weekday, day, month, year,
  # time) and ignore whatever follows -- presumably a timezone abbreviation
  # such as " BST". Splitting on whitespace also copes with non-breaking
  # spaces (\xa0) and with suffixes that are not exactly four characters long.
  fields = date_tag.get_text(strip=True).split()
  date_string = ' '.join(fields[:5])
  try:
    return datetime.strptime(date_string, date_format)
  except ValueError:
    # One unparseable article must not abort the whole scrape.
    return missing_date


In [40]:
#Create a dataset from scraping a news website
def scrap_website(url, timeout=10):
  """Scrape a front page and build one DataFrame row per linked article.

  Every ``<div class="fc-item__container">`` on the page is treated as one
  article teaser. The headline, link, standfirst and kicker are read from the
  teaser itself; author, publication date and body text are obtained by
  calling ``get_author_article``, ``get_date_article`` and
  ``get_article_text`` on the article link (each of those downloads the
  article page on its own).

  Args:
    url: Address of the front page to scrape.
    timeout: Seconds to wait for the front page before ``requests`` gives up.

  Returns:
    A DataFrame with the columns ``Title``, ``Link``, ``Description``,
    ``Kicker``, ``author``, ``publication_date`` and ``article_text``. Teasers
    without a usable link are skipped. The frame is empty (but still has these
    columns) when no teaser is found.

  Raises:
    requests.exceptions.RequestException: if the front page cannot be
      downloaded, including when the timeout expires.
  """
  columns = ['Title', 'Link', 'Description', 'Kicker',
             'author', 'publication_date', 'article_text']

  response = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(response.text, 'html.parser')

  records = []
  for article in soup.find_all('div', class_='fc-item__container'):
    # The overlay anchor carries both the headline text and the article link.
    link_elem = article.find('a', class_='u-faux-block-link__overlay')
    article_url = link_elem.get('href') if link_elem else None
    if not article_url:
      # Nothing to follow: the per-article helpers need a URL to download.
      continue

    # Standfirst (short description) and kicker are optional on a teaser.
    description_elem = article.find('div', class_='fc-item__standfirst')
    kicker_elem = article.find('div', class_='fc-item__kicker')

    records.append({
        'Title': link_elem.get_text(strip=True),
        'Link': article_url,
        'Description': description_elem.get_text(strip=True) if description_elem else "",
        'Kicker': kicker_elem.get_text(strip=True) if kicker_elem else "",
        'author': get_author_article(article_url),
        'publication_date': get_date_article(article_url),
        'article_text': get_article_text(article_url),
    })

  # Passing `columns` fixes the column order and keeps it for an empty result.
  return pd.DataFrame(records, columns=columns)

In [41]:
# Scrape the Guardian front page; `df` is consumed by the cells that follow.
df = scrap_website("https://www.theguardian.com")

In [46]:
# Print the column names, non-null counts and dtypes of the scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             92 non-null     object
 1   Link              92 non-null     object
 2   Description       92 non-null     object
 3   Kicker            92 non-null     object
 4   author            92 non-null     object
 5   publication_date  92 non-null     object
 6   article_text      92 non-null     object
dtypes: object(7)
memory usage: 5.2+ KB


# Connect to MongoDB

In [None]:
# from pymongo.mongo_client import MongoClient
# uri = "mongodb+srv://<username>:<password>@cluster0.38i5zz9.mongodb.net/?retryWrites=true&w=majority"
# # Create a new client and connect to the server
# client = MongoClient(uri)
# # Send a ping to confirm a successful connection
# try:
#     client.admin.command('ping')
#     print("Pinged your deployment. You successfully connected to MongoDB!")
# except Exception as e:
#     print(e)

In [47]:
# Connect to MongoDB client and database/collection
def connect_mongo_db_collection(username,password,db,collection):
  """Connect to the MongoDB cluster and return a handle to one collection.

  Args:
    username: MongoDB user name.
    password: Password for that user.
    db: Name of the database to open.
    collection: Name of the collection inside ``db``.

  Returns:
    The collection object ``client[db][collection]``.

  Raises:
    Exception: whatever ``MongoClient`` or the ``ping`` command raises when
      the cluster cannot be reached or the credentials are rejected. The
      failure is printed first and then re-raised, so callers never receive
      something that is not a collection.
  """
  # Credentials are percent-escaped so that characters such as '@', ':' or '/'
  # cannot break the URI.
  uri = ("mongodb+srv://" + quote_plus(username) + ":" + quote_plus(password)
         + "@cluster0.38i5zz9.mongodb.net/?retryWrites=true&w=majority")
  try:
    client = MongoClient(uri)
    # MongoClient connects lazily; an explicit ping is what actually verifies
    # that the server is reachable and the credentials are accepted.
    client.admin.command('ping')
    mongo_collection = client[db][collection]
  except Exception as e:
    print("Connection to MongoDB collection failed:", str(e))
    raise
  print("Connection to MongoDB collection successful :)\n")
  return mongo_collection

In [48]:
#Insert the dataframe into MongoDB
def store_data(df):
  """Replace the contents of the ``WebSiteNews`` collection with ``df``.

  Each row becomes one document. If ``df`` has no ``index`` column, the row
  index is added to the documents as an ``index`` field; ``df`` itself is
  left untouched.

  The MongoDB credentials are read from the environment variables
  ``MONGO_USERNAME`` and ``MONGO_PASSWORD`` so that they never have to be
  written into the notebook.

  Args:
    df: DataFrame whose rows are stored.

  Raises:
    KeyError: if ``MONGO_USERNAME`` or ``MONGO_PASSWORD`` is not set.
  """
  # reset_index() returns a new frame, so the caller's frame is not modified
  # and re-running the cell gives the same result.
  frame = df if 'index' in df.columns else df.reset_index()
  data_dict = frame.to_dict("records")
  if not data_dict:
    # An empty scrape should not wipe the documents that are already stored.
    print("No rows to store; MongoDB collection left unchanged")
    return
  #connect to database
  collection = connect_mongo_db_collection(
      os.environ['MONGO_USERNAME'],
      os.environ['MONGO_PASSWORD'],
      'Data_engineering_test',
      'WebSiteNews')
  # Delete existing documents from the collection
  collection.delete_many({})
  print("files deleted")
  try:
    # Insert the new documents
    collection.insert_many(data_dict)
    print("Data inserted/replaced successfully in MongoDB collection")
  except Exception as e:
    print("Error occurred:", str(e))

In [49]:
# Push the scraped articles in `df` to the MongoDB collection.
store_data(df)

Connection to MongoDB collection successful :)

files deleted
Data inserted/replaced successfully in MongoDB collection


# Create an API

In [50]:
from flask import Flask, jsonify, request

In [None]:
app = Flask(__name__)
# Collection handle shared by the route handlers. Credentials come from the
# MONGO_USERNAME / MONGO_PASSWORD environment variables instead of being
# written into the notebook; a missing variable raises KeyError here.
collection = connect_mongo_db_collection(
    os.environ['MONGO_USERNAME'],
    os.environ['MONGO_PASSWORD'],
    'Data_engineering_test',
    'WebSiteNews')


Connection to MongoDB collection successful :)



In [None]:
@app.route('/articles', methods=['GET'])
def get_articles():
    """Return every stored article as a JSON array, omitting MongoDB's ``_id`` field."""
    without_id = {'_id': 0}
    cursor = collection.find({}, without_id)
    return jsonify([document for document in cursor])

In [None]:
# `path` (unlike `string`) accepts values containing '/', which every article
# URL does; merge_slashes=False keeps the '//' after the scheme intact instead
# of redirecting to a collapsed path.
@app.route('/articles/<path:url>', methods=['GET'], merge_slashes=False)
def get_article(url):
    """Return the stored article whose link equals ``url``.

    Args:
        url: Article URL taken from the request path.

    Returns:
        The article as JSON without MongoDB's ``_id`` field, or a JSON error
        body with status 404 when no stored article has that link.
    """
    # Scraped documents keep the article URL under the 'Link' key (the column
    # name used when the DataFrame is built), not under 'url'.
    article = collection.find_one({'Link': url}, {'_id': 0})
    if article is None:
        return jsonify({'error': 'Article not found'}), 404
    return jsonify(article)

In [None]:
# Start Flask's built-in server with its default settings; the call does not
# return until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [1]:
from flask import Flask

# NOTE: this rebinds `app` to a brand-new Flask instance. When the cells are
# run top to bottom, the earlier `app` that carries the /articles routes is
# replaced by this one.
app = Flask(__name__)

@app.route("/")
def hello_world():
    """Minimal route that returns a static HTML greeting."""
    return "<p>Hello, World!</p>"

In [4]:
# Serve whichever Flask instance `app` currently refers to.
app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
