**SOCKETS:**
In web development, a socket is a fundamental communication mechanism that allows two applications
(often a client and a server) to establish a connection and exchange data over a network. 
It's like a virtual telephone line that enables two parties to communicate.

In [2]:
# USING THE MANUAL WAY

import socket
import ssl

HOST = 'www.google.com'  # Server hostname or IP address
PORT = 443               # HTTPS port
TIMEOUT = 10             # Socket timeout in seconds

# Create an SSL context with the library's default settings
context = ssl.create_default_context()

server_address = (HOST, PORT)

# Build the request from HOST so the Host header can never drift from the
# server we actually connect to. The bytes are identical to the literal
# b'GET / HTTP/1.0\r\nHost: www.google.com\r\n\r\n' for the HOST above.
request_header = f'GET / HTTP/1.0\r\nHost: {HOST}\r\n\r\n'.encode('ascii')

# Collect raw bytes and decode once at the end: decoding every 1024-byte chunk
# on its own would silently drop any multi-byte character that happens to be
# split across two chunks (errors='ignore' hides the damage).
chunks = []

# The `with` blocks guarantee both sockets are closed even when connecting,
# the TLS handshake, or the transfer raises an exception.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:
    client_socket.settimeout(TIMEOUT)  # Set the timeout before wrapping

    # Wrap the socket with SSL using the context
    with context.wrap_socket(client_socket, server_hostname=HOST) as ssl_socket:
        # Connect to the server and send the HTTP GET request
        ssl_socket.connect(server_address)
        ssl_socket.sendall(request_header)

        # Receive the response until the peer stops sending or we time out
        while True:
            try:
                recv = ssl_socket.recv(1024)
            except socket.timeout:
                print("Socket connection timed out.")
                break
            if not recv:
                # Empty read: the server closed the connection
                break
            chunks.append(recv)

# Decode with 'ignore' to handle non-UTF-8 bytes
response = b''.join(chunks).decode('utf-8', errors='ignore')

# Print the response
print(response)


HTTP/1.0 200 OK
Date: Thu, 12 Sep 2024 14:53:50 GMT
Expires: -1
Cache-Control: private, max-age=0
Content-Type: text/html; charset=ISO-8859-1
Content-Security-Policy-Report-Only: object-src 'none';base-uri 'self';script-src 'nonce-RFkV5NGWt1k31FDF6IFslw' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/other-hp
Accept-CH: Sec-CH-Prefers-Color-Scheme
P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info."
Server: gws
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Set-Cookie: AEC=AVYB7coo3-ED3MQJKXt_AKn1IyE41bqpUJJyMat95n8C58Pll2vjDqjbbpo; expires=Tue, 11-Mar-2025 14:53:50 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax
Set-Cookie: NID=517=CbQOoInSTEtVD75yForNbN6RfS6g7m_pWfEyLHeYMPWhu89zYoihtcrVtziBcw83F6eLcx7mW2Qi7L2gG7E3mvb5FaukhzvEHlIM5INx-dWphi4F61c6iwPmxVxK_OxQZOgSve0BSU1zpKJj80JNfNedcKvMViJvAxKcOhxmJpqGiZWGtcG8idMMMEcKgu2T5UI; expires=Fri, 14-Mar-2025 14:53:50 GMT; path=/; domain=.

In [17]:
# USING THE urllib3

import urllib3
# A PoolManager hands out (and reuses) connections for the requests made through it
http = urllib3.PoolManager()

# Fetch the Google homepage and show the raw response body (bytes)
google_url = 'http://www.google.com'
r = http.request('GET', google_url)
print(r.data)

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en-NG"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="a408XAF7vgUxsfvPafYZKg">(function(){var _g={kEI:\'qw3jZr_sPOWp2roPlP69sAE\',kEXPI:\'0,793110,2907155,1116,3,48,541485,2891,8348,98972,162433,23024,6699,41949,57734,2,2,1,24626,2006,8155,23350,22436,9779,38678,23980,44508,31700,15816,1804,7734,27534,11814,1632,9710,3785,15784,27083,5213672,140,7,4,56,38,44,5991145,2841697,120,23936608,2773423,1270287,16672,43887,3,1603,3,2124363,23029351,7950,1,212,4636,11731,4705,40142,43903,11733,5093,5797,15164,8182,28845,20584,21675,2482,4267,155,2484,13503,7736,9140,4599,328,3217,4,1238,1766,13967,5706,409,519,4858,5,857,1669,5634,686,930,1810,3,3934,1119,3,53,24,2757,181,1923,772,4475,1350,6973,3547,683,659,5849,2005,5799,55,2212,2,9,8567,1250,3591,2379,1484,981,5

**GETTING ALL LINKS IN AN HTML PAGE**

In [18]:
from lxml import html

# We reuse the response from urllib3
# Decode the body of the earlier urllib3 response, dropping undecodable bytes
data_string = r.data.decode('utf-8', errors='ignore')

# Parse the markup into an lxml element tree
tree = html.fromstring(data_string)

# The XPath expression '//a' selects every anchor element in the document
# and returns them as a list of elements
links = tree.xpath('//a')

# Print the href attribute of each anchor, one per line
for href in (anchor.get('href') for anchor in links):
    print(href)

https://www.google.com/imghp?hl=en&tab=wi
http://maps.google.com.ng/maps?hl=en&tab=wl
https://play.google.com/?hl=en&tab=w8
https://www.youtube.com/?tab=w1
https://news.google.com/?tab=wn
https://mail.google.com/mail/?tab=wm
https://drive.google.com/?tab=wo
https://www.google.com.ng/intl/en/about/products?tab=wh
http://www.google.com.ng/history/optout?hl=en
/preferences?hl=en
https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://www.google.com/&ec=GAZAAQ
/advanced_search?hl=en-NG&authuser=0
http://www.google.com/setprefs?sig=0_QTqbUk3tYcDn4iCdx8yEnTl6VDA%3D&hl=ha&source=homepage&sa=X&ved=0ahUKEwj_7fb64L2IAxXllFYBHRR_DxYQ2ZgBCAY
http://www.google.com/setprefs?sig=0_QTqbUk3tYcDn4iCdx8yEnTl6VDA%3D&hl=ig&source=homepage&sa=X&ved=0ahUKEwj_7fb64L2IAxXllFYBHRR_DxYQ2ZgBCAc
http://www.google.com/setprefs?sig=0_QTqbUk3tYcDn4iCdx8yEnTl6VDA%3D&hl=yo&source=homepage&sa=X&ved=0ahUKEwj_7fb64L2IAxXllFYBHRR_DxYQ2ZgBCAg
http://www.google.com/setprefs?sig=0_QTqbUk3tYcDn4iCdx8yEnTl6V

In [26]:
http = urllib3.PoolManager()
r = http.request('GET', 'https://www.vanguardngr.com/')

# Do not silently treat every body as the real page: a site may answer an
# automated client with an error or an anti-bot interstitial (e.g. a
# "Just a moment..." page), in which case the link list below is meaningless.
if r.status != 200:
    print(f"Warning: server responded with HTTP {r.status}; "
          "the body below may not be the real page.")
print(r.data)


# We reuse the response from urllib3
data_string = r.data.decode('utf-8', errors='ignore')

# We instantiate a tree object from the HTML
tree = html.fromstring(data_string)

# We run the XPath against this HTML
# This returns a list of <a> elements
links = tree.xpath('//a')

for link in links:
    # For each element we can easily get back the URL
    print(link.get('href'))

b'<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}body{display:flex;flex-direction:column;height:100vh;min-height:100vh}body.no-js .loading-spinner{visibility:hidden}body.theme-dark{background-color:#222;color:#d9d9d9}body.theme-dark a{color:#fff}body.theme-dark a:hover{color:#ee730a;text-decoration:underline}body.theme-dark .lds-ring div{border-color:#999 transparent transparent}body.theme-dark .font-red{color:#b20f03}body.theme-da

**Using Requests & BeautifulSoup**

In [31]:
import requests

# Always pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server and the cell never finishes.
r = requests.get('https://www.plsmind.com', timeout=10)

# .text is the response body decoded to str
print(r.text)




<!DOCTYPE html>
<html style="overflow-x: hidden;">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">

    <title> Shop Anything - P.L.S. Mind </title>
    

<link rel="stylesheet" href="/static/css/dist/styles.css?v=1726157756">





    <meta name="description" content="An ecommerce website">
    <meta name="author" content="P.L.S. Mind">

    <meta name="keywords" class="m_keyword" content="ecommerce, e-commerce, buying and selliing, buy electrical items, best market place">
  
  <link rel="stylesheet" type="text/css" href="/static/css/homepage.css">
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" crossorigin="anonymous">
  <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-C6RzsynM9kWDrMNeT87bh95OGNyZPhcTNXj

In [40]:
import requests

# Path of the image on the site; the full URL is built from it so the two
# can never get out of sync.
pls_img_location = '/media/cache/5a/38/5a3839d4d32f9acc5e25e665972cc575.jpg'
url = 'https://www.plsmind.com' + pls_img_location

# Timeout so an unresponsive server cannot hang the cell
response = requests.get(url, timeout=10)

# Fail loudly on 4xx/5xx instead of saving an HTML error page as image.jpg
response.raise_for_status()

# .content is the raw bytes of the body, so the file is opened in binary mode
with open('image.jpg', 'wb') as file:
    file.write(response.content)

In [45]:
import requests
from bs4 import BeautifulSoup

# Timeout so an unresponsive server cannot hang the cell
r = requests.get('https://news.ycombinator.com', timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')

# Each story is a <tr class="athing"> row; find_all is the current name of
# the deprecated findAll alias.
links = soup.find_all('tr', class_='athing')

formatted_links = []

for link in links:
    # Look the row's cells up once instead of once per field
    cells = link.find_all('td')
    title_anchor = cells[2].a  # first <a> inside the third cell
    data = {
        'id': link['id'],
        'title': title_anchor.text,
        "url": title_anchor['href'],
        # The first cell's <span> holds the rank as text such as "1."
        "rank": int(cells[0].span.text.replace('.', ''))
    }
    formatted_links.append(data)

print(formatted_links)
print(len(formatted_links))

# Slicing prints at most 15 entries and, unlike indexing 0..14, does not
# raise IndexError when fewer than 15 stories were scraped.
for entry in formatted_links[:15]:
    print(entry)

[{'id': '41521919', 'title': 'Show HN: iFixit created a new USB-C, repairable soldering system', 'url': 'https://hackaday.com/2024/09/12/review-ifixits-fixhub-may-be-the-last-soldering-iron-you-ever-buy/', 'rank': 1}, {'id': '41520516', 'title': 'GAZEploit: Remote keystroke inference attack by gaze estimation in VR/MR devices', 'url': 'https://www.wired.com/story/apple-vision-pro-persona-eye-tracking-spy-typing/', 'rank': 2}, {'id': '41521002', 'title': 'If I could dissect a sauropod', 'url': 'https://svpow.com/2024/09/12/if-i-could-dissect-a-sauropod/', 'rank': 3}, {'id': '41519240', 'title': 'Kolmogorov-Arnold networks may make neural networks more understandable', 'url': 'https://www.quantamagazine.org/novel-architecture-makes-neural-networks-more-understandable-20240911/', 'rank': 4}, {'id': '41522196', 'title': 'Can LLMs Generate Novel Research Ideas?', 'url': 'https://www.arxiv.org/abs/2409.04109', 'rank': 5}, {'id': '41518600', 'title': 'Why Haskell?', 'url': 'https://www.gtf.io

**BeautifulSoup Tutorial**

In [51]:
from bs4 import BeautifulSoup
import requests

In [52]:
url = 'https://www.google.com/'

# Pass a timeout so an unresponsive server cannot hang the cell indefinitely
page = requests.get(url, timeout=10)

In [54]:
print(page)  # the Response repr shows the HTTP status code, e.g. <Response [200]>

<Response [200]>


In [60]:
# Name the parser explicitly. 'html' is only a feature name, so Beautiful Soup
# would guess a parser from whatever happens to be installed, and the result
# could differ between machines. lxml is already used by this notebook for the
# XPath example, and is the parser Beautiful Soup prefers when it is installed.
soup = BeautifulSoup(page.text, 'lxml')
soup

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-NG"><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Google</title><script nonce="M4adrLbjWvszkgvnxvTNRw">(function(){var _g={kEI:'h1TjZv3aGpCs4-EPkZydgQg',kEXPI:'0,56801,736543,2906925,1112,2,49,538613,2872,2891,8348,34680,64288,60058,102379,23024,6700,28950,12998,54822,2912,2,2,1,26632,8155,8860,14490,22436,9779,62657,44508,31701,15816,1804,7734,39348,1635,29276,27083,5203198,12,6217,4245,140,8,2,94,45,5991145,2841697,58,62,27980318,16672,43887,3,1603,3,2124363,23029351,8163,4636,14986,1450,49022,35023,13314,5954,3355,12952,2212,3756,4426,14872,3003,31554,21669,6755,155,1759,5,720,13503,7736,7042,2098,4599,160,168,546,2671,4,1238,1766,1117,6469,5,12083,409,518,5720,1669,5633,688,2739,3,3931,617,505,3,3015,2695,4479,6082,5011,774,581,101,664,5401,441,7861,2212,2,9,8567,4841,2381,1

In [58]:
# Print an indented, one-tag-per-line rendering of the parsed document for readability

print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-NG">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="M4adrLbjWvszkgvnxvTNRw">
   (function(){var _g={kEI:'h1TjZv3aGpCs4-EPkZydgQg',kEXPI:'0,56801,736543,2906925,1112,2,49,538613,2872,2891,8348,34680,64288,60058,102379,23024,6700,28950,12998,54822,2912,2,2,1,26632,8155,8860,14490,22436,9779,62657,44508,31701,15816,1804,7734,39348,1635,29276,27083,5203198,12,6217,4245,140,8,2,94,45,5991145,2841697,58,62,27980318,16672,43887,3,1603,3,2124363,23029351,8163,4636,14986,1450,49022,35023,13314,5954,3355,12952,2212,3756,4426,14872,3003,31554,21669,6755,155,1759,5,720,13503,7736,7042,2098,4599,160,168,546,2671,4,1238,1766,1117,6469,5,12083,409,518,5720,1669,5633,688,2739,3,3931,617,505,3,3015,2695,4479,6082,5011,774,581,101,664,5401,441,7861,

**FIND AND FIND ALL**

In [61]:
url = 'https://www.scrapethissite.com/pages/forms/'

# Pass a timeout so an unresponsive server cannot hang the cell indefinitely
page = requests.get(url, timeout=10)

# Name the parser explicitly. 'html' is only a feature name, so Beautiful Soup
# would guess a parser from whatever happens to be installed. lxml is already
# used by this notebook for the XPath example.
soup = BeautifulSoup(page.text, 'lxml')
soup


<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robot

In [62]:
soup.find('div')  # find() returns only the first <div> element in the document

<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

In [63]:
soup.find_all('div')  # find_all() returns a list of every <div> element in the document

[<div class="container">
 <div class="col-md-12">
 <ul class="nav nav-tabs">
 <li id="nav-homepage">
 <a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>
 </li>
 <li id="nav-sandbox">
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>
 </li>
 <li id="nav-lessons">
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>
 </li>
 <li id="nav-faq">
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>
 </li>
 <li class="pull-right" id="nav-login">
 <a class="nav-link" href="/login/">
        

**Finding with multiple arguments**

In [64]:
# Combine a tag name with a class filter: every <tr> whose class is "team"
soup.find_all('tr', class_='team')

[<tr class="team">
 <td class="name">
                             Boston Bruins
                         </td>
 <td class="year">
                             1990
                         </td>
 <td class="wins">
                             44
                         </td>
 <td class="losses">
                             24
                         </td>
 <td class="ot-losses">
 </td>
 <td class="pct text-success">
                             0.55
                         </td>
 <td class="gf">
                             299
                         </td>
 <td class="ga">
                             264
                         </td>
 <td class="diff text-success">
                             35
                         </td>
 </tr>,
 <tr class="team">
 <td class="name">
                             Buffalo Sabres
                         </td>
 <td class="year">
                             1990
                         </td>
 <td class="wins">
                             3

In [65]:
# Same pattern with a different tag: every <p> whose class is "lead"
soup.find_all('p', class_='lead')

[<p class="lead">
                             Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                             Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                         </p>]

**Getting the text**