# ICT MINI PROJECT

## SEO - Search Engine Optimization

## Module 1 : Scraping Google Search Results

In [None]:
pip install requests_html

In [None]:
pip install bs4

### Importing required modules for the project

In [2]:
import json
import os
import urllib
import urllib.parse

import pandas
import requests
from bs4 import BeautifulSoup

### Main code : Implementation of the application

In [3]:
def search_google(search_query=None, timeout=10):
    """Scrape a Google results page and collect SEO data for each result link.

    The function fetches the Google search page for the query, extracts the
    result links with ``parse_data``, runs ``extract_info`` on every link,
    prints the collected data (and any links that failed) and finally hands
    the data to ``store_results``.

    Parameters
    ----------
    search_query : str, optional
        Text to search for. When omitted the user is prompted for it, which
        is the original interactive behaviour.
    timeout : float, optional
        Seconds to wait for the Google request before giving up, so that a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    None
        Results are printed and saved rather than returned.
    """
    if search_query is None:
        print("Search Google : ")
        search_query = input()

    query = urllib.parse.quote_plus(search_query)

    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
    url = 'https://www.google.com/search?q={query}'.format(query=query)
    response = requests.get(url, headers=headers, timeout=timeout)

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): "div.g" is presumably the container Google uses for one
    # search result; confirm against the current markup.
    result_blocks = soup.find_all("div", {"class": "g"})

    results = parse_data(result_blocks)

    seo_info_hold = []
    err_links = []
    for result in results:
        try:
            seo_info_hold.append(extract_info(result["link"]))
        except Exception:
            # Best effort: one unreachable page must not abort the whole run.
            # ``Exception`` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during a long scrape.
            err_links.append(result["link"])

    print("Scraped SEO results : \n")
    display_results(seo_info_hold)

    if err_links:
        print("\nCan't able to scrape these links : \n")
        display_results(err_links)

    store_results(seo_info_hold)

### Retrieve the top 10 links of the Google Search based on the given query string

In [4]:
def parse_data(allData):
    """Extract the link and title from each search-result element.

    Parameters
    ----------
    allData : iterable
        Result elements (BeautifulSoup tags in this notebook). Each element
        must offer ``find(name)``; the found anchor must offer ``get('href')``
        and the found heading must expose ``text``.

    Returns
    -------
    list of dict
        One ``{"link": ..., "title": ...}`` dict per accepted result, in the
        original order. ``title`` is None when the element has no ``<h3>``.
    """
    parsed = []
    for result in allData:
        anchor = result.find('a')
        if anchor is None:
            # A result block without any anchor used to raise AttributeError
            # and abort the whole parse; skip it instead.
            continue

        link = anchor.get('href')
        if link is None:
            continue

        # Keep only links that start with "http" and contain "https"; links
        # containing "aclk" are dropped (presumably Google ad-click
        # redirects - TODO confirm).
        if not (link.startswith('http') and 'https' in link and 'aclk' not in link):
            continue

        heading = result.find('h3')
        parsed.append({
            "link": link,
            "title": heading.text if heading is not None else None,
        })
    return parsed

### Display the given data in a formatted way

In [5]:
def display_results(json_data):
    """Pretty-print ``json_data`` to standard output as JSON.

    The data is serialised with a two-space indent, so it must be
    JSON-serialisable (dicts, lists, strings, numbers, None, ...).
    """
    formatted = json.dumps(json_data, indent=2)
    print(formatted)

## Module 2 : Scraping each URL to extract SEO information

### Scraping SEO information for the links scraped from Google Search

In [6]:
def _attr_or_none(tag, attribute):
    """Return ``tag[attribute]``, or None when the tag or attribute is absent."""
    if tag is None:
        return None
    return tag.get(attribute)


def extract_info(URL, timeout=10):
    """Download a page and collect its on-page SEO metadata.

    Parameters
    ----------
    URL : str
        Address of the page to inspect.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one
        unresponsive site cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        Keys, in this order: ``link``, ``metatitle``, ``metadescription``,
        ``robots directives`` (list of directives), ``viewport``, ``charset``,
        ``html language``, ``canonical``, ``list hreflangs`` (list of
        ``[href, hreflang]`` pairs) and ``mobile alternate``. A value is None
        when the page lacks the corresponding tag or attribute.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` when the page cannot be fetched.
    """
    html = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(html.text, 'html.parser')

    title_tag = soup.find('title')
    robots = _attr_or_none(soup.find('meta', attrs={'name': 'robots'}), 'content')

    seo_info = {
        "link": URL,
        "metatitle": title_tag.get_text() if title_tag is not None else None,
        "metadescription": _attr_or_none(
            soup.find('meta', attrs={'name': 'description'}), 'content'),
        # Strip each directive so "index, follow" yields ['index', 'follow']
        # rather than ['index', ' follow'].
        "robots directives": (
            [directive.strip() for directive in robots.split(",")]
            if robots is not None else None),
        "viewport": _attr_or_none(
            soup.find('meta', attrs={'name': 'viewport'}), 'content'),
        "charset": _attr_or_none(
            soup.find('meta', attrs={'charset': True}), 'charset'),
        "html language": _attr_or_none(soup.find('html'), 'lang'),
        "canonical": _attr_or_none(
            soup.find('link', attrs={'rel': 'canonical'}), 'href'),
        # find_all only returns tags that carry both attributes, so the
        # lookups below cannot fail.
        "list hreflangs": [
            [tag['href'], tag['hreflang']]
            for tag in soup.find_all('link', href=True, hreflang=True)],
        "mobile alternate": _attr_or_none(
            soup.find('link', attrs={'media': 'only screen and (max-width: 640px)'}),
            'href'),
    }

    return seo_info

#### Output :

In [7]:
# Example run against a single page; this performs a live HTTP request.
extract_info("https://www.candy.com")

{'link': 'https://www.candy.com',
 'metatitle': 'Home - Candy',
 'metadescription': 'Fandom, reimagined. Only at Candy.com',
 'robots directives': ['index', 'follow'],
 'viewport': 'initial-scale=1.0, width=device-width',
 'charset': 'utf-8',
 'html language': 'en',
 'canonical': None,
 'list hreflangs': [],
 'mobile alternate': None}

## Module 3 : Tabulating the results

In [8]:
def store_results(data, file_name=None):
    """Save the scraped records to an Excel workbook.

    Parameters
    ----------
    data : list of dict
        Records to tabulate; each dict becomes one row.
    file_name : str, optional
        Destination path. When omitted the user is prompted for it, which is
        the original interactive behaviour. A name without an extension gets
        ``.xlsx`` appended, because pandas selects the Excel engine from the
        file extension.

    Returns
    -------
    None
        Success or failure is reported on standard output; a failed save
        does not raise.
    """
    df = pandas.DataFrame(data)

    try:
        if file_name is None:
            print("\nEnter a file name to save your data : ")
            file_name = input()

        if not os.path.splitext(file_name)[1]:
            file_name = file_name + ".xlsx"

        # The context manager closes (and thereby writes) the workbook.
        # ``ExcelWriter.save()`` and ``to_excel(encoding=...)`` were removed
        # in pandas 2.0, so the previous calls always failed there.
        with pandas.ExcelWriter(file_name) as writer:
            df.to_excel(writer, index=False)
        print("Your data successfully saved in " + file_name)
    except Exception as exc:
        # Best effort: report the failure and its cause instead of crashing.
        print("Error : Can't able to save data to excel file.")
        print("Reason : {}".format(exc))

In [9]:
# Run the full pipeline; prompts for the search query and the output file name.
search_google()

Search Google : 
information technology
Scraped SEO results : 

[
  {
    "link": "https://en.wikipedia.org/wiki/Information_technology",
    "metatitle": "Information technology - Wikipedia",
    "metadescription": null,
    "robots directives": [
      "max-image-preview:standard"
    ],
    "viewport": "width=1000",
    "charset": "UTF-8",
    "html language": "en",
    "canonical": "https://en.wikipedia.org/wiki/Information_technology",
    "list hreflangs": [],
    "mobile alternate": null
  },
  {
    "link": "https://www.techtarget.com/searchdatacenter/definition/IT",
    "metatitle": "What is Information Technology? Definition and Examples",
    "metadescription": "Learn what information technology is, the role of software and hardware, the difference between IT and computer science, IT job functions and careers.",
    "robots directives": [
      "noodp"
    ],
    "viewport": "width=device-width,initial-scale=1",
    "charset": "utf-8",
    "html language": "en",
    "canonic