<a href="https://colab.research.google.com/github/Takudzwamz/second_project/blob/main/AutomatedDataCollection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.
### Install packages


In [1]:
!python3 -m pip install beautifulsoup4
!python3 -m pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 2.
### i. Разработка простейшей модели поискового робота с классическим алгоритмом сбора и обработки данных в сети Веб 1.0/Веб 2.0.
### ii. Автоматизированный сбор данных с помощью простейшей модели поискового робота на основе специализированного алгоритма обхода на примере сайтов СПбГУ и МГУ – для Веб 1.0.
### iii. Сбор статистики обработанных страниц для Веб 1.0: общее количество страниц и всех ссылок, количество внутренних страниц, количество неработающих страниц, количество внутренних поддоменов, общее количество ссылок на внешние ресурсы, количество уникальных внешних ресурсов, количество уникальных ссылок на файлы doc/docx/pdf.

In [45]:
%%file uni_crawler.py

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import pytest

# URLs to crawl
urls = ['https://spbu.ru/#', 'https://vsu.ru', 'https://www.msu.ru/']

# Seconds to wait for a server before giving up. requests applies no timeout
# by default, so one unresponsive site could otherwise hang the whole run.
REQUEST_TIMEOUT = 30

# Initialize empty containers to store crawled links
all_links = []       # every collected link, duplicates included
internal_links = []  # links attributed to the site being crawled
external_links = []  # absolute links that do not contain the start URL
doc_links = []       # relative links ending in doc/docx/pdf
subdomains = set()   # host part of absolute links classified as internal

# Loop through URLs and crawl links.
# Only the start page of each site is fetched; discovered links are not followed.
for url in urls:
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # An unreachable site must not abort the crawl of the remaining ones.
        print(f'Skipping {url}: {exc}')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        href = link.get('href')
        if not href:
            # <a> tags without an href (or with an empty one) carry no link.
            continue
        if href.startswith('http'):
            all_links.append(href)
            # Internal means "the start URL occurs inside the href" (plain
            # substring test), so e.g. a different scheme or a "www." prefix
            # makes a link count as external.
            if url in href:
                internal_links.append(href)
                # Element 2 of 'scheme://host/path'.split('/') is the host.
                subdomains.add(href.split('/')[2])
            else:
                external_links.append(href)
        elif href.endswith(('doc', 'docx', 'pdf')):
            # Relative document links are stored as found, without the site prefix.
            doc_links.append(href)
            all_links.append(href)
        else:
            # Relative links are resolved by plain concatenation, which can
            # produce '//' in the path; the tests in this file expect that form.
            all_links.append(url + href)
            internal_links.append(url + href)

# Count statistics
num_pages = len(all_links)
num_internal_pages = len(internal_links)
# NOTE: despite the name, this is the number of duplicate entries in all_links
# (total minus unique); no link is requested to check whether it is broken.
num_broken_pages = num_pages - len(set(all_links))
num_subdomains = len(subdomains)
num_external_resources = len(external_links)
num_unique_external_resources = len(set(external_links))
num_unique_doc_links = len(set(doc_links))

# Create a single-row dataframe with statistics
df = pd.DataFrame({
    'Num Pages': [num_pages],
    'Num Internal Pages': [num_internal_pages],
    'Num Broken Pages': [num_broken_pages],
    'Num Internal Subdomains': [num_subdomains],
    'Num External Resources': [num_external_resources],
    'Num Unique External Resources': [num_unique_external_resources],
    'Num Unique Doc Links': [num_unique_doc_links]
})

# Save results and statistics to files
results_dir = 'results'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(results_dir, exist_ok=True)
df.to_csv(os.path.join(results_dir, 'statistics.csv'), index=False)
with open(os.path.join(results_dir, 'links.txt'), 'w') as f:
    f.write('\n'.join(all_links))

# Test Scenario 1: Verify that the program collects all links from the MSU website
def test_msu_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://www.msu.ru/#' in links
    assert 'https://www.msu.ru//press/' in links
    assert 'https://www.msu.ru//ad/' in links
    assert 'https://www.msu.ru//science/' in links

# Test Scenario 2: Verify that the program collects all links from the MSU website
def test_vsu_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://vsu.ru/' in links
    assert 'https://vsu.ru/en' in links
    assert 'http://www.vsu.ru/sveden/' in links
    assert 'http://www.abitur.vsu.ru/' in links

# Test Scenario 3: Failing Test for https://spbu.ru, is protected making it hard for the crawler to crawl it
# Tried Multiple solutions but the https://spbu.ru is not being crawled.
def test_spbu_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://spbu.ru/#' in links
    

# Test Scenario 4: Verify that the program collects all internal links from the MSU website
def test_msu_internal_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://www.msu.ru//science/ad/' in links
    assert 'https://www.msu.ru//news/' in links
    assert 'https://www.msu.ru//address/' in links
    assert 'https://www.msu.ru//address/' in links

# Test Scenario 5: Verify that the program collects all external links from the MSU website
def test_msu_external_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://zen.yandex.ru/msu_official' in links
    assert 'https://t.me/naukamsu' in links
    assert 'http://vk.com/dnevnik_msu' in links
    assert 'http://vk.com/msu_official' in links

# Test Scenario 4: Verify that the program collects all internal links from the MSU website
def test_vsu_internal_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://vsu.ru/sveden/document' in links
    assert 'https://vsu.ru/sveden/common' in links
    assert 'https://vsu.ru/sveden/vacant' in links
    assert 'https://vsu.ru/ru/university/partnership/' in links

# Test Scenario 5: Verify that the program collects all external links from the MSU website
def test_vsu_external_links():
    with open('results/links.txt', 'r') as f:
        links = f.read().splitlines()
    assert 'https://t.me/vsumain' in links
    assert 'http://vk.com/vsumain' in links
    assert 'http://www.youtube.com/user/VSUPRESS?feature=watch' in links


# Test Scenario 6: Verify that the program collects the correct statistics for the MSU website
def test_all_statistics():
    df = pd.read_csv('results/statistics.csv')
    assert df.loc[0, 'Num Pages'] == 564
    assert df.loc[0, 'Num Internal Pages'] == 385
    assert df.loc[0, 'Num Broken Pages'] == 213
    assert df.loc[0, 'Num Internal Subdomains'] == 0
    assert df.loc[0, 'Num External Resources'] == 166
    assert df.loc[0, 'Num Unique External Resources'] == 100
    assert df.loc[0, 'Num Unique Doc Links'] == 12

# Test Scenario 7: Verify that the program creates the results directory and files
def test_results_directory_and_files():
    assert os.path.exists('results')
    assert os.path.exists('results/links.txt')
    assert os.path.exists('results/statistics.csv')



Overwriting uni_crawler.py


### Run Pytests


In [46]:
# pytest imports uni_crawler.py to collect the tests, which executes its module-level
# crawl (network access, writes results/) before any test function runs.
!python3 -m  pytest uni_crawler.py

platform linux -- Python 3.9.16, pytest-7.2.2, pluggy-1.0.0
rootdir: /content
plugins: anyio-3.6.2
collected 9 items                                                              [0m

uni_crawler.py [32m.[0m[32m.[0m[31mF[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31m                                                 [100%][0m

[31m[1m_______________________________ test_spbu_links ________________________________[0m

    [94mdef[39;49;00m [92mtest_spbu_links[39;49;00m():[90m[39;49;00m
        [94mwith[39;49;00m [96mopen[39;49;00m([33m'[39;49;00m[33mresults/links.txt[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mr[39;49;00m[33m'[39;49;00m) [94mas[39;49;00m f:[90m[39;49;00m
            links = f.read().splitlines()[90m[39;49;00m
>       [94massert[39;49;00m [33m'[39;49;00m[33mhttps://spbu.ru/#[39;49;00m[33m'[39;49;00m [95min[39;49;00m links[90m[39;49;00m
[1m[31mE       AssertionError: assert 'https://spbu.ru/#' in ['ht