From 17e1abc496bc8d71c8e0091622b48507e2388082 Mon Sep 17 00:00:00 2001
From: Aditya Jetely
Date: Fri, 25 Sep 2020 19:57:57 +0530
Subject: [PATCH] PyCon Proposal Scraper

---
 .../Project_euler_scraper.py                  |  0
 .../README.md                                 |  0
 .../requirements.txt                          |  0
 .../result_snapshot.JPG                       | Bin
 .../Web_Scrappers/Pycon_Proposals/README.md   | 17 ++++
 .../Pycon_Proposals/pycon_proposals.py        | 77 ++++++++++++++++++
 .../Pycon_Proposals/requirements.txt          |  8 ++
 7 files changed, 102 insertions(+)
 rename Scripts/Web_Scrappers/{Project Euler Scraper => Project_Euler_Scraper}/Project_euler_scraper.py (100%)
 rename Scripts/Web_Scrappers/{Project Euler Scraper => Project_Euler_Scraper}/README.md (100%)
 rename Scripts/Web_Scrappers/{Project Euler Scraper => Project_Euler_Scraper}/requirements.txt (100%)
 rename Scripts/Web_Scrappers/{Project Euler Scraper => Project_Euler_Scraper}/result_snapshot.JPG (100%)
 create mode 100644 Scripts/Web_Scrappers/Pycon_Proposals/README.md
 create mode 100644 Scripts/Web_Scrappers/Pycon_Proposals/pycon_proposals.py
 create mode 100644 Scripts/Web_Scrappers/Pycon_Proposals/requirements.txt

diff --git a/Scripts/Web_Scrappers/Project Euler Scraper/Project_euler_scraper.py b/Scripts/Web_Scrappers/Project_Euler_Scraper/Project_euler_scraper.py
similarity index 100%
rename from Scripts/Web_Scrappers/Project Euler Scraper/Project_euler_scraper.py
rename to Scripts/Web_Scrappers/Project_Euler_Scraper/Project_euler_scraper.py
diff --git a/Scripts/Web_Scrappers/Project Euler Scraper/README.md b/Scripts/Web_Scrappers/Project_Euler_Scraper/README.md
similarity index 100%
rename from Scripts/Web_Scrappers/Project Euler Scraper/README.md
rename to Scripts/Web_Scrappers/Project_Euler_Scraper/README.md
diff --git a/Scripts/Web_Scrappers/Project Euler Scraper/requirements.txt b/Scripts/Web_Scrappers/Project_Euler_Scraper/requirements.txt
similarity index 100%
rename from Scripts/Web_Scrappers/Project Euler Scraper/requirements.txt
rename to Scripts/Web_Scrappers/Project_Euler_Scraper/requirements.txt
diff --git a/Scripts/Web_Scrappers/Project Euler Scraper/result_snapshot.JPG b/Scripts/Web_Scrappers/Project_Euler_Scraper/result_snapshot.JPG
similarity index 100%
rename from Scripts/Web_Scrappers/Project Euler Scraper/result_snapshot.JPG
rename to Scripts/Web_Scrappers/Project_Euler_Scraper/result_snapshot.JPG
diff --git a/Scripts/Web_Scrappers/Pycon_Proposals/README.md b/Scripts/Web_Scrappers/Pycon_Proposals/README.md
new file mode 100644
index 000000000..d148aa3ac
--- /dev/null
+++ b/Scripts/Web_Scrappers/Pycon_Proposals/README.md
@@ -0,0 +1,17 @@
+## Pycon_Proposals
+
+### This script scrapes the selected proposals and the total proposals from the PyCon website and stores them in two separate CSV files.
+
+### How to use this script?
+
+1. Make sure all the requirements for the script are present on your system by running:
+
+   pip install -r requirements.txt
+
+2. Run the following command:
+
+   python pycon_proposals.py
+
+### Author
+
+[Aditya Jetely](https://github.com/AdityaJ7)
\ No newline at end of file
diff --git a/Scripts/Web_Scrappers/Pycon_Proposals/pycon_proposals.py b/Scripts/Web_Scrappers/Pycon_Proposals/pycon_proposals.py
new file mode 100644
index 000000000..9a3ea680d
--- /dev/null
+++ b/Scripts/Web_Scrappers/Pycon_Proposals/pycon_proposals.py
@@ -0,0 +1,77 @@
+import requests
+from bs4 import BeautifulSoup as bs
+import pandas as pd
+
+
+def scrape_divs():
+    """This function scrapes all the proposal elements and returns them
+    in a list.
+    """
+    response = requests.get("https://in.pycon.org/cfp/2020/proposals/")
+    soup = bs(response.content, 'html.parser')
+    mydivs = soup.find_all("div", {"class": "col-sm-11 col-xs-12"})
+    return mydivs
+
+
+def selected_proposals(mydivs, df_columns):
+    """This function takes the list of selected proposal elements from the
+    scrape_divs function as well as a list of columns and stores the value
+    of the elements in a CSV file.
+    Args:
+        mydivs (list): List of proposal elements
+        df_columns (list): List of column names
+    """
+    final = {}
+    for i, div in enumerate(mydivs[:43]):
+        title = div.text
+        titlex = title.split("\n")
+        test_list = list(filter(lambda x: x != '', titlex))
+        no_of_votes = test_list[2]
+        no_of_messages = test_list[0]
+        title = test_list[4]
+        tag1 = test_list[5]
+        tag2 = test_list[7]
+        author = test_list[11].strip()
+        date = test_list[14].strip()
+        final[i] = [no_of_votes,
+                    no_of_messages, title, tag1, tag2, author, date]
+
+    df1 = pd.DataFrame.from_dict(final, orient='index')
+    df1.columns = df_columns
+    df1.to_csv('selected_proposals.csv')
+
+
+def total_proposals(mydivs, df_columns):
+    """This function takes the list of total proposal elements from the scrape_divs
+    function as well as a list of columns and stores the value of the
+    elements in a CSV file.
+    Args:
+        mydivs (list): List of proposal elements
+        df_columns (list): List of column names
+    """
+    final_two = {}
+    for i, div in enumerate(mydivs[43:]):
+        title = div.text
+        titlex = title.split("\n")
+        test_list = list(filter(lambda x: x != '', titlex))
+        no_of_votes = test_list[2]
+        no_of_messages = test_list[0]
+        title = test_list[4]
+        tag1 = test_list[6]
+        tag2 = test_list[8]
+        author = test_list[12].strip()
+        date = test_list[15].strip()
+        final_two[i] = [no_of_votes,
+                        no_of_messages, title, tag1, tag2, author, date]
+    df2 = pd.DataFrame.from_dict(final_two, orient='index')
+    df2.columns = df_columns
+    df2.to_csv('total_proposals.csv')
+
+
+if __name__ == "__main__":
+    df_columns = ['Votes',
+                  'Messages', 'Title', 'Tag1', 'Tag2', 'Author', 'Date']
+    mydivs = scrape_divs()
+    selected_proposals(mydivs, df_columns)
+    total_proposals(mydivs, df_columns)
+    print("The proposals have been saved successfully!!!")
diff --git a/Scripts/Web_Scrappers/Pycon_Proposals/requirements.txt b/Scripts/Web_Scrappers/Pycon_Proposals/requirements.txt
new file mode 100644
index 000000000..bc0ac9fb5
--- /dev/null
+++ b/Scripts/Web_Scrappers/Pycon_Proposals/requirements.txt
@@ -0,0 +1,8 @@
+beautifulsoup4==4.9.1
+certifi==2020.6.20
+chardet==3.0.4
+idna==2.10
+requests==2.24.0
+soupsieve==2.0.1
+urllib3==1.25.10
+pandas==1.1.2
\ No newline at end of file
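
A note on the parsing step in pycon_proposals.py: each proposal card's text is split on newlines and the fields (votes, messages, title, tags, author, date) are read from fixed positions, with the first 43 cards treated as the selected proposals. Any change in the PyCon page layout will therefore raise an IndexError mid-run. The sketch below shows one way the per-card extraction could be made defensive; it is illustrative only, and the helper name, the offsets parameter, and the decision to skip malformed cards are assumptions, not part of the submitted script.

    def extract_fields(div, offsets):
        """Pull one proposal card's fields out of its text, or return None
        if the card does not match the expected layout.

        offsets mirrors the fixed indices used in the script, e.g.
        (2, 0, 4, 5, 7, 11, 14) for the selected proposals and
        (2, 0, 4, 6, 8, 12, 15) for the remaining ones.
        """
        parts = [line for line in div.text.split("\n") if line != ""]
        try:
            fields = [parts[i].strip() for i in offsets]
        except IndexError:
            return None  # layout changed or card is malformed; caller can skip it
        return fields  # [votes, messages, title, tag1, tag2, author, date]

Filtering out the None results before building the DataFrame would let a layout change drop a few rows instead of aborting the whole run.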