{Ayush Srivastava}-{eProcurement_GOI}-{10/Sep/23} #121

Open · wants to merge 7 commits into main
1,636 changes: 1,633 additions & 3 deletions data/sample.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions dummy-data-product/src/.env
@@ -0,0 +1,3 @@
geckodriver_path = "C:\Users\AYUSH\Downloads\geckodriver-v0.33.0-win64\geckodriver.exe"
url = "https://etenders.gov.in/eprocure/app?page=FrontEndListTendersbyDate&service=page"
csv_file_path = "tender_data.csv"
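For context, client.py reads these three values through python-dotenv; a minimal sketch of how they are consumed (the fallback default for the CSV path is illustrative, not part of the PR):

import os
import dotenv

# Pull the key/value pairs from .env into the process environment
dotenv.load_dotenv(".env")

geckodriver_path = os.getenv("geckodriver_path")               # Firefox driver binary
url = os.getenv("url")                                         # eProcurement tender listing page
csv_file_path = os.getenv("csv_file_path", "tender_data.csv")  # default here is illustrative
print(geckodriver_path, url, csv_file_path)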
105 changes: 97 additions & 8 deletions dummy-data-product/src/client.py
@@ -4,23 +4,45 @@
from datetime import datetime

# Importing scraping and data processing modules
# from dependencies.scraping.<file_name> import <class_name>
# from dependencies.scraping.<file_name> import <class_name>
# from dependencies.cleaning.<file_name> import <class_name>
# from dependencies.geocoding.<file_name> import <class_name>
# from dependencies.standardization.<file_name> import <class_name>
from src.dependencies.scraping.scraper import TenderScraper
from src.dependencies.cleaning.cleaner import *
from src.dependencies.geocoding.geocoder import LocationExtractor
from src.dependencies.standardization.standardizer import SectorExtractor

dotenv.load_dotenv(".env")
logging.basicConfig(level=logging.INFO)


import csv
import os
import pandas as pd
import re
# In each step create an object of the class, initialize the class with
# required configuration and call the run method
def step_1():
logging.info("Scraped Metadata")


def step_2():
geckodriver_path = os.getenv("geckodriver_path")
url = os.getenv("url")

scraper = TenderScraper(geckodriver_path, url)
scraper.start()
tender_data = scraper.scrape_tenders()

# Specify the CSV file path
csv_file_path = os.getenv("csv_file_path")

# Save the data to a CSV file
with open(csv_file_path, mode='w', newline='') as csv_file:
fieldnames = ["S.No", "e-Published Date", "Bid Submission Closing Date", "Tender Opening Date", "Title and Ref.No./Tender ID", "Details"]
writer = csv.writer(csv_file)
writer.writerow(fieldnames)
for tender in tender_data:
writer.writerow(tender)

print(f"Data saved to {csv_file_path}")

logging.info("Scraped Main Data")


@@ -29,11 +51,78 @@ def step_3():


def step_4():
logging.info("Geocoded Cleaned Data")
# Example usage
extractor = LocationExtractor()

try:
# Read CSV file into a pandas DataFrame
df = pd.read_csv(os.getenv("csv_file_path"))

# Initialize an empty list to store locations
locations_list = []

# Iterate over rows and process the "details" column
for _, row in df.iterrows():
details = row['Details']
locations = extractor.process_and_extract(details)

# Check if locations were found, and if not, add "NA"
if not locations:
locations = "NA"

locations_list.append(locations)

# Add the extracted locations as a new column "locations"
df['Locations'] = locations_list

# Save the modified DataFrame back to the CSV file
df.to_csv(os.getenv("csv_file_path"), index=False)

logging.info("Geocoded Cleaned Data")

except Exception as e:
logging.info(e)


def step_5():
logging.info("Standardized Geocoded Data")
try:
# Example usage
extractor = SectorExtractor()

# Read CSV file into a pandas DataFrame
df = pd.read_csv(os.getenv("csv_file_path"))

# Initialize lists to store sector and subsector
sector_list = []
subsector_list = []

# Iterate over rows and process the "Title and Ref.No./Tender ID" column
for index, row in df.iterrows():
title_ref = row['Title and Ref.No./Tender ID']
# Extract the first bracket value from the title_ref
match = re.search(r'\[(.*?)\]', title_ref)
if match:
title_ref = match.group(1)
else:
title_ref = ""

sector, subsector = extractor.extract_sector_subsector(title_ref)

# Check if sector and subsector were extracted, otherwise use "NA"
sector_list.append(sector if sector else "NA")
subsector_list.append(subsector if subsector else "NA")

# Add the extracted or "NA" sector and subsector to the DataFrame
df['Sector'] = sector_list
df['Subsector'] = subsector_list

# Save the modified DataFrame back to the CSV file
df.to_csv(os.getenv("csv_file_path"), index=False)

logging.info("Standardized Geocoded Data")

except Exception as e:
logging.info(e)


if __name__ == "__main__":
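The rest of the __main__ block is collapsed in this diff. Once step_2, step_4 and step_5 have run, a quick sanity check of the output CSV might look like this (a sketch, assuming the column names written above):

import os
import pandas as pd

# Columns written by step_2 plus the ones appended by step_4 and step_5
expected = ["S.No", "e-Published Date", "Bid Submission Closing Date",
            "Tender Opening Date", "Title and Ref.No./Tender ID", "Details",
            "Locations", "Sector", "Subsector"]

df = pd.read_csv(os.getenv("csv_file_path", "tender_data.csv"))
missing = [col for col in expected if col not in df.columns]
print(f"{len(df)} rows; missing columns: {missing or 'none'}")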
25 changes: 25 additions & 0 deletions dummy-data-product/src/dependencies/geocoding/geocoder.py
@@ -0,0 +1,25 @@
import spacy
import re

class LocationExtractor:
def __init__(self, model_path="en_core_web_sm"):
self.nlp = spacy.load(model_path)

def process_text(self, text):
# Replace non-alphabet characters with a blank space
cleaned_text = re.sub(r'[^a-zA-Z ]', ' ', text)
return cleaned_text.upper()

def extract_locations(self, text):
# Process the text
doc = self.nlp(text)

# Extract location entities
locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

return locations

def process_and_extract(self, text):
cleaned_text = self.process_text(text)
locations = self.extract_locations(cleaned_text)
return locations
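A short usage sketch for LocationExtractor; the sample sentence is invented, and because process_text uppercases the input before NER, en_core_web_sm may return fewer (or no) GPE entities than expected:

extractor = LocationExtractor()

# Invented example; real inputs are the scraped "Details" values
details = "Construction of approach road near Lucknow, Uttar Pradesh, Phase 2"
locations = extractor.process_and_extract(details)
print(locations or "NA")   # list of GPE strings, or the "NA" fallback used in step_4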
68 changes: 68 additions & 0 deletions dummy-data-product/src/dependencies/scraping/scraper.py
@@ -0,0 +1,68 @@
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


class TenderScraper:
def __init__(self, geckodriver_path, url):
self.geckodriver_path = geckodriver_path
self.url = url
        self.options = Options()
        # Options.headless is deprecated in Selenium 4; pass the -headless flag instead
        self.options.add_argument("-headless")
self.driver = None

def start(self):
        # Selenium 4.10+ removed the executable_path argument; the driver path goes through a Service
        self.driver = webdriver.Firefox(
            service=Service(executable_path=self.geckodriver_path), options=self.options)
self.driver.get(self.url)
self.driver.maximize_window()

def extract_tender_info(self, row_data):
sno = re.findall(r'\b\d+\.\s', row_data)
row_data = re.sub(r'\b\d+\.\s', '', row_data)
all_dates = re.findall(r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2} [APap][Mm]', row_data)
row_data = re.sub(r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2} [APap][Mm]', '', row_data)
two_brkt = re.findall(r'\[.*?\] \[.*?\]', row_data)
row_data = re.sub(r'\[.*?\] \[.*?\]', '', row_data)
last_brkt = re.findall(r'\[.*?\]', row_data)
row_data = re.sub(r'\[.*?\]', '', row_data)

        # Guard against header or malformed rows that lack the expected fields
        if not (sno and len(all_dates) >= 3 and two_brkt and last_brkt):
            return None

        return [sno[0].strip(), all_dates[0], all_dates[1], all_dates[2],
                f"{two_brkt[0]} {last_brkt[0]}", row_data.strip()]

def scrape_tenders(self):
try:
data = []
while True:
wait = WebDriverWait(self.driver, 10)
span_element = wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="LinkSubmit_1"]/span')))
span_element.click()

rows = wait.until(EC.presence_of_all_elements_located(
(By.CSS_SELECTOR, 'tr.odd, tr.even')))

for row in rows:
row_data = row.text

tender_info = self.extract_tender_info(row_data)

if tender_info:
data.append(tender_info)

link_element = self.driver.find_elements(By.ID, "linkFwd")
if not link_element:
break

link_element[0].click()

return data

except NoSuchElementException:
print("No more new data to load.")
finally:
self.driver.quit()
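To make the regex parsing in extract_tender_info concrete, here is a sketch with a made-up row string shaped the way the patterns above expect (serial number, three timestamps, three bracketed fields, then free text); it is not captured from the live site:

# Made-up row text; no browser is needed just to exercise the parser
scraper = TenderScraper(geckodriver_path="", url="")
row_data = ("1. 08-Sep-2023 10:00 AM 15-Sep-2023 03:00 PM 18-Sep-2023 11:00 AM "
            "[Road Repair Work] [2023_PWD_00001_1] [Tender ID 2023_PWD_00001] Public Works Department")
print(scraper.extract_tender_info(row_data))
# ['1.', '08-Sep-2023 10:00 AM', '15-Sep-2023 03:00 PM', '18-Sep-2023 11:00 AM',
#  '[Road Repair Work] [2023_PWD_00001_1] [Tender ID 2023_PWD_00001]', 'Public Works Department']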
35 changes: 35 additions & 0 deletions dummy-data-product/src/dependencies/standardization/standardizer.py
@@ -0,0 +1,35 @@
import spacy

class SectorExtractor:
def __init__(self, model_path="en_core_web_sm"):
# Load the spaCy language model
self.nlp = spacy.load(model_path)

def extract_sector_subsector(self, text):
# Process the text with spaCy
doc = self.nlp(text)

# Initialize variables to store sector and subsector
sector = None
subsector = None

# Define keywords that may indicate the start of the subsector
subsector_start_keywords = ["in", "for", "at", "to", "with", "by", "of"]

# Iterate through tokens in the text
for token in doc:
if token.ent_type_ == "ORG" and not sector:
# Check if the organization is a suitable sector
sector = token.text
elif token.ent_type_ != "ORG":
# Check if the token is in the subsector_start_keywords
if token.text.lower() in subsector_start_keywords:
# Look for subsector in the following tokens
subsector_tokens = []
for next_token in token.doc[token.i + 1:]:
if next_token.ent_type_ == "ORG":
break
subsector_tokens.append(next_token.text)
subsector = " ".join(subsector_tokens)

return sector, subsector
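A minimal usage sketch for SectorExtractor on the kind of bracketed title fragment step_5 feeds it; the input string is invented and the output depends entirely on what en_core_web_sm tags as ORG, so either value can come back as None:

extractor = SectorExtractor()

# Invented fragment of the kind pulled from "Title and Ref.No./Tender ID"
sector, subsector = extractor.extract_sector_subsector("Supply of transformers for State Electricity Board")
print(sector or "NA", "|", subsector or "NA")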
51 changes: 51 additions & 0 deletions dummy-data-product/src/requirements.txt
@@ -0,0 +1,51 @@
annotated-types==0.5.0
attrs==23.1.0
blis==0.7.10
catalogue==2.0.9
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
colorama==0.4.6
confection==0.1.3
cymem==2.0.7
exceptiongroup==1.1.3
h11==0.14.0
idna==3.4
Jinja2==3.1.2
langcodes==3.3.0
MarkupSafe==2.1.3
murmurhash==1.0.9
numpy==1.25.2
outcome==1.2.0
packaging==23.1
pandas==2.1.0
pathy==0.10.2
preshed==3.0.8
pycparser==2.21
pydantic==2.3.0
pydantic_core==2.6.3
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3.post1
requests==2.31.0
selenium==4.12.0
six==1.16.0
smart-open==6.4.0
sniffio==1.3.0
sortedcontainers==2.4.0
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.7
thinc==8.1.12
tqdm==4.66.1
trio==0.22.2
trio-websocket==0.10.4
typer==0.9.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
wasabi==1.1.2
wsproto==1.2.0
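One gap worth noting: geocoder.py and standardizer.py call spacy.load("en_core_web_sm"), but the model is not pinned here and installs separately from spaCy itself. A small sketch of fetching it at runtime (equivalent to running python -m spacy download en_core_web_sm):

import spacy

# Download the small English model on first run, then load it the same way
# LocationExtractor and SectorExtractor do.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")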