{Ayush Srivastava}-{eProcurement_GOI}-{10/Sep/23} #121

Open · wants to merge 7 commits into main
1,636 changes: 1,633 additions & 3 deletions data/sample.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions dummy-data-product/src/.env
@@ -0,0 +1,3 @@
geckodriver_path = "C:\Users\AYUSH\Downloads\geckodriver-v0.33.0-win64\geckodriver.exe"
url = "https://etenders.gov.in/eprocure/app?page=FrontEndListTendersbyDate&service=page"
csv_file_path = "tender_data.csv"
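For context, client.py reads these three values through python-dotenv; a minimal sketch of how they are consumed (the fallback default for the CSV path is illustrative, not part of the PR):

import os
import dotenv

# Pull the key/value pairs from .env into the process environment
dotenv.load_dotenv(".env")

geckodriver_path = os.getenv("geckodriver_path")               # Firefox driver binary
url = os.getenv("url")                                         # eProcurement tender listing page
csv_file_path = os.getenv("csv_file_path", "tender_data.csv")  # default here is illustrative
print(geckodriver_path, url, csv_file_path)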
105 changes: 97 additions & 8 deletions dummy-data-product/src/client.py
@@ -4,23 +4,45 @@
from datetime import datetime

# Importing scraping and data processing modules
# from dependencies.scraping.<file_name> import <class_name>
# from dependencies.scraping.<file_name> import <class_name>
# from dependencies.cleaning.<file_name> import <class_name>
# from dependencies.geocoding.<file_name> import <class_name>
# from dependencies.standardization.<file_name> import <class_name>
from src.dependencies.scraping.scraper import TenderScraper
from src.dependencies.cleaning.cleaner import *
from src.dependencies.geocoding.geocoder import LocationExtractor
from src.dependencies.standardization.standardizer import SectorExtractor

dotenv.load_dotenv(".env")
logging.basicConfig(level=logging.INFO)


import csv
import os
import pandas as pd
import re
# In each step create an object of the class, initialize the class with
# required configuration and call the run method
def step_1():
logging.info("Scraped Metadata")


def step_2():
geckodriver_path = os.getenv("geckodriver_path")
url = os.getenv("url")

scraper = TenderScraper(geckodriver_path, url)
scraper.start()
tender_data = scraper.scrape_tenders()

# Specify the CSV file path
csv_file_path = os.getenv("csv_file_path")

# Save the data to a CSV file
with open(csv_file_path, mode='w', newline='') as csv_file:
fieldnames = ["S.No", "e-Published Date", "Bid Submission Closing Date", "Tender Opening Date", "Title and Ref.No./Tender ID", "Details"]
writer = csv.writer(csv_file)
writer.writerow(fieldnames)
for tender in tender_data:
writer.writerow(tender)

print(f"Data saved to {csv_file_path}")

logging.info("Scraped Main Data")


@@ -29,11 +51,78 @@ def step_3():


def step_4():
logging.info("Geocoded Cleaned Data")
# Example usage
extractor = LocationExtractor()

try:
# Read CSV file into a pandas DataFrame
df = pd.read_csv(os.getenv("csv_file_path"))

# Initialize an empty list to store locations
locations_list = []

# Iterate over rows and process the "details" column
for _, row in df.iterrows():
details = row['Details']
locations = extractor.process_and_extract(details)

# Check if locations were found, and if not, add "NA"
if not locations:
locations = "NA"

locations_list.append(locations)

# Add the extracted locations as a new column "locations"
df['Locations'] = locations_list

# Save the modified DataFrame back to the CSV file
df.to_csv(os.getenv("csv_file_path"), index=False)

logging.info("Geocoded Cleaned Data")

except Exception as e:
logging.info(e)


def step_5():
logging.info("Standardized Geocoded Data")
try:
# Example usage
extractor = SectorExtractor()

# Read CSV file into a pandas DataFrame
df = pd.read_csv(os.getenv("csv_file_path"))

# Initialize lists to store sector and subsector
sector_list = []
subsector_list = []

# Iterate over rows and process the "Title and Ref.No./Tender ID" column
for index, row in df.iterrows():
title_ref = row['Title and Ref.No./Tender ID']
# Extract the first bracket value from the title_ref
match = re.search(r'\[(.*?)\]', title_ref)
if match:
title_ref = match.group(1)
else:
title_ref = ""

sector, subsector = extractor.extract_sector_subsector(title_ref)

# Check if sector and subsector were extracted, otherwise use "NA"
sector_list.append(sector if sector else "NA")
subsector_list.append(subsector if subsector else "NA")

# Add the extracted or "NA" sector and subsector to the DataFrame
df['Sector'] = sector_list
df['Subsector'] = subsector_list

# Save the modified DataFrame back to the CSV file
df.to_csv(os.getenv("csv_file_path"), index=False)

logging.info("Standardized Geocoded Data")

except Exception as e:
logging.info(e)


if __name__ == "__main__":
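The rest of the __main__ block is collapsed in this diff. Once step_2, step_4 and step_5 have run, a quick sanity check of the output CSV might look like this (a sketch, assuming the column names written above):

import os
import pandas as pd

# Columns written by step_2 plus the ones appended by step_4 and step_5
expected = ["S.No", "e-Published Date", "Bid Submission Closing Date",
            "Tender Opening Date", "Title and Ref.No./Tender ID", "Details",
            "Locations", "Sector", "Subsector"]

df = pd.read_csv(os.getenv("csv_file_path", "tender_data.csv"))
missing = [col for col in expected if col not in df.columns]
print(f"{len(df)} rows; missing columns: {missing or 'none'}")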
25 changes: 25 additions & 0 deletions dummy-data-product/src/dependencies/geocoding/geocoder.py
@@ -0,0 +1,25 @@
import spacy
import re

class LocationExtractor:
def __init__(self, model_path="en_core_web_sm"):
self.nlp = spacy.load(model_path)

def process_text(self, text):
# Replace non-alphabet characters with a blank space
cleaned_text = re.sub(r'[^a-zA-Z ]', ' ', text)
return cleaned_text.upper()

def extract_locations(self, text):
# Process the text
doc = self.nlp(text)

# Extract location entities
locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

return locations

def process_and_extract(self, text):
cleaned_text = self.process_text(text)
locations = self.extract_locations(cleaned_text)
return locations
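A short usage sketch for LocationExtractor; the sample sentence is invented, and because process_text uppercases the input before NER, en_core_web_sm may return fewer (or no) GPE entities than expected:

extractor = LocationExtractor()

# Invented example; real inputs are the scraped "Details" values
details = "Construction of approach road near Lucknow, Uttar Pradesh, Phase 2"
locations = extractor.process_and_extract(details)
print(locations or "NA")   # list of GPE strings, or the "NA" fallback used in step_4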
68 changes: 68 additions & 0 deletions dummy-data-product/src/dependencies/scraping/scraper.py
@@ -0,0 +1,68 @@
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


class TenderScraper:
def __init__(self, geckodriver_path, url):
self.geckodriver_path = geckodriver_path
self.url = url
        self.options = Options()
        # Options.headless is deprecated in Selenium 4; pass the -headless flag instead
        self.options.add_argument("-headless")
self.driver = None

def start(self):
        # Selenium 4.10+ removed the executable_path argument; the driver path goes through a Service
        self.driver = webdriver.Firefox(
            service=Service(executable_path=self.geckodriver_path), options=self.options)
self.driver.get(self.url)
self.driver.maximize_window()

def extract_tender_info(self, row_data):
sno = re.findall(r'\b\d+\.\s', row_data)
row_data = re.sub(r'\b\d+\.\s', '', row_data)
all_dates = re.findall(r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2} [APap][Mm]', row_data)
row_data = re.sub(r'\d{1,2}-[A-Za-z]{3}-\d{4} \d{2}:\d{2} [APap][Mm]', '', row_data)
two_brkt = re.findall(r'\[.*?\] \[.*?\]', row_data)
row_data = re.sub(r'\[.*?\] \[.*?\]', '', row_data)
last_brkt = re.findall(r'\[.*?\]', row_data)
row_data = re.sub(r'\[.*?\]', '', row_data)

        # Guard against header or malformed rows that lack the expected fields
        if not (sno and len(all_dates) >= 3 and two_brkt and last_brkt):
            return None

        return [sno[0].strip(), all_dates[0], all_dates[1], all_dates[2],
                f"{two_brkt[0]} {last_brkt[0]}", row_data.strip()]

def scrape_tenders(self):
try:
data = []
while True:
wait = WebDriverWait(self.driver, 10)
span_element = wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="LinkSubmit_1"]/span')))
span_element.click()

rows = wait.until(EC.presence_of_all_elements_located(
(By.CSS_SELECTOR, 'tr.odd, tr.even')))

for row in rows:
row_data = row.text

tender_info = self.extract_tender_info(row_data)

if tender_info:
data.append(tender_info)

link_element = self.driver.find_elements(By.ID, "linkFwd")
if not link_element:
break

link_element[0].click()

return data

except NoSuchElementException:
print("No more new data to load.")
finally:
self.driver.quit()
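To make the regex parsing in extract_tender_info concrete, here is a sketch with a made-up row string shaped the way the patterns above expect (serial number, three timestamps, three bracketed fields, then free text); it is not captured from the live site:

# Made-up row text; no browser is needed just to exercise the parser
scraper = TenderScraper(geckodriver_path="", url="")
row_data = ("1. 08-Sep-2023 10:00 AM 15-Sep-2023 03:00 PM 18-Sep-2023 11:00 AM "
            "[Road Repair Work] [2023_PWD_00001_1] [Tender ID 2023_PWD_00001] Public Works Department")
print(scraper.extract_tender_info(row_data))
# ['1.', '08-Sep-2023 10:00 AM', '15-Sep-2023 03:00 PM', '18-Sep-2023 11:00 AM',
#  '[Road Repair Work] [2023_PWD_00001_1] [Tender ID 2023_PWD_00001]', 'Public Works Department']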
35 changes: 35 additions & 0 deletions dummy-data-product/src/dependencies/standardization/standardizer.py
@@ -0,0 +1,35 @@
import spacy

class SectorExtractor:
def __init__(self, model_path="en_core_web_sm"):
# Load the spaCy language model
self.nlp = spacy.load(model_path)

def extract_sector_subsector(self, text):
# Process the text with spaCy
doc = self.nlp(text)

# Initialize variables to store sector and subsector
sector = None
subsector = None

# Define keywords that may indicate the start of the subsector
subsector_start_keywords = ["in", "for", "at", "to", "with", "by", "of"]

# Iterate through tokens in the text
for token in doc:
if token.ent_type_ == "ORG" and not sector:
# Check if the organization is a suitable sector
sector = token.text
elif token.ent_type_ != "ORG":
# Check if the token is in the subsector_start_keywords
if token.text.lower() in subsector_start_keywords:
# Look for subsector in the following tokens
subsector_tokens = []
for next_token in token.doc[token.i + 1:]:
if next_token.ent_type_ == "ORG":
break
subsector_tokens.append(next_token.text)
subsector = " ".join(subsector_tokens)

return sector, subsector
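A minimal usage sketch for SectorExtractor on the kind of bracketed title fragment step_5 feeds it; the input string is invented and the output depends entirely on what en_core_web_sm tags as ORG, so either value can come back as None:

extractor = SectorExtractor()

# Invented fragment of the kind pulled from "Title and Ref.No./Tender ID"
sector, subsector = extractor.extract_sector_subsector("Supply of transformers for State Electricity Board")
print(sector or "NA", "|", subsector or "NA")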
51 changes: 51 additions & 0 deletions dummy-data-product/src/requirements.txt
@@ -0,0 +1,51 @@
annotated-types==0.5.0
attrs==23.1.0
blis==0.7.10
catalogue==2.0.9
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
colorama==0.4.6
confection==0.1.3
cymem==2.0.7
exceptiongroup==1.1.3
h11==0.14.0
idna==3.4
Jinja2==3.1.2
langcodes==3.3.0
MarkupSafe==2.1.3
murmurhash==1.0.9
numpy==1.25.2
outcome==1.2.0
packaging==23.1
pandas==2.1.0
pathy==0.10.2
preshed==3.0.8
pycparser==2.21
pydantic==2.3.0
pydantic_core==2.6.3
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3.post1
requests==2.31.0
selenium==4.12.0
six==1.16.0
smart-open==6.4.0
sniffio==1.3.0
sortedcontainers==2.4.0
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.7
thinc==8.1.12
tqdm==4.66.1
trio==0.22.2
trio-websocket==0.10.4
typer==0.9.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
wasabi==1.1.2
wsproto==1.2.0
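One gap worth noting: geocoder.py and standardizer.py call spacy.load("en_core_web_sm"), but the model is not pinned here and installs separately from spaCy itself. A small sketch of fetching it at runtime (equivalent to running python -m spacy download en_core_web_sm):

import spacy

# Download the small English model on first run, then load it the same way
# LocationExtractor and SectorExtractor do.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")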