In [1]:
import requests
from bs4 import BeautifulSoup
import whois
from datetime import datetime
import pandas as pd
import re
import time
from urllib.parse import quote


In [2]:

# --- Constants ---
# Headers passed as `headers=` on every outbound HTTP request in this notebook.
# The User-Agent value is a desktop-browser style string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
# Pause, in seconds, that each checker passes to time.sleep() after a request.
DELAY = 2  # Seconds between requests

# --- Verification Functions ---

def check_mca_registration(company_name, timeout=10):
    """Ministry of Corporate Affairs verification.

    Queries the MCA company-search endpoint for ``company_name`` and
    classifies the response by looking for the portal's "no records" marker.

    Args:
        company_name: Name to search for; it is URL-quoted before use.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with ``source`` set to ``"MCA"`` and one of:
        * ``status="UNREGISTERED"``, ``confidence=0`` when the marker is found;
        * ``status="REGISTERED"``, ``confidence=90`` and a ``detail`` string
          when the request succeeded and the marker is absent;
        * ``error=<message>``, ``confidence=0`` when the request failed,
          timed out, or returned an HTTP error status.

    This function never raises; every exception becomes an ``error`` result.
    """
    try:
        url = f"https://www.mca.gov.in/mcafoportal/companySearch.do?companyName={quote(company_name)}"
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        time.sleep(DELAY)

        # Without this, a 4xx/5xx page (which never contains the "no records"
        # marker) would fall through and be reported as REGISTERED.
        response.raise_for_status()

        if "No matching records found" in response.text:
            return {"status": "UNREGISTERED", "confidence": 0, "source": "MCA"}

        # NOTE(review): absence of the marker is treated as a positive match;
        # a captcha or interstitial page served with HTTP 200 would also land
        # here — confirm against the live portal.
        return {
            "status": "REGISTERED",
            "confidence": 90,
            "source": "MCA",
            "detail": "Legally registered with MCA"
        }
    except Exception as e:
        return {"error": str(e), "confidence": 0, "source": "MCA"}



In [3]:
def check_msme_registration(company_name, timeout=10):
    """Udyam (MSME) registration verification.

    Loads the Udyam verification page (to pick up session cookies), posts the
    search form, and parses the result grid into a list of registrations.

    Args:
        company_name: Text submitted in the search box and used to pick the
            closest-named registration.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A dict with ``source`` set to ``"MSME"`` and one of:
        * ``status="REGISTERED"``, ``confidence=85``, plus ``detail``,
          ``registrations`` (all parsed rows) and ``best_match`` (the row
          whose name is most similar to ``company_name``);
        * ``status="UNREGISTERED"``, ``confidence=30`` when no result grid or
          no usable rows are present;
        * ``error=<message>``, ``confidence=0`` on any failure.

    This function never raises; every exception becomes an ``error`` result.
    """
    # Local stdlib import: the original referenced an undefined `fuzz`, so any
    # successful lookup died with NameError and was reported as an error.
    from difflib import SequenceMatcher

    search_url = "https://udyamregistration.gov.in/Udyam_Verify.aspx"
    try:
        # The context manager releases pooled connections when we are done.
        with requests.Session() as session:
            # First get the search page to handle cookies
            landing = session.get(search_url, headers=HEADERS, timeout=timeout)
            time.sleep(DELAY)
            landing.raise_for_status()

            # Then perform the search
            # NOTE(review): ASP.NET WebForms postbacks normally also need the
            # hidden __VIEWSTATE / __EVENTVALIDATION fields from the landing
            # page; they are not sent here — confirm the search really works.
            search_params = {
                "ctl00$MainContent$txtUdyam": company_name,
                "ctl00$MainContent$btnSearch": "Search"
            }
            response = session.post(
                search_url, data=search_params, headers=HEADERS, timeout=timeout
            )
            time.sleep(DELAY)
            # An HTTP error page has no result grid and would otherwise be
            # misread as "UNREGISTERED".
            response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        results_table = soup.find("table", {"id": "MainContent_GridView1"})

        if not results_table:
            return {"status": "UNREGISTERED", "confidence": 30, "source": "MSME"}

        # Extract registration details
        registrations = []
        for row in results_table.find_all("tr")[1:]:  # Skip header
            cols = row.find_all("td")
            if len(cols) < 6:
                continue  # not a data row with all six expected columns
            registrations.append({
                "udyam_number": cols[0].get_text(strip=True),
                "name": cols[1].get_text(strip=True),
                "type": cols[2].get_text(strip=True),
                "registration_date": cols[3].get_text(strip=True),
                "pan": cols[4].get_text(strip=True),
                "status": cols[5].get_text(strip=True)
            })

        if not registrations:
            return {"status": "UNREGISTERED", "confidence": 30, "source": "MSME"}

        # Only the ordering of similarity scores matters for max(), so the
        # 0..1 ratio from SequenceMatcher is a drop-in for a 0..100 score.
        target = company_name.lower()
        best_match = max(
            registrations,
            key=lambda reg: SequenceMatcher(None, reg["name"].lower(), target).ratio(),
        )
        return {
            "status": "REGISTERED",
            "confidence": 85,
            "source": "MSME",
            "detail": "Registered as MSME",
            "registrations": registrations,
            "best_match": best_match
        }
    except Exception as e:
        return {"error": str(e), "confidence": 0, "source": "MSME"}



In [4]:
def check_rbi_nbfc(company_name, timeout=10):
    """RBI NBFC verification.

    Downloads each RBI list page in turn and reports whether the company name
    occurs anywhere in the page text (case-insensitive substring match).

    Args:
        company_name: Name to look for; surrounding whitespace is ignored.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A dict with ``source`` set to ``"RBI"`` and one of:
        * ``is_registered=True``, ``confidence=95``, ``list`` and ``detail``
          for the first list page that contains the name;
        * ``is_registered=False``, ``confidence=70`` and ``detail`` when no
          list page contains it;
        * ``error=<message>``, ``confidence=0`` on any failure, including an
          empty ``company_name``.

    This function never raises; every exception becomes an ``error`` result.
    """
    try:
        needle = company_name.strip().upper()
        if not needle:
            # "" is a substring of every page, which would report any blank
            # input as a registered NBFC with 95% confidence.
            raise ValueError("company_name must not be empty")

        nbfc_lists = [
            ("Active NBFCs", "https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009"),
            ("Microfinance NBFCs", "https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2078")
        ]

        for list_name, list_url in nbfc_lists:
            response = requests.get(list_url, headers=HEADERS, timeout=timeout)
            time.sleep(DELAY)
            # A failed download must not count as "not in the list".
            response.raise_for_status()

            # Upper-case both sides; the original upper-cased only the name,
            # so mixed-case entries on the page could never match.
            if needle in response.text.upper():
                return {
                    "is_registered": True,
                    "confidence": 95,
                    "source": "RBI",
                    "list": list_name,
                    "detail": f"Found in RBI {list_name} registry"
                }

        return {
            "is_registered": False,
            "confidence": 70,
            "source": "RBI",
            "detail": "Not found in RBI NBFC lists"
        }
    except Exception as e:
        return {"error": str(e), "confidence": 0, "source": "RBI"}



In [5]:
def scrape_zauba(company_name, timeout=10):
    """Zauba Corp trade history check.

    Fetches the Zauba company page whose slug is the company name with spaces
    replaced by hyphens, and looks for a "No records found" text node.

    Args:
        company_name: Name used to build the page slug.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with ``source`` set to ``"Zauba"`` and one of:
        * ``has_trade_history=False``, ``confidence=30`` and ``detail`` when
          the "No records found" text is present;
        * ``has_trade_history=True``, ``confidence=80`` and ``detail`` when
          the page loaded successfully without that text;
        * ``error=<message>``, ``confidence=0`` on any failure, including an
          HTTP error status.

    This function never raises; every exception becomes an ``error`` result.
    """
    try:
        # Percent-encode the slug so characters such as '/', '?' or '#' in a
        # company name cannot change the path or truncate the URL.
        slug = quote(company_name.replace(' ', '-'), safe="")
        url = f"https://www.zauba.com/company-{slug}"
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        time.sleep(DELAY)

        # An error page (e.g. 404 for an unknown slug) lacks the "No records
        # found" text and would otherwise be reported as active trade history.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        no_data = soup.find(string=re.compile("No records found"))

        if no_data:
            return {
                "has_trade_history": False,
                "confidence": 30,
                "source": "Zauba",
                "detail": "No import/export records found"
            }
        return {
            "has_trade_history": True,
            "confidence": 80,
            "source": "Zauba",
            "detail": "Active trade history exists"
        }
    except Exception as e:
        return {"error": str(e), "confidence": 0, "source": "Zauba"}



In [6]:
def analyze_domain(company_name):
    """Domain registration analysis.

    Guesses the company's domain as ``<name>.com`` (name lower-cased and
    reduced to letters, digits and hyphens), runs a WHOIS lookup, and scores
    the domain by its age.

    Args:
        company_name: Name from which the ``.com`` domain is derived.

    Returns:
        On success, a dict with ``domain``, ``age_years`` (one decimal),
        ``registrar``, ``confidence`` (100 per year of age, capped at 100),
        ``source="WHOIS"`` and ``detail``.
        On any failure — the name yields no usable label, the lookup raises,
        or the record carries no creation date — a dict with
        ``error="Domain invalid"``, ``confidence=0``, ``source="WHOIS"`` and
        ``detail``.

    This function never raises for ordinary errors; unlike a bare ``except``
    it lets KeyboardInterrupt/SystemExit propagate.
    """
    failure = {
        "error": "Domain invalid",
        "confidence": 0,
        "source": "WHOIS",
        "detail": "Could not verify domain registration"
    }
    try:
        # Keep only characters that are legal in a hostname label; names such
        # as "Foo & Co." would otherwise produce an unresolvable domain.
        label = re.sub(r"[^a-z0-9-]", "", company_name.lower()).strip("-")
        if not label:
            return failure

        domain = f"{label}.com"
        w = whois.whois(domain)

        # The field may be a single value or a list of values.
        creation_date = w.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0] if creation_date else None
        if not isinstance(creation_date, datetime):
            # No usable creation date: report a failure rather than
            # "registered for 0.0 years with None".
            return failure

        # Match the timezone-awareness of the WHOIS value; subtracting an
        # aware datetime from a naive one raises TypeError.
        now = datetime.now(creation_date.tzinfo)
        age_days = max(0, (now - creation_date).days)
        age_years = round(age_days / 365, 1)

        return {
            "domain": domain,
            "age_years": age_years,
            "registrar": w.registrar,
            "confidence": min(100, age_days / 365 * 100),
            "source": "WHOIS",
            "detail": f"Domain registered for {age_years} years with {w.registrar}"
        }
    except Exception:
        return failure



In [None]:
def search_news(company_name, timeout=10):
    """News media scam check.

    Fetches the search/topic page for the company on each news site and flags
    the site if the word "scam" appears anywhere in the page (case-insensitive).

    Args:
        company_name: Name to search for; it is URL-quoted before use.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A dict with ``source`` set to ``"News Media"`` and either:
        * ``scam_reports`` (list of ``{"source", "url"}``), ``confidence``
          (100 minus 25 per flagged site, floored at 0) and ``detail``; or
        * ``error=<message>``, ``confidence=50`` when any request fails or
          returns an HTTP error status.

    This function never raises; every exception becomes an ``error`` result.
    """
    try:
        sources = [
            ("Livemint", f"https://www.livemint.com/search/{quote(company_name)}"),
            ("Economic Times", f"https://economictimes.indiatimes.com/topic/{quote(company_name)}")
        ]
        scam_reports = []

        for source_name, url in sources:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            time.sleep(DELAY)
            # A blocked or failed request must not be read as "no scam
            # coverage", which would score a clean 100.
            response.raise_for_status()

            # NOTE(review): this matches "scam" anywhere in the HTML, including
            # unrelated headlines or navigation — a coarse signal only.
            if "scam" in response.text.lower():
                scam_reports.append({"source": source_name, "url": url})

        confidence = 100 - len(scam_reports) * 25
        return {
            "scam_reports": scam_reports,
            "confidence": max(0, confidence),
            "source": "News Media",
            "detail": f"{len(scam_reports)} scam-related reports found" if scam_reports else "No scam reports found"
        }
    except Exception as e:
        return {"error": str(e), "confidence": 50, "source": "News Media"}



In [8]:
def generate_verification_report(company_name):
    """Run every verification check and fold the results into one report.

    Returns a dict with the company name, a verdict label, the composite
    score rounded to one decimal, the raw result of each check, and the
    weights (as percentage strings) that were applied.
    """
    # Execute the checks one by one; insertion order is the order in which
    # the findings are later iterated.
    checks = {}
    checks["mca"] = check_mca_registration(company_name)
    checks["msme"] = check_msme_registration(company_name)
    checks["rbi_nbfc"] = check_rbi_nbfc(company_name)
    checks["zauba"] = scrape_zauba(company_name)
    checks["domain"] = analyze_domain(company_name)
    checks["news"] = search_news(company_name)

    # Two weights depend on what the corresponding check found.
    msme_weight = 0.25 if checks["msme"].get("status") == "REGISTERED" else 0.15
    nbfc_weight = 0.3 if checks["rbi_nbfc"].get("is_registered", False) else 0.1
    weights = {
        "mca": 0.35,
        "msme": msme_weight,
        "rbi_nbfc": nbfc_weight,
        "domain": 0.2,
        "zauba": 0.15,
        "news": 0.05
    }

    # Weighted sum of confidences, then normalised by the total weight and
    # capped at 100.
    weighted_total = 0
    for key, check in checks.items():
        weighted_total += check.get("confidence", 0) * weights[key]
    composite_score = min(100, weighted_total * (1 / sum(weights.values())))

    # Pick the first band whose floor the score reaches; otherwise high risk.
    for floor, label in ((75, "✅ LEGITIMATE"), (50, "⚠️ NEEDS REVIEW")):
        if composite_score >= floor:
            verdict = label
            break
    else:
        verdict = "🚩 HIGH RISK"

    return {
        "company_name": company_name,
        "verdict": verdict,
        "composite_score": round(composite_score, 1),
        "detailed_checks": checks,
        "weighting_used": {k: f"{round(v*100)}%" for k, v in weights.items()}
    }



In [9]:
# --- Pretty Print Function ---
def print_detailed_report(report):
    """Print a verification report to stdout in a readable layout.

    Emits a banner with the company name, verdict and score, then the weight
    table, then one section per check. A check containing an ``"error"`` key
    prints only the error; otherwise its confidence is printed followed by
    every other field except ``confidence`` and ``source``. A list-valued
    ``registrations`` field is abbreviated to its first two entries.
    """
    rule = "=" * 50
    print(f"\n{rule}")
    print(f"VERIFICATION REPORT: {report['company_name']}")
    print(f"FINAL VERDICT: {report['verdict']} (Score: {report['composite_score']}/100)")
    print(f"{rule}\n")

    print("WEIGHTING USED:")
    for source, weight in report["weighting_used"].items():
        print(f"- {source.upper():<10}: {weight}")

    print("\nDETAILED FINDINGS:")
    already_shown = ("confidence", "source")
    for check_name, result in report["detailed_checks"].items():
        print(f"\n{check_name.upper()} ({result['source']}):")

        # Failed checks: show the error and move on.
        if "error" in result:
            print(f"  🔴 Error: {result['error']}")
            continue

        print(f"  🟢 Confidence: {result['confidence']}/100")
        for field, value in result.items():
            if field in already_shown:
                continue

            is_registration_list = field == "registrations" and isinstance(value, list)
            if not is_registration_list:
                print(f"  - {field.replace('_', ' ').title()}: {value}")
                continue

            print("  - Registrations:")
            for reg in value[:2]:  # Show first 2 registrations max
                print(f"    • {reg['name']} ({reg['udyam_number']})")
            hidden_count = len(value) - 2
            if hidden_count > 0:
                print(f"    • +{hidden_count} more...")


In [10]:
# --- Example Usage ---
# Prompts for a company name, prints its verification report, and writes the
# report to "<name>_report.json" in the current directory.
if __name__ == "__main__":
    company = input("Enter the company name: ").strip()
    if not company:
        # A blank name would still fire every network check and produce
        # meaningless matches, so stop before doing any work.
        print("No company name entered; nothing to verify.")
    else:
        report = generate_verification_report(company)
        print_detailed_report(report)
        time.sleep(DELAY * 3)  # Avoid rate limiting

        # Save to JSON
        # The name is user input: replace anything other than word characters,
        # spaces, dots and hyphens so it cannot contain path separators or
        # characters that are invalid in file names.
        safe_name = re.sub(r"[^\w .-]+", "_", company)
        report_path = f"{safe_name}_report.json"
        pd.DataFrame([report]).to_json(report_path, indent=2)
        print(f"\nReport saved to {report_path}\n{'='*50}\n")


VERIFICATION REPORT: KIA
FINAL VERDICT: ✅ LEGITIMATE (Score: 80.0/100)

WEIGHTING USED:
- MCA       : 35%
- MSME      : 15%
- RBI_NBFC  : 10%
- DOMAIN    : 20%
- ZAUBA     : 15%
- NEWS      : 5%

DETAILED FINDINGS:

MCA (MCA):
  🟢 Confidence: 90/100
  - Status: REGISTERED
  - Detail: Legally registered with MCA

MSME (MSME):
  🟢 Confidence: 30/100
  - Status: UNREGISTERED

RBI_NBFC (RBI):
  🟢 Confidence: 70/100
  - Is Registered: False
  - Detail: Not found in RBI NBFC lists

ZAUBA (Zauba):
  🟢 Confidence: 80/100
  - Has Trade History: True
  - Detail: Active trade history exists

DOMAIN (WHOIS):
  🟢 Confidence: 100/100
  - Domain: kia.com
  - Age Years: 28.5
  - Registrar: Megazone Corp., dba HOSTING.KR
  - Detail: Domain registered for 28.5 years with Megazone Corp., dba HOSTING.KR

NEWS (News Media):
  🟢 Confidence: 100/100
  - Scam Reports: []
  - Detail: No scam reports found

Report saved to KIA_report.json

