In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [2]:
icps = [
    {
        "industry": ["Healthcare Tech", "MedTech", "AI in Healthcare", "Wearable Tech"],
        "engagement_rate": "65-95",
        "company_size_employees": "100–800",
        "annual_revenue_usd": "10M–40M",
        "headquarters_location": "India",
        "technology_stack": ["Python", "AWS", "Kubernetes", "TensorFlow", "Edge AI", "FHIR", "IoT"],
        "target_designations": ["Chief Medical Officer", "CTO", "Head of AI", "Director of Product"],
        "pain_points": [
            "Medical device integration", "Data privacy compliance", "AI model explainability", "Real-time patient monitoring"
        ]
    },
    {
        "industry": ["FinTech", "Banking Tech", "Payments", "Blockchain"],
        "engagement_rate": "70-100",
        "company_size_employees": "300–2000",
        "annual_revenue_usd": "20M–100M",
        "headquarters_location": "India",
        "technology_stack": ["Blockchain", "React", "Node.js", "AWS", "Kafka", "Python", "Microservices"],
        "target_designations": ["Head of Payments", "VP of Engineering", "CTO", "Product Director"],
        "pain_points": [
            "Transaction latency", "Regulatory complexity", "Fraud detection automation", "Blockchain scalability"
        ]
    },
    {
        "industry": ["Manufacturing", "Industrial Automation", "IoT", "Robotics"],
        "engagement_rate": "55-85",
        "company_size_employees": "500–5000",
        "annual_revenue_usd": "50M–500M",
        "headquarters_location": "India",
        "technology_stack": ["IoT", "SCADA", "Edge Computing", "ROS", "C++", "Python", "Azure"],
        "target_designations": ["Operations Head", "VP of Engineering", "Automation Lead", "Chief Digital Officer"],
        "pain_points": [
            "Predictive maintenance", "Factory automation", "Legacy system modernization", "Data interoperability"
        ]
    },
    {
        "industry": ["Gaming", "Entertainment Tech", "AR/VR", "Cloud Gaming"],
        "engagement_rate": "80-100",
        "company_size_employees": "50–500",
        "annual_revenue_usd": "5M–50M",
        "headquarters_location": "India",
        "technology_stack": ["Unity", "Unreal Engine", "C++", "AWS", "Kubernetes", "WebRTC", "VR/AR SDKs"],
        "target_designations": ["CTO", "VP of Product", "Head of Game Development", "Lead Engineer"],
        "pain_points": [
            "Low latency streaming", "Cross-platform performance", "Scalability under peak load", "Monetization challenges"
        ]
    },
    {
        "industry": ["Logistics Tech", "Supply Chain", "Mobility", "Fleet Management"],
        "engagement_rate": "60-90",
        "company_size_employees": "200–1500",
        "annual_revenue_usd": "15M–80M",
        "headquarters_location": "India",
        "technology_stack": ["GPS Tracking", "IoT", "Java", "React", "AWS", "PostgreSQL", "Microservices"],
        "target_designations": ["VP of Operations", "CTO", "Fleet Manager", "Product Head"],
        "pain_points": [
            "Route optimization", "Asset tracking visibility", "Fuel cost management", "Predictive maintenance"
        ]
    }
]


icp_templates = {f"ICP{i+1}": icp for i, icp in enumerate(icps)}


In [3]:
import re

def parse_range(text):
    text = str(text).strip().replace("–", "-").replace(" ", "")
    if "M+" in text:
        base = float(text.replace("M+", "")) * 1e6
        return (base, float('inf'))
    elif "M" in text:
        nums = [float(x) * 1e6 for x in re.findall(r'[\d.]+', text)]
    elif "-" in text:
        nums = [int(x) for x in re.findall(r'\d+', text)]
    elif text.isdigit():
        return (int(text), int(text))
    else:
        return (0, float('inf'))

    if len(nums) == 1:
        return (nums[0], nums[0])
    return (nums[0], nums[-1])

def range_overlap(range1, range2):
    low = max(range1[0], range2[0])
    high = min(range1[1], range2[1])
    if low > high:
        return 0
    return (high - low) / (max(range1[1], range2[1]) - min(range1[0], range2[0]) + 1e-6)

def jaccard(list1, list2):
    set1 = set([x.strip().lower() for x in list1])
    set2 = set([x.strip().lower() for x in list2])
    if not set1 or not set2:
        return 0
    return len(set1 & set2) / len(set1 | set2)

def match_icp(input_icp, icp_templates):
    best_match = None
    best_score = -1

    for icp_name, icp in icp_templates.items():
        industry_score = jaccard(input_icp.get("industry", []), icp.get("industry", []))
        size_score = range_overlap(
            parse_range(input_icp.get("company_size_employees", "")),
            parse_range(icp.get("company_size_employees", ""))
        )
        revenue_score = range_overlap(
            parse_range(input_icp.get("annual_revenue_usd", "")),
            parse_range(icp.get("annual_revenue_usd", ""))
        )
        tech_score = jaccard(input_icp.get("technology_stack", []), icp.get("technology_stack", []))
        title_score = jaccard(input_icp.get("target_designations", []), icp.get("target_designations", []))
        country_score = 1 if input_icp.get("headquarters_location", "").lower() == icp.get("headquarters_location", "").lower() else 0
        engagement_score = range_overlap(
            parse_range(input_icp.get("engagement_rate", "")),
            parse_range(icp.get("engagement_rate", ""))
        )
        pain_score = jaccard(input_icp.get("pain_points", []), icp.get("pain_points", []))

        total_score = (industry_score + size_score + revenue_score + tech_score +
                       title_score + country_score + engagement_score + pain_score) / 8

        if total_score > best_score:
            best_score = total_score
            best_match = icp_name

    return best_match, round(best_score, 3)

In [4]:
class ICPNet(nn.Module):
    def __init__(self):
        super(ICPNet,self).__init__()
        self.fc1=nn.Linear(14,24)
        self.relu=nn.ReLU()
        self.fc2=nn.Linear(24,1)
        self.sig=nn.Sigmoid()
    def forward(self,x):
        x=self.fc1(x)
        x=self.relu(x)
        x=self.fc2(x)
        x=self.sig(x)
        x=x.squeeze()
        return x;

In [5]:

def train_and_get_icp_contacts(csv_path, icp_column="ICP3"):
    contacts = pd.read_csv(csv_path)
    contacts[icp_column]=contacts[icp_column].fillna(contacts[icp_column].median())
    X = contacts.drop(columns=["first_name","last_name","city","state","company_address","company_phone","ICP1","ICP2","ICP3","ICP4","ICP5"])

    categorical_cols = ['title', 'industry', 'keywords', 'technologies','company','seniority','country','departments','pain_points']
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str)) 
        label_encoders[col] = le

    Y = contacts[icp_column]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.1, random_state=42, stratify=Y
    )

    numeric_cols = [col for col in X.columns if col not in categorical_cols]
    scaler = StandardScaler()
    X_train.loc[:, numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test.loc[:, numeric_cols] = scaler.transform(X_test[numeric_cols]) 

    X_train_np = X_train.values
    X_test_np = X_test.values
    Y_train_np = Y_train.values
    Y_test_np = Y_test.values
    X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32, requires_grad=True)
    Y_train_tensor = torch.tensor(Y_train_np, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32, requires_grad=True)
    Y_test_tensor = torch.tensor(Y_test_np, dtype=torch.long)

    model= ICPNet()
    criterion=nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epochs = 1000
    for epoch in range(epochs):
        model.train()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, Y_train_tensor.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        y_pred_classes = (y_pred >= 0.5).float().numpy().flatten()


    alpha=Y_test_tensor
    z=y_pred_classes
    k=len(alpha)
        
    accuracy = accuracy_score(Y_test, y_pred_classes)
    print(f"Model Accuracy on Test Set for {icp_column}: {accuracy * 100:.2f}%")

    selected_indices = np.where(y_pred_classes == 1)[0]

    selected_contacts = contacts.iloc[X_test.index].iloc[selected_indices]
    contact_dicts = selected_contacts[[
        "first_name", "last_name", "title", "company", "company_phone", "city", "state", "country"
    ]].to_dict(orient="records")

    print(f"\nNumber of selected contacts from {icp_column}: {len(contact_dicts)}")
    for d in contact_dicts[:5]:
        print(d)


    

In [6]:
input_icp  = {
    "industry": ["Fleet Management", "Transport Analytics"],
    "engagement_rate": "65-85",
    "company_size_employees": "300–1200",
    "annual_revenue_usd": "20M–50M",
    "headquarters_location": "India",
    "technology_stack": ["IoT", "Java", "React", "Microservices", "PostgreSQL"],
    "target_designations": ["Product Head", "CTO", "Mobility Lead"],
    "pain_points": ["Asset tracking visibility", "Fuel monitoring", "Route planning"]
}
other= {
    "industry": ["FinTech", "Banking Tech", "Payments", "Blockchain"],
    "engagement_rate": "98",
    "company_size_employees": "1600",
    "annual_revenue_usd": "90M",
    "headquarters_location": "India",
    "technology_stack": ["Blockchain", "React", "Node.js", "AWS", "Kafka", "Python", "Microservices"],
    "target_designations": ["CTO"],
    "pain_points": [
        "Transaction latency",
        "Regulatory complexity",
        "Fraud detection automation",
        "Blockchain scalability"
    ]
}

input_icp_matched=match_icp(other,icp_templates)
print(input_icp_matched[1])
train_and_get_icp_contacts("noisy_icp_dataset.csv", input_icp_matched[0])


0.531


 -0.57646935 -0.86070606  0.53932181  0.48131431 -0.85422286 -0.51470843
  2.16728498 -0.5706686   0.63588722  0.50281121 -0.354676   -0.8415977
 -0.88288539 -0.69146067  1.96869463  0.31036283 -0.75731623 -0.0639561
 -0.68293016 -0.40551786  2.16626132 -0.826584   -0.81941837 -0.88390905
  1.70254261 -0.53449922 -0.86514192  0.30183231 -0.94362265 -0.81259396
 -0.78700242 -0.51812064  0.86450498  1.04023355 -0.84364503 -0.87913196
 -0.83067865 -0.60854408 -0.17997108 -0.82180691 -0.60001357 -0.62492266
 -0.87333121 -0.61195628 -0.2035153  -0.64505468  1.34323739 -0.46625511
  1.34869692 -0.87606098  0.93411397  1.6407817  -0.23251905 -0.94635241
 -0.89585177  0.4693716  -0.27585406  0.06502526  1.17365078 -0.47205586
 -0.45567728  1.70220139 -0.25196862 -0.83306719 -0.84125648  1.18013397
 -0.88595637  0.09880609  2.10381797 -0.90642961  1.48074927 -0.40005833
  2.36792266 -0.63857149 -0.17075813  1.96937707 -0.79587415 -0.7235354
  1.28557112  2.27340457  1.50599959 -0.81634739 -0.30

Model Accuracy on Test Set for ICP2: 83.33%

Number of selected contacts from ICP2: 6
{'first_name': 'Anil', 'last_name': 'Reddy', 'title': 'VP of Operations', 'company': 'Company511', 'company_phone': '+91 9545894604', 'city': 'Hyderabad', 'state': 'Telangana', 'country': 'India'}
{'first_name': 'Ritu', 'last_name': 'Patel', 'title': 'VP of Operations', 'company': 'Company574', 'company_phone': '+91 9153076494', 'city': 'Pune', 'state': 'Maharashtra', 'country': 'India'}
{'first_name': 'Raj', 'last_name': 'Patel', 'title': 'Head of Payments', 'company': 'Company872', 'company_phone': '+91 7323275742', 'city': 'Hyderabad', 'state': 'Telangana', 'country': 'India'}
{'first_name': 'Kiran', 'last_name': 'Nair', 'title': 'QA Tester', 'company': 'Company525', 'company_phone': '+91 9875598385', 'city': 'Bangalore', 'state': 'Karnataka', 'country': 'India'}
{'first_name': 'Raj', 'last_name': 'Singh', 'title': 'VP of Product', 'company': 'Company565', 'company_phone': '+91 7491844695', 'city':