In [None]:
import pandas as pd
import numpy as np
import spacy
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
import json
from collections import defaultdict

In [None]:
# Generate summary corpus

# spaCy English pipeline: provides the tokens and the stop-word / punctuation
# flags that prepare_text relies on
nlp = spacy.load('en_core_web_sm')

# BART model and tokenizer, both taken from the same pretrained checkpoint
bart_checkpoint = 'facebook/bart-large-cnn'
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
print("done")

# Read the profile spreadsheet and keep only its first 500 rows
file_path = '/content/drive/MyDrive/BTech_Project/data.xlsx'
data = pd.read_excel(file_path).head(500)

# Function to clean and prepare text for summarization
def prepare_text(row):
    """Build a profile paragraph for one person and clean it for summarization.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the columns
            'LinkedIn Name', 'Description', 'Organisation', 'Location',
            'Industry', 'Current Role(s)', 'Tenure at Company' and 'About'.

    Returns:
        str: The paragraph's tokens joined by single spaces, with stop words
        and punctuation removed (as flagged by the module-level ``nlp``
        pipeline).

    Missing cells (None / NaN) are left out of the paragraph. Interpolating
    them directly would put the literal word "nan" into the text, which then
    leaks into the generated summary. When every field is present the
    paragraph is identical to the fixed four-sentence template.
    """
    def value(column):
        # Return the raw cell, or None when the cell is missing.
        cell = row[column]
        return None if pd.isna(cell) else cell

    name = value('LinkedIn Name')
    description = value('Description')
    organisation = value('Organisation')
    location = value('Location')
    industry = value('Industry')
    role = value('Current Role(s)')
    tenure = value('Tenure at Company')
    about = value('About')

    sentences = []

    # Sentence 1: who they are and where they work
    if name is not None and (description is not None or organisation is not None):
        sentence = f"{name} is currently working"
        if description is not None:
            sentence += f" as {description}"
        if organisation is not None:
            sentence += f" at {organisation}"
        sentences.append(sentence + ".")

    # Sentence 2: location and industry
    if location is not None and industry is not None:
        sentences.append(f"Based in {location}, they are part of the {industry} industry.")
    elif location is not None:
        sentences.append(f"Based in {location}.")
    elif industry is not None:
        sentences.append(f"They are part of the {industry} industry.")

    # Sentence 3: current role and tenure
    if role is not None and tenure is not None:
        sentences.append(
            f"In their current role as {role}, they have been with the company for {tenure}."
        )
    elif role is not None:
        sentences.append(f"Their current role is {role}.")
    elif tenure is not None:
        sentences.append(f"They have been with the company for {tenure}.")

    # Sentence 4: free-text background
    if about is not None:
        sentences.append(f"Their background includes: {about}.")

    # Same line layout as the original template: each sentence on its own
    # indented line, with a leading and trailing newline.
    separator = "\n    "
    text = separator + separator.join(sentences) + separator

    # Clean the text using NLP (removing stop words, punctuation, etc.)
    doc = nlp(text)
    return " ".join(
        token.text for token in doc if not token.is_stop and not token.is_punct
    )

# Function to generate a detailed and advanced summarized corpus using BART
def generate_bart_summary(text):
    """Summarize ``text`` with the module-level BART model and tokenizer.

    Args:
        text: The string to summarize. It is tokenized with truncation enabled
            and ``max_length=1024``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    encoded = bart_tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
    # Beam search with 4 beams; generation is limited to max_length=150
    generated = bart_model.generate(
        encoded["input_ids"],
        num_beams=4,
        max_length=150,
        early_stopping=True,
    )
    return bart_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to create a summarized corpus for each person
def generate_summarized_corpus(row):
    """Return the BART summary of one person's cleaned profile text.

    Args:
        row: Record passed straight through to ``prepare_text``.

    Returns:
        Whatever ``generate_bart_summary`` returns for the prepared text.
    """
    # Clean first, then summarize the cleaned text
    return generate_bart_summary(prepare_text(row))

# Create a summarized corpus for each person in the dataset
data['Corpus'] = data.apply(generate_summarized_corpus, axis=1)

# Number of people (nodes)
num_people = len(data)

# Adjacency list to store connections
adjacency_list = defaultdict(list)

# Simulate connections using a normal distribution
np.random.seed(42)
connections_per_person = np.random.normal(loc=5, scale=2, size=num_people).astype(int)
connections_per_person = np.clip(connections_per_person, 1, num_people - 1)  # Ensure valid number of connections

# Generate adjacency lists.
# Neighbours are drawn from everyone EXCEPT the person themselves, so each
# person receives exactly the sampled number of connections. Drawing from the
# full population and discarding the self-loop afterwards would leave some
# people one connection short (or with none at all at the lower bound of 1).
# Rows are addressed by position, so this does not depend on the DataFrame
# index being 0..n-1.
all_positions = np.arange(num_people)
names = data['LinkedIn Name'].tolist()
for position, name in enumerate(names):
    candidates = np.delete(all_positions, position)
    # Never request more neighbours than there are other people
    wanted = min(int(connections_per_person[position]), len(candidates))
    neighbours = adjacency_list[name]
    if wanted > 0:
        chosen = np.random.choice(candidates, size=wanted, replace=False)
        neighbours.extend(names[c] for c in chosen)

# Displaying a sample of the summarized corpus and adjacency list
sample_corpus = data[['LinkedIn Name', 'Corpus']].head(3)
sample_adjacency_list = {k: adjacency_list[k] for k in list(adjacency_list.keys())[:3]}

print("Sample Summarized Corpus:")
print(sample_corpus)

print("\nSample Adjacency List:")
for name, connections in sample_adjacency_list.items():
    print(f"{name} -> {', '.join(connections)}")


In [None]:
# Generate adjacency list from generated summary corpus

from collections import defaultdict
import json

# Load the pre-summarized corpus
file_path = '/content/drive/MyDrive/BTech_Project/summarized_corpus.csv'
data = pd.read_csv(file_path)

# Number of people (nodes)
num_people = len(data)

# Adjacency list to store connections
adjacency_list = defaultdict(list)

# Simulate connections using a normal distribution (15-20 connections per person)
np.random.seed(42)
connections_per_person = np.random.normal(loc=17.5, scale=2, size=num_people).astype(int)
connections_per_person = np.clip(connections_per_person, 15, 20)  # Ensure between 15 and 20 connections

# Generate adjacency lists.
# Neighbours are drawn from everyone EXCEPT the person themselves, so the
# 15-20 range above actually holds. Drawing from the full population and
# discarding the self-loop afterwards would leave some people one short.
# The request is also capped at the number of other people, so a small corpus
# no longer makes np.random.choice raise for an oversized sample.
all_positions = np.arange(num_people)
names = data['LinkedIn Name'].tolist()
for position, name in enumerate(names):
    candidates = np.delete(all_positions, position)
    wanted = min(int(connections_per_person[position]), len(candidates))
    neighbours = adjacency_list[name]
    if wanted > 0:
        chosen = np.random.choice(candidates, size=wanted, replace=False)
        neighbours.extend(names[c] for c in chosen)

# Display a sample of the adjacency list
sample_adjacency_list = {k: adjacency_list[k] for k in list(adjacency_list.keys())[:3]}

print("Sample Adjacency List:")
for name, connections in sample_adjacency_list.items():
    print(f"{name} -> {', '.join(connections)}")

# Save the adjacency list to a file (if needed)
# For example, saving as JSON for further analysis
with open('/content/drive/MyDrive/BTech_Project/adjacency_list.json', 'w') as f:
    json.dump(adjacency_list, f)


Sample Adjacency List:
Saurabh Gupta -> Rachna Sharma, Prabakaran Pandian, Pawan Sut Sharma, Aayush Jha, Denis CA de Souza, Kartikay Garg, Aayush Garg, Kapil Kumar Narula, Ratnadeep Pawar, Chhaya Bhanti, Urv Patel, ANIL KUMAR SAMINENI, Mahadeva swamy, Karan Vyas, Shadab Ghazaly, Nitesh Singh, Channa Ghosh, chinmay Khanolkar
Jatin Singh -> Jacob Lallawmsang, Suchit Dekivadia, Renuka Nair, Elango Sidhan, Naveen Verma, Vipin Kumar Yadav, Mahadeva swamy, Saiprasad Bhartu, Balaram Puttaiah, Rajpal Navalkar, Wilma Rodrigues, Rajat Parikh, Cherish Tota, Gurjot Singh, Dr.Rathin Sharma, Jayavardhan Shetty, Akash Kumar
Nilesh Bhattad -> Jacob Lallawmsang, Narendra Patel, Ishant Sharma, Amit Saha, Anju Sasikumar, Dr Dnyaneshwar Battalwar, Shekar Prabhakar, Abde Ali Shabbir. ., DEEPAK KUMAR PANI, Suchit Dekivadia, Divyesh Chandera, Aayush Garg, Roopesh Rai, Sekhar C, Jayavardhan Shetty, Param Desai, Krishnan Komandur, Shibabrata Bhattacharjee


In [None]:
# Generate adjacency list with profession

import pandas as pd
import numpy as np
from collections import defaultdict
import json

# Load the data for names and professions
file_path = '/content/drive/MyDrive/BTech_Project/data.xlsx'  # Replace with your Excel file path
profession_data = pd.read_excel(file_path)

# Load the pre-summarized corpus for network
summarized_corpus_path = '/content/drive/MyDrive/BTech_Project/summarized_corpus.csv'  # Replace with your corpus file path
data = pd.read_csv(summarized_corpus_path)

# Number of people (nodes)
num_people = len(data)

# Create a dictionary for professions (to map name to profession)
name_to_profession = dict(zip(profession_data['LinkedIn Name'], profession_data['Description']))

# Adjacency list to store connections with professions
adjacency_list = defaultdict(list)

# Simulate connections using a normal distribution (15-20 connections per person)
np.random.seed(42)
connections_per_person = np.random.normal(loc=17.5, scale=2, size=num_people).astype(int)
connections_per_person = np.clip(connections_per_person, 15, 20)  # Ensure between 15 and 20 connections

# Generate adjacency lists with names and professions.
# Neighbours are drawn from everyone EXCEPT the person themselves, so the
# 15-20 range above actually holds. Drawing from the full population and
# discarding the self-loop afterwards would leave some people one short.
# The request is also capped at the number of other people, so a small corpus
# no longer makes np.random.choice raise for an oversized sample.
all_positions = np.arange(num_people)
names = data['LinkedIn Name'].tolist()
for position, name in enumerate(names):
    candidates = np.delete(all_positions, position)
    wanted = min(int(connections_per_person[position]), len(candidates))
    neighbours = adjacency_list[name]
    if wanted > 0:
        chosen = np.random.choice(candidates, size=wanted, replace=False)
        for c in chosen:
            connected_person_name = names[c]
            # Names absent from the spreadsheet are labelled 'Unknown'
            connected_person_profession = name_to_profession.get(connected_person_name, 'Unknown')
            neighbours.append((connected_person_name, connected_person_profession))

# Save the adjacency list to a file (JSON format)
adjacency_list_file = '/content/drive/MyDrive/BTech_Project/adjacency_list.json'  # Replace with your desired path
with open(adjacency_list_file, 'w') as f:
    json.dump(adjacency_list, f)

print(f"Adjacency list saved to {adjacency_list_file}")


Adjacency list saved to /content/drive/MyDrive/BTech_Project/adjacency_list.json


In [None]:
# DFS

def dfs_with_path(start, profession, visited=None, path=None):
    """Depth-first search for the first reachable person with a given profession.

    Reads the module-level ``name_to_profession`` (name -> profession) and
    ``adjacency_list`` (name -> iterable of ``(neighbor_name, profession)``
    pairs).

    Args:
        start: Name of the person to start from.
        profession: Profession to look for; compared with ``==`` against
            ``name_to_profession.get(name, '')``.
        visited: Optional set of names to treat as already visited. It is
            updated in place with every name visited by this call.
        path: Optional list of names leading up to and including ``start``.
            Defaults to ``[start]``.

    Returns:
        The list of names from the beginning of ``path`` to the first match
        found in depth-first order, or ``None`` if no reachable person has
        the profession.

    The traversal is iterative (explicit stack of neighbor iterators) rather
    than recursive, so long chains of connections cannot hit Python's
    recursion limit. The visiting order is the same as a recursive DFS that
    marks a node visited on entry.
    """
    if visited is None:
        visited = set()
    if path is None:
        path = [start]

    visited.add(start)

    # Check if the starting person already has the required profession
    if name_to_profession.get(start, '') == profession:
        return path

    # `trail` is the path to the node whose neighbors are currently being
    # explored; `pending` holds one neighbor iterator per node on that path.
    trail = list(path)
    pending = [iter(adjacency_list.get(start, []))]

    while pending:
        for neighbor, _ in pending[-1]:
            if neighbor in visited:
                continue
            visited.add(neighbor)
            trail.append(neighbor)
            if name_to_profession.get(neighbor, '') == profession:
                return trail
            # Descend: explore this neighbor's own connections next
            pending.append(iter(adjacency_list.get(neighbor, [])))
            break
        else:
            # Current node exhausted: backtrack one level. The root frame has
            # no entry of its own appended to `trail`, so only pop the trail
            # when a parent frame remains.
            pending.pop()
            if pending:
                trail.pop()

    return None  # No person with the required profession found in the network


Saurabh Gupta -> Rachna Sharma -> Rain Ramesh Babu


In [None]:
# BFS

from collections import deque

def bfs_with_path(start, profession):
    """Breadth-first search for the nearest person with a given profession.

    Reads the module-level ``name_to_profession`` (name -> profession) and
    ``adjacency_list`` (name -> iterable of ``(neighbor_name, profession)``
    pairs).

    Args:
        start: Name of the person to start from.
        profession: Profession to look for; compared with ``==`` against
            ``name_to_profession.get(name, '')``.

    Returns:
        The list of names from ``start`` to the first match in breadth-first
        order, or ``None`` if no reachable person has the profession.
    """
    # Sentinel marking "no predecessor" for the starting person
    origin = object()
    # Maps every discovered person to the person they were reached from
    came_from = {start: origin}
    frontier = deque([start])

    while frontier:
        person = frontier.popleft()

        if name_to_profession.get(person, '') == profession:
            # Follow predecessor links back to the start, then flip the order
            route = []
            step = person
            while step is not origin:
                route.append(step)
                step = came_from[step]
            route.reverse()
            return route

        # Queue every connection that has not been discovered yet
        for contact, _ in adjacency_list.get(person, []):
            if contact not in came_from:
                came_from[contact] = person
                frontier.append(contact)

    return None  # No person with the required profession found


Saurabh Gupta -> Kartikay Garg


In [None]:
# Example usage
starting_name = "Saurabh Gupta"
required_profession = "Chief Executive Officer"

# Breadth-first search is used here; dfs_with_path(starting_name, required_profession)
# is the depth-first alternative.
path = bfs_with_path(starting_name, required_profession)

if not path:
    print(f"No person with the profession {required_profession} found in the network starting from {starting_name}.")
else:
    # Show the chain of names from the starting person to the match
    print(" -> ".join(path))


Saurabh Gupta -> Kartikay Garg


In [None]:
# Example usage
import csv

starting_name = "Saurabh Gupta"
required_profession = "Chief Executive Officer"

# path = dfs_with_path(starting_name, required_profession)
path = bfs_with_path(starting_name, required_profession)

if path:
    print(" -> ".join(path))  # Print the path

    # Extract the last person (the one with the required profession) from the path
    selected_person = path[-1]

    # Load the summarized corpus data.
    # The file is parsed with the csv module so that quoted fields (names or
    # summaries containing commas or line breaks) are handled correctly and the
    # surrounding CSV quotes do not end up in the printed text.
    file_path = '/content/drive/MyDrive/BTech_Project/summarized_corpus.csv'  # Path to the summarized corpus file
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        corpus_data = list(csv.reader(f))

    # Find the row with the selected person.
    # The loop variable is deliberately NOT called `data`: that name holds the
    # corpus DataFrame used by the network-building cells.
    person_data = None
    for corpus_row in corpus_data:
        if len(corpus_row) < 2:
            continue  # Blank or single-column row: nothing to look up
        if corpus_row[0].strip() == selected_person:  # First column is the name
            # Everything after the name column is the corpus text
            person_data = ",".join(corpus_row[1:]).strip()
            break

    if person_data:
        print("\nCorpus data for selected person:")
        print(person_data)
    else:
        print(f"No corpus data found for {selected_person}")
else:
    print(f"No person with the profession {required_profession} found in the network starting from {starting_name}.")


Saurabh Gupta -> Kartikay Garg

Corpus data for selected person:
"Kartikay Garg is the Chief Executive Officer of Recycle City in India. His background includes nan, nanotechnology and recycling. He is a graduate of the Indian Institute of Technology, Kharagpur. He has worked in the recycling industry for more than a decade."
