In [None]:
import pandas as pd
import numpy as np
import spacy
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from collections import defaultdict

# Load spaCy English language model for NLP processing
nlp = spacy.load('en_core_web_sm')


# Load the pre-trained BART summarization checkpoint and its matching tokenizer
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# Progress marker: printed once the models above have been loaded
print("done")
# Load the Excel file
file_path = './data.xlsx'
data = pd.read_excel(file_path)
# Keep only the first 500 rows; every remaining row is summarized with BART further down
data = data.head(500)

# Function to clean and prepare text for summarization
def prepare_text(row):
    """Build a short profile paragraph from one row and strip it down with spaCy.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the columns
            'LinkedIn Name', 'Description', 'Organisation', 'Location',
            'Industry', 'Current Role(s)', 'Tenure at Company' and 'About'.

    Returns:
        A single-space-joined string of the tokens that are neither stop
        words, punctuation nor whitespace.
    """
    def _field(column):
        # Missing cells would otherwise be formatted as the literal word "nan"
        # and end up in the text that gets summarized.
        value = row[column]
        return "" if pd.isna(value) else value

    text = f"""
    {_field('LinkedIn Name')} is currently working as {_field('Description')} at {_field('Organisation')}. 
    Based in {_field('Location')}, they are part of the {_field('Industry')} industry.
    In their current role as {_field('Current Role(s)')}, they have been with the company for {_field('Tenure at Company')}. 
    Their background includes: {_field('About')}.
    """
    # Clean the text using NLP (removing stop words, punctuation, etc.)
    # NOTE(review): the cleaned text is fed to an abstractive summarizer; confirm
    # that removing stop words and punctuation helps summary quality.
    doc = nlp(text)
    kept_tokens = [
        token.text
        for token in doc
        # The newline/indent runs of the template come back as whitespace
        # tokens; drop them so they do not leak into the cleaned text.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_tokens)

# Function to generate a detailed and advanced summarized corpus using BART
def generate_bart_summary(text):
    """Summarize `text` with the module-level BART model.

    The input is tokenized with truncation at 1024 tokens, a summary of at
    most 150 tokens is generated with 4 beams, and the first generated
    sequence is decoded with special tokens removed.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = bart_tokenizer(
        text,
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    generated = bart_model.generate(
        encoded["input_ids"],
        early_stopping=True,
        max_length=150,
        num_beams=4,
    )
    best_sequence = generated[0]
    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Function to create a summarized corpus for each person
def generate_summarized_corpus(row):
    """Return the BART summary of the cleaned profile text built from `row`.

    Args:
        row: One record of the dataset, passed straight to `prepare_text`.

    Returns:
        The summary string produced by `generate_bart_summary`.
    """
    # Clean first, then summarize the cleaned text
    return generate_bart_summary(prepare_text(row))

# Create a summarized corpus for each person in the dataset
# (row-wise apply: generate_summarized_corpus is called once per row and the
# result is stored in a new 'Corpus' column)
data['Corpus'] = data.apply(generate_summarized_corpus, axis=1)

# Number of people (nodes)
num_people = len(data)

# Adjacency list to store connections
adjacency_list = defaultdict(list)

# Simulate connections using a normal distribution
np.random.seed(42)
connections_per_person = np.random.normal(loc=5, scale=2, size=num_people).astype(int)
connections_per_person = np.clip(connections_per_person, 1, num_people - 1)  # Ensure valid number of connections

# Generate adjacency lists
# Positions (not index labels) are used throughout so the lookup into
# connections_per_person stays correct even if `data` has a non-default index.
names = data['LinkedIn Name'].tolist()
for i, name in enumerate(names):
    # Sample only from the *other* people. Drawing from everyone and then
    # discarding self-loops could leave a person with fewer connections than
    # requested (or none at all).
    # Because the population changed, the sampled graph differs from the one
    # the previous sampling produced for the same seed.
    candidates = [j for j in range(num_people) if j != i]
    if not candidates:
        # A single-person dataset has nobody to connect to.
        continue
    connections = np.random.choice(candidates, size=connections_per_person[i], replace=False)
    adjacency_list[name].extend(names[j] for j in connections)

# Preview the results: the first three summaries and the first three adjacency entries
sample_corpus = data[['LinkedIn Name', 'Corpus']].head(3)
sample_adjacency_list = {
    key: adjacency_list[key]
    for key in list(adjacency_list)[:3]
}

print("Sample Summarized Corpus:")
print(sample_corpus)

# One "name -> connection, connection, ..." line per sampled person
print("\nSample Adjacency List:")
for name, connections in sample_adjacency_list.items():
    print(f"{name} -> {', '.join(connections)}")


In [None]:
import json

# Output locations for the two artifacts written by this cell
corpus_file_path = './summarized_corpus.csv'
adjacency_list_file_path = './adjacency_list.json'

# Name/summary table as CSV, without the index column
data[['LinkedIn Name', 'Corpus']].to_csv(corpus_file_path, index=False)

# Adjacency mapping as JSON indented by four spaces
with open(adjacency_list_file_path, 'w') as f:
    json.dump(adjacency_list, f, indent=4)

print(f"Summarized corpus saved to: {corpus_file_path}")
print(f"Adjacency list saved to: {adjacency_list_file_path}")


In [None]:
import os

# transformers decides which backends to use when it is first imported, so
# USE_TF has to be set before that import (sentence_transformers imports
# transformers too). It has no effect if transformers was already imported
# earlier in the same kernel.
os.environ["USE_TF"] = "0"

import json

import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration

# def generate_why_and_how_explanation(person, skill, query):
#     """
#     Use GPT to generate a detailed explanation of why and how the person is relevant.
#     """
#     prompt = (
#         f"Given the following context, generate a detailed explanation of why and how this person can help:\n\n"
#         f"Query: {query}\n"
#         f"Skill: {skill}\n"
#         f"Person's Description: {person['Why They Can Help']}\n"
#         f"Additional Context: This person has connections to {', '.join(person.get('Connections', []))}.\n"
#         f"Focus on both why they are suitable and how they can provide practical help for the query."
#     )
#     response = openai.Completion.create(
#         engine="text-davinci-003",  # You can use 'gpt-4' if available
#         prompt=prompt,
#         max_tokens=150,
#         temperature=0.7
#     )
#     return response['choices'][0]['text'].strip()

# Load the sentence-embedding model used for the cosine-similarity part of the score
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Load pre-trained T5 model and tokenizer locally
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")  # You can use "t5-large" for better results
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Read back the per-person summaries (CSV) and the connection mapping (JSON) from disk
corpus_data = pd.read_csv('./summarized_corpus.csv')
with open('./adjacency_list.json', 'r') as file:
    adjacency_list = json.load(file)

def extract_skills_from_prompt(prompt):
    """Send `prompt` to the Gemini `generateContent` endpoint and return the reply text.

    The API key is read from the ``GEMINI_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Args:
        prompt: Text sent as the single content part of the request.

    Returns:
        The text of the first part of the first candidate, or "" when the
        service answers with a non-200 status or with a body that does not
        contain that field.

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is not set.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GEMINI_API_KEY is not set; export it before calling extract_skills_from_prompt()."
        )

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    headers = {
        "Content-Type": "application/json"
    }
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }

    # The key still travels as the `key` query parameter, but it is supplied at
    # call time instead of being written into the URL literal. The timeout
    # keeps the notebook from hanging forever on a stalled connection.
    response = requests.post(
        url,
        params={"key": api_key},
        json=payload,
        headers=headers,
        timeout=30,
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return ""

    try:
        return response.json()['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError, ValueError):
        # A 200 reply is not guaranteed to carry a candidate with text; treat
        # that like any other failed request instead of crashing.
        print("Error: response did not contain generated text")
        return ""
def get_sbert_embedding(text):
    """Encode `text` with the module-level sentence-transformer `model`.

    Args:
        text: The text to embed.

    Returns:
        Whatever `model.encode` returns for `text`.
    """
    embedding = model.encode(text)
    return embedding

def t5_semantic_similarity(query, document):
    """Score `document` against `query` with T5's MNLI text-to-text task.

    Args:
        query: Text placed in the "premise" slot of the prompt.
        document: Text placed in the "hypothesis" slot of the prompt.

    Returns:
        1.0 if the generated label contains "entailment", 0.5 if it contains
        "neutral", otherwise 0.0.
    """
    # NOTE(review): the T5 paper's MNLI examples list "hypothesis:" before
    # "premise:" -- confirm the field order and which text belongs in which slot.
    input_text = f"mnli premise: {query} hypothesis: {document}"  # NLI format
    inputs = t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # The label words can span several sub-word tokens. The previous budget of
    # 3 could cut "entailment" short, so the entailment branch never matched.
    max_label_length = 10
    with torch.no_grad():
        outputs = t5_model.generate(**inputs, max_length=max_label_length)

    label = t5_tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    if "entailment" in label:
        return 1.0
    if "neutral" in label:
        return 0.5
    return 0.0

def generate_why_and_how_explanation_local(person, skill, query):
    """Generate a free-text explanation for one matched person with the local T5 model.

    Args:
        person: Dict with a 'Why They Can Help' entry and, optionally, a
            'Connections' list of names.
        skill: The matched skill, inserted into the prompt.
        query: The user's original request, inserted into the prompt.

    Returns:
        The decoded text of the first sequence generated by the T5 model.
    """
    # Prompt handed to T5
    prompt_text = (
        f"Generate a detailed explanation of why and how the following person can help:\n"
        f"Query: {query}\n"
        f"Skill: {skill}\n"
        f"Person's Description: {person['Why They Can Help']}\n"
        f"Additional Context: This person has connections to {', '.join(person.get('Connections', []))}.\n"
    )

    # Encode the prompt, truncated to 512 tokens
    input_ids = t5_tokenizer.encode(
        prompt_text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Generate up to 150 tokens using 5 beams
    generated = t5_model.generate(
        input_ids,
        early_stopping=True,
        num_beams=5,
        max_length=150,
    )

    first_sequence = generated[0]
    return t5_tokenizer.decode(first_sequence, skip_special_tokens=True)

def find_relevant_people_sbert_t5_v3(extracted_text, corpus_data, adjacency_list, query):
    """Rank people whose summarized corpus matches the words of `extracted_text`.

    Every distinct lower-cased, whitespace-separated word of `extracted_text`
    is treated as a skill. Each (person, skill) pair is scored as
    ``0.6 * cosine similarity + 0.4 * T5 score`` and kept when the combined
    score exceeds 0.4.

    Args:
        extracted_text: Text whose words are used as skills.
        corpus_data: DataFrame with 'LinkedIn Name' and 'Corpus' columns.
        adjacency_list: Mapping from a person's name to a list of connected names.
        query: The original user request, passed on to the explanation generator.

    Returns:
        A list of dicts (one per matching person/skill pair) sorted by
        'Combined Similarity Score', highest first. Empty when there are no
        skills or no matches.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order. A set's
    # iteration order can change between runs, which made the order of
    # equally-scored results vary.
    skills = list(dict.fromkeys((extracted_text or "").lower().split()))
    if not skills:
        # Nothing to match against (e.g. the extraction step returned "").
        return []

    skill_embeddings = model.encode(skills)
    relevant_people = []

    for _, row in corpus_data.iterrows():
        corpus_text = row['Corpus']
        # A missing summary (e.g. NaN for an empty cell) or a blank one cannot
        # be embedded or scored meaningfully; skip the row.
        if not isinstance(corpus_text, str) or not corpus_text.strip():
            continue
        corpus_embedding = get_sbert_embedding(corpus_text)

        for skill, skill_emb in zip(skills, skill_embeddings):
            cosine_score = cosine_similarity([skill_emb], [corpus_embedding])[0][0]
            t5_score = t5_semantic_similarity(skill, corpus_text)
            combined_score = (0.6 * cosine_score) + (0.4 * t5_score)

            if combined_score > 0.4:
                person_info = {
                    'Name': row['LinkedIn Name'],
                    'Matching Skills': [skill],
                    'Cosine Similarity': cosine_score,
                    'T5 Semantic Similarity': t5_score,
                    'Combined Similarity Score': combined_score,
                    'Why They Can Help': f"{row['LinkedIn Name']} has experience in {skill}.",
                    'Connections': adjacency_list.get(row['LinkedIn Name'], [])
                }
                # Generate a detailed "Why and How They Can Help"
                detailed_explanation = generate_why_and_how_explanation_local(person_info, skill, query)
                person_info['Why and How They Can Help'] = detailed_explanation
                relevant_people.append(person_info)

    # sorted() is stable, so pairs with equal scores keep their discovery order.
    relevant_people = sorted(relevant_people, key=lambda x: x['Combined Similarity Score'], reverse=True)
    return relevant_people

# Example usage
# The same prompt is sent to the remote model and reused as the `query` argument below
prompt = "I need a guy who can help me in building a greenhouse. Give me in a single paragraph the environmental technical skills the person should have."

# Remote call: yields the model's reply text, or "" on a non-200 response
extracted_text = extract_skills_from_prompt(prompt)
print(f"Extracted text: {extracted_text}")

# Score every person in the corpus against each word of the reply
relevant_people = find_relevant_people_sbert_t5_v3(extracted_text, corpus_data, adjacency_list, prompt)

# Print the matches, highest combined score first
for person in relevant_people:
    print(f"Name: {person['Name']}")
    print(f"Matching Skills: {', '.join(person['Matching Skills'])}")
    print(f"Combined Similarity Score: {person['Combined Similarity Score']:.2f}")
    print(f"Why and How They Can Help: {person['Why and How They Can Help']}")
    print(f"Connections: {', '.join(person['Connections'])}")
    print()
