In [None]:
import os
import zipfile
import chardet
from PyPDF2 import PdfReader
import pandas as pd
import json
import re
from groq import Groq
import time
import random
from groq import RateLimitError

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode and
            handed to ``PdfReader``.

    Returns:
        str: The text of all pages, joined in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # `or ""` keeps the join safe should a page yield None instead of a
        # string; joining once also avoids rebuilding the string per page.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Function to extract PDFs from zipfile, convert them to text, and save as .txt files
def extract_and_save_text_from_zip(zip_file_path, extract_to, output_zip_path):
    """Unpack a zip of PDFs, convert each PDF to text, and zip the text files.

    Every PDF found anywhere under ``extract_to`` (after extraction) is
    converted with ``extract_text_from_pdf``; the text is written as
    ``<pdf stem>.txt`` directly inside ``extract_to`` and added to the output
    archive under that same flat name.

    Note: because the ``.txt`` files are written flat, two PDFs with the same
    base name in different sub-folders end up sharing one text filename.

    Args:
        zip_file_path: Path of the input zip archive.
        extract_to: Directory the archive is extracted into and where the
            ``.txt`` files are written.
        output_zip_path: Path of the zip archive to create with the text files.

    Returns:
        None. Prints a confirmation line when finished.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # The output archive may target a directory that does not exist yet.
    output_dir = os.path.dirname(output_zip_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with zipfile.ZipFile(output_zip_path, 'w') as output_zip:
        for root, dirs, files in os.walk(extract_to):
            for file in files:
                # Case-insensitive match so e.g. "CV.PDF" is not silently skipped.
                if not file.lower().endswith(".pdf"):
                    continue

                pdf_path = os.path.join(root, file)
                text = extract_text_from_pdf(pdf_path)

                text_filename = os.path.splitext(file)[0] + ".txt"
                text_filepath = os.path.join(extract_to, text_filename)
                with open(text_filepath, 'w', encoding='utf-8') as text_file:
                    text_file.write(text)

                output_zip.write(text_filepath, arcname=text_filename)

    print(f"Text files saved and zipped as {output_zip_path}")

# Function to detect file encoding
def detect_encoding(file_path):
    """Guess a file's text encoding from its raw bytes.

    Args:
        file_path: Path of the file to inspect; it is read fully in binary mode.

    Returns:
        The value stored under the ``'encoding'`` key of ``chardet.detect``'s
        result for the file's bytes.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

# Function for sentiment calculation using Groq API
def sentiment_calculator(text_path, model="llama-3.1-70b-versatile", max_retries=5):
    """Ask a Groq chat model for a sentiment score of a recommendation letter.

    The letter is read from ``text_path`` and sent with a fixed system prompt
    asking the model to return only the value of
    ``avg(normal claim scores) - 0.1 * avg(exaggerated claim scores)``.

    Args:
        text_path: Path of the text file to score.
        model: Groq model name passed to the chat-completions call.
        max_retries: How many times to retry after a ``RateLimitError`` before
            re-raising it. Retries wait ``2 ** attempt`` seconds plus jitter.

    Returns:
        str: The raw message content of the first completion choice. The caller
        is responsible for converting it to a number.

    Raises:
        RateLimitError: If the API is still rate limited after ``max_retries``
            retries.
    """
    # Decode with the detected encoding rather than assuming Windows-1252 for
    # every file; fall back to the old default if detection gives nothing.
    encoding = detect_encoding(text_path) or 'Windows-1252'
    try:
        with open(text_path, 'r', encoding=encoding) as f:
            text = f.read()
    except (UnicodeDecodeError, LookupError):
        # Detection was wrong, or named a codec Python does not know.
        with open(text_path, 'r', encoding='Windows-1252', errors='replace') as f:
            text = f.read()

    # Backward compatible: honour a notebook-level `your_api_key` variable if
    # one was defined, otherwise read the key from the environment so it does
    # not have to be hard-coded in the notebook.
    api_key = globals().get('your_api_key') or os.environ.get('GROQ_API_KEY')
    client = Groq(api_key=api_key)

    # The prompt asks for a decimal number: the formula yields a fraction and
    # the caller parses the reply with float(), so "integer" was contradictory.
    messages = [
            {
                "role": "system",
                "content": """
                You are provided a piece of text that contains various claims, both normal and exaggerated. Your task is to:
                  1. Identify all the claims in the text.
                  2. For normal claims, assign a sentiment score between 0 and 1.
                  3. For exaggerated claims, assign a sentiment score between 0 and 1.
                  4. Calculate the total sum of sentiment scores for normal claims and exaggerated claims separately.
                  5. Output the result as a single decimal number, calculated using the formula:
                    (Average of normal sentiment scores) - 0.1 * (Average of exaggerated sentiment scores)

                  DO NOT Provide any intermediate steps in the response.

                  The final output should ONLY be the result of this formula.
                  """
            },
            {
                "role": "user",
                "content": f"""The text is provided below:\n {text}

                DO NOT Provide any intermediate steps in the response.

                The final output should ONLY be the result of the formula mentioned above.
                """
            }
    ]

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.5,
                max_tokens=4096,
                top_p=1,
                stream=False,
                stop=None,
            )
            break
        except RateLimitError:
            if attempt >= attempts - 1:
                raise
            # Exponential backoff with jitter before the next attempt.
            time.sleep(2 ** attempt + random.random())

    return completion.choices[0].message.content
            

# Function to extract IDs from file paths
def extract_ids(file_path):
    """Pull the recommendee and recommender IDs out of a letter's path.

    Args:
        file_path: A path/string containing both
            ``Recommendation_Letters_of_ID_<digits>`` and
            ``Recommendation_From_ID_<digits>``.

    Returns:
        tuple[str, str]: ``(recommendee_id, recommender_id)`` as the digit
        strings captured from the path.

    Raises:
        AttributeError: If either marker is absent (the regex search returns
            ``None``).
    """
    id_patterns = (
        r'Recommendation_Letters_of_ID_(\d+)',
        r'Recommendation_From_ID_(\d+)',
    )
    recommendee_id, recommender_id = (
        re.search(pattern, file_path).group(1) for pattern in id_patterns
    )
    return recommendee_id, recommender_id

# Main execution
if __name__ == "__main__":
    # --- Resumes: unpack the PDFs and convert them to text files ---
    zip_file_path = "Final_Resumes.zip"
    extract_to = "Files/Final_Resumes_Text"
    output_zip_path = "Files/extracted_text_files.zip"
    extract_and_save_text_from_zip(zip_file_path, extract_to, output_zip_path)

    # --- Recommendation letters: one sentiment score per letter ---
    recommendation_dir = 'Final_Recommendation_Letters'
    sentiment_scores = {}

    for root, dirs, files in os.walk(recommendation_dir):
        # NOTE(review): only the first 10 entries of each directory are
        # processed; every other letter falls back to a score of 0 in the merge
        # below. Confirm whether this cap is intentional.
        for file in files[:10]:
            if not file.endswith(".txt"):
                continue
            file_path = os.path.join(root, file)

            # Resolve the IDs before calling the API so a file with an
            # unexpected name does not cost a request and then crash the run.
            try:
                recommendee_id, recommender_id = extract_ids(file_path)
            except AttributeError:
                print(f"Skipping {file_path}: path does not contain both IDs")
                continue

            reply = sentiment_calculator(file_path)
            try:
                score = float(reply)
            except (TypeError, ValueError):
                # The model sometimes adds words around the number; take the
                # last number in the reply, where a final answer normally sits.
                numbers = re.findall(r'-?\d+(?:\.\d+)?', reply or "")
                if not numbers:
                    print(f"Skipping {file_path}: no numeric score in reply {reply!r}")
                    continue
                score = float(numbers[-1])

            sentiment_scores[(recommendee_id, recommender_id)] = score

    # --- Skills extracted from the recommendation letters ---
    df_lor_skills = pd.read_csv('innov8/LOR_skills.csv')
    df_lor_skills[['Recommendee_ID', 'Recommender_ID']] = df_lor_skills['File'].apply(lambda x: pd.Series(extract_ids(x)))
    df_lor_skills = df_lor_skills.drop('File', axis=1)
    # IDs are regex-captured strings here, so this sort is lexicographic
    # ("10" sorts before "2").
    df_lor_skills = df_lor_skills.sort_values('Recommendee_ID')

    # --- Skills extracted from the resumes ---
    with open('innov8/all_resumes_data.json') as f:
        resume_data = json.load(f)

    df_resume = pd.DataFrame(resume_data)
    df_resume['Recommendee_ID'] = df_resume['File_Name'].apply(lambda x: re.search(r'Resume_of_ID_(\d+)', x).group(1))
    df_resume = df_resume[['Skills', 'Recommendee_ID']]
    df_resume = df_resume.sort_values('Recommendee_ID')

    # Attach each letter's sentiment score; letters without a score get 0.
    df_lor_skills['Sentiment_Score'] = df_lor_skills.apply(lambda row: sentiment_scores.get((row['Recommendee_ID'], row['Recommender_ID']), 0), axis=1)

    # Save processed dataframes for the following notebook sections.
    os.makedirs('Files', exist_ok=True)
    df_lor_skills.to_csv('Files/processed_lor_skills.csv', index=False)
    df_resume.to_csv('Files/processed_resume_skills.csv', index=False)

    print("Data preparation and initial processing completed.")

In [None]:
# os.walk() returns a lazy generator, so evaluating it bare only displays the
# generator object. Show (directory, #sub-directories, #files) per directory
# instead, without dumping every filename.
[(root, len(dirs), len(files)) for root, dirs, files in os.walk('Final_Recommendation_Letters')]

In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Load the RoBERTa tokenizer and model from the same pretrained checkpoint
ROBERTA_CHECKPOINT = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

def get_embeddings(text):
    """Compute mean-pooled RoBERTa embeddings for a text or a batch of texts.

    The input is tokenized with padding and truncation to at most 512 tokens,
    run through the module-level ``model`` without gradient tracking, and the
    token vectors of ``last_hidden_state`` are averaged per input.

    Args:
        text: A string, or a list of strings, accepted by the module-level
            ``tokenizer``.

    Returns:
        numpy.ndarray: One embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only. With a batch, shorter texts are padded and
    # a plain mean over dim=1 would mix padding vectors into their embedding.
    # For a single text the mask is all ones, so the result is unchanged.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return ((hidden * mask).sum(dim=1) / token_counts).numpy()

def compute_skill_similarity(resume_skills, rec_skills):
    """Return the cosine similarity between two skill texts.

    Both texts are embedded with ``get_embeddings`` (resume first, then the
    recommendation) and compared with ``cosine_similarity``.

    Args:
        resume_skills: Skill text taken from the resume.
        rec_skills: Skill text taken from the recommendation letter.

    Returns:
        The ``[0][0]`` entry of the similarity matrix.
    """
    similarity_matrix = cosine_similarity(
        get_embeddings(resume_skills),
        get_embeddings(rec_skills),
    )
    first_row = similarity_matrix[0]
    return first_row[0]

def process_skills(skills):
    """Reduce a raw ``Skills`` cell to a single text, or ``""`` if unusable.

    Accepted inputs:
      * a non-empty list: its first element is returned (as before);
      * a string holding a Python list literal (what a list column looks like
        after a CSV round trip, e.g. ``"['Python, SQL']"``): it is parsed and
        handled like a list;
      * any other non-blank string: returned stripped;
      * anything else (``None``, ``NaN``, empty list, blank string): ``""``.

    Previously only real ``list`` objects were accepted, so every value read
    back from CSV produced ``""``, and a list with more than one element raised
    ``ValueError`` because ``pd.notna`` returns an array for list input.

    Args:
        skills: The raw cell value.

    Returns:
        The selected skills text, or ``""`` when there is nothing to use.
    """
    import ast  # local import: only this function needs it

    if isinstance(skills, str):
        stripped = skills.strip()
        if not stripped:
            return ""
        try:
            parsed = ast.literal_eval(stripped)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat the cell as plain skills text.
            return stripped
        if not isinstance(parsed, list):
            return stripped
        skills = parsed

    if isinstance(skills, list) and skills:
        return skills[0]
    return ""

def calculate_average_skill_similarity(df_resume, df_lor):
    """Average resume-vs-letter skill similarity for every resume row.

    For each row of ``df_resume`` the skills text is compared with the skills
    text of every ``df_lor`` row sharing its ``Recommendee_ID``; letters whose
    processed skills are empty are ignored.

    Args:
        df_resume: DataFrame with ``Skills`` and ``Recommendee_ID`` columns.
        df_lor: DataFrame with ``Skills`` and ``Recommendee_ID`` columns.

    Returns:
        list: One score per resume row, in row order. The score is 0.0 when the
        resume has no usable skills or no letter produced a similarity.
    """
    def _mean_similarity_for(resume_row):
        resume_text = process_skills(resume_row['Skills'])
        if not resume_text:
            return 0.0

        same_candidate = df_lor['Recommendee_ID'] == resume_row['Recommendee_ID']
        letter_texts = (
            process_skills(letter_row['Skills'])
            for _, letter_row in df_lor[same_candidate].iterrows()
        )
        scores = [
            compute_skill_similarity(resume_text, letter_text)
            for letter_text in letter_texts
            if letter_text
        ]
        return sum(scores) / len(scores) if scores else 0.0

    return [_mean_similarity_for(row) for _, row in df_resume.iterrows()]

def visualize_score_distribution(scores):
    """Save a histogram (with KDE curve) of the given scores to a PNG file.

    Args:
        scores: Values passed to ``sns.histplot``.

    Returns:
        None. The figure is written to ``skill_similarity_distribution.png``
        (relative path) and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(scores, kde=True, ax=ax)
    ax.set_title('Distribution of Average Skill Similarity Scores')
    ax.set_xlabel('Average Skill Similarity Score')
    ax.set_ylabel('Frequency')
    fig.savefig('skill_similarity_distribution.png')
    plt.close(fig)

def main():
    """Score every resume by skill similarity and save the ranked results.

    Steps:
      1. Load the processed resume and recommendation-letter skill tables.
      2. Compute each resume's average skill similarity to its letters.
      3. Build ``Final_Score`` (blended with ``Overall_Score`` when that column
         exists) and min-max scale it to 0-100.
      4. Plot the distribution, save the table, and print the top 10 rows and
         summary statistics.

    Returns:
        None.
    """
    # Load processed data from Section 1
    df_resume = pd.read_csv('Files/processed_resume_skills.csv')
    df_lor = pd.read_csv('Files/processed_lor_skills.csv')

    print("Computing skill similarity scores...")
    average_scores = calculate_average_skill_similarity(df_resume, df_lor)

    # Add average similarity scores to df_resume
    df_resume['Average_Skill_Similarity'] = average_scores

    # Combine with existing scores (assuming 'Overall_Score' exists from previous analysis)
    if 'Overall_Score' in df_resume.columns:
        df_resume['Final_Score'] = 0.7 * df_resume['Overall_Score'] + 0.3 * df_resume['Average_Skill_Similarity']
    else:
        df_resume['Final_Score'] = df_resume['Average_Skill_Similarity']

    # Normalize Final_Score to 0-100 range
    raw_scores = df_resume['Final_Score']
    lowest = raw_scores.min()
    score_range = raw_scores.max() - lowest
    if score_range > 0:
        df_resume['Final_Score'] = (raw_scores - lowest) / score_range * 100
    else:
        # All scores equal (or no rows): min-max scaling would be 0/0 and turn
        # every score into NaN, so report 0 for everyone instead.
        df_resume['Final_Score'] = 0.0

    print("Visualizing score distribution...")
    visualize_score_distribution(df_resume['Final_Score'])

    # Save the final results
    output_path = 'Files/final_resume_scores.csv'
    df_resume.to_csv(output_path, index=False)
    print(f"Final scores saved to '{output_path}'")

    # Display top 10 candidates
    top_candidates = df_resume.nlargest(10, 'Final_Score')
    print("\nTop 10 Candidates:")
    print(top_candidates[['Recommendee_ID', 'Final_Score']])

    # Calculate and display summary statistics
    summary_stats = df_resume['Final_Score'].describe()
    print("\nSummary Statistics:")
    print(summary_stats)

# Entry point: run the skill-similarity pipeline when executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the score table produced by each of the three levels, in level order
LEVEL_CSV_PATHS = (
    'innov8/final_ranked_resumes.csv',
    'innov8/final_credit_scores.csv',
    'innov8/df_resume_sorted.csv',
)
df_level1, df_level2, df_level3 = (pd.read_csv(csv_path) for csv_path in LEVEL_CSV_PATHS)


# Normalize scores to 0-100 range
def normalize_score(series):
    """Min-max scale a numeric Series to the 0-100 range.

    Args:
        series: pandas Series of numeric scores.

    Returns:
        pandas.Series: ``100 * (x - min) / (max - min)`` for each value. When
        the values have no spread (all equal, or no valid values) the result is
        0.0 for every valid entry instead of the NaN that 0/0 would give;
        missing values stay missing.
    """
    low = series.min()
    span = series.max() - low
    if not span > 0:
        # Multiplying by 0.0 keeps the index and any NaN, and yields floats.
        return series * 0.0
    return 100 * (series - low) / span

df_combined = pd.DataFrame()

# NOTE(review): the three tables are combined by row position only; no ID
# column is carried over or used for matching. This is correct only if all
# three CSVs list the same candidates in the same order. Confirm, or merge on
# the candidate ID instead.
if len({len(df_level1), len(df_level2), len(df_level3)}) > 1:
    print("Warning: level tables have different row counts "
          f"({len(df_level1)}, {len(df_level2)}, {len(df_level3)}); "
          "rows are aligned by position and unmatched rows become NaN.")

df_combined['Normalized_Overall_Score'] = normalize_score(df_level1['Overall_Score'])
# NOTE(review): CreditScore is used as-is (not passed through normalize_score)
# although the other two inputs are scaled to 0-100. Confirm it is already on
# that scale.
df_combined['Normalized_CreditScore'] = (df_level2['CreditScore'])
df_combined['Normalized_Skill_Similarity'] = normalize_score(df_level3['Average_Skill_Similarity'])

# Calculate final weighted score
df_combined['Final_CV_Score'] = (
    0.60 * df_combined['Normalized_Overall_Score'] +
    0.25 * df_combined['Normalized_CreditScore'] +
    0.15 * df_combined['Normalized_Skill_Similarity']
)

# Visualize distributions: one panel per score in a 2x2 grid. Each panel gets
# its own axes; previously panels 3 and 4 reused subplot slots 1 and 2 and
# landed on the first two panels, leaving the bottom row empty.
score_panels = [
    ('Normalized_Overall_Score', 'Level 1: Initial Resume Screening Score'),
    ('Normalized_CreditScore', 'Level 2: Network Analysis Score'),
    ('Normalized_Skill_Similarity', 'Level 3: Skill Similarity Score'),
    ('Final_CV_Score', 'Final Weighted CV Score'),
]
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, (column, title) in zip(axes.flat, score_panels):
    sns.histplot(df_combined[column], kde=True, ax=ax)
    ax.set_title(title)

fig.tight_layout()
fig.savefig('score_distributions.png')


# Save final results
df_combined.to_csv('Files/final_cv_scores.csv', index=False)

# Print summary statistics
print("\nSummary Statistics of Final CV Score:")
print(df_combined['Final_CV_Score'].describe())