In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Read original file
asean_level = pd.read_excel("ASEAN ICT Skills Standard.xlsx", sheet_name="Job with Level")
asean_non_level = pd.read_excel("ASEAN ICT Skills Standard.xlsx", sheet_name="Job no Level")
job = pd.read_excel("ALLmo.xlsx")

# Combine Column `Information` and `Description`
job.to_csv('combine_text.csv', index=False, encoding='utf-8-sig')
job.to_excel('combine_text.xlsx', index=False)

# Load Combined Data
job_combine = pd.read_csv("combine_text.csv")

# Clean text function
def clean_text(text):
    if text == 'text not found':
        return text

    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert text to lowercase

    stop_words = set(stopwords.words('english'))
    words = text.split()
    cleaned_words = [word for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

# Apply cleaning to 'description' column
job_combine['description'] = job_combine['description'].apply(clean_text)

# Define the calculate_similarity function
def calculate_similarity(standard_jobs_df, new_jobs_df):
    # Combine job descriptions from both datasets
    all_jobs = standard_jobs_df['Level'].tolist() + new_jobs_df['description'].tolist()

    # Vectorize the job descriptions
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_jobs)

    # Separate the datasets by their counts
    standard_jobs_count = len(standard_jobs_df)
    new_jobs_count = len(new_jobs_df)

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(tfidf_matrix[standard_jobs_count:], tfidf_matrix[:standard_jobs_count])

    # Normalize the scores to percentage
    normalized_scores = similarity_scores / np.sum(similarity_scores, axis=1)[:, np.newaxis] * 100

    return normalized_scores

# Calculate similarity between 'Level' in asean_level and 'description' in job_combine
similarities = calculate_similarity(asean_level, job_combine)

# Display the similarity scores
print(similarities)

# Create MultiIndex for result DataFrame
multi_index = pd.MultiIndex.from_arrays([job_combine['id'], job_combine['title'], job_combine['country'], job_combine['description']],
                                        names=['id', 'title', 'country', 'description'])

# Create the results DataFrame
result_df = pd.DataFrame(
    similarities,
    columns=asean_level['Job Group'],  # Adjust this to match your actual column name in asean_level
    index=multi_index
)

# Reset index to make 'Job name' and 'description' columns
result_df = result_df.reset_index()

# Save the results to an Excel file
result_df.to_excel("role_profile_match_V2.xlsx", index=False)

print("Result saved to 'role_profile_match_V2.xlsx'")


[nltk_data] Downloading package stopwords to /Users/guide/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'int' object has no attribute 'lower'