In [None]:
import ast
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import re
from collections import Counter
import pandas as pd
import textwrap
import seaborn as sns

In [None]:
# Load the profiles table from the cleaned CSV export (path is relative to the
# notebook's working directory).
entity_match_results = pd.read_csv(
    filepath_or_buffer='data/clean_tables/profiles.csv',
)

In [None]:
# Preview the first five rows of the loaded table.
entity_match_results.head(n=5)

In [None]:
# Hex colours handed to the education pie chart in `plot_position_data2`;
# six entries, one per wedge (top five levels plus the "All Other" bucket).
colors = [
    '#164863',
    '#427D9D',
    '#9BBEC8',
    '#DDF2FD',
    '#F2EBE9',
    '#6B818C',
]


def plot_position_data2(df, position_title, show_summary=False):
    """Plot education, skills, experience and a skills word cloud for one position.

    Rows of ``df`` whose ``'position'`` equals ``position_title`` are selected.
    Each of the following figures is drawn only when its column exists and
    holds usable data:

    * ``'education'``: pie chart of the five most frequent levels, with the
      remainder grouped under "All Other" (first matching row only).
    * ``'skills'``: horizontal bar chart of up to 15 skills by share of all
      skill counts, next to a text panel (first matching row only).
    * ``'years_of_experience'``: KDE and box plot over all matching rows.
    * a word cloud built from the text of the ``'skills'`` column.

    Cells may hold Python objects or their string representation; strings are
    decoded with ``ast.literal_eval`` because a table read back with
    ``pd.read_csv`` stores dicts and lists as plain text.
    NOTE(review): ``'education'`` and ``'skills'`` cells are assumed to be
    ``{name: count}`` mappings and ``'years_of_experience'`` cells lists of
    numbers -- confirm against the code that writes the CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'position'`` column and, optionally, the columns above.
    position_title : str
        Position to plot; also used in the figure titles.
    show_summary : bool, default False
        When False the text panel shows the placeholder message (the original
        behaviour). When True it shows the generated top-skills /
        top-education summary instead.

    Raises
    ------
    ValueError
        If no row of ``df`` matches ``position_title``.
    """
    extra_stopwords = {"job", "title", "position", "at", "end", "of", "skills", "0", "1"}
    max_text_width = 50

    def parse_cell(value):
        """Decode a stringified Python literal; return anything else unchanged."""
        if isinstance(value, str):
            try:
                return ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return value
        return value

    def as_counts(value):
        """Return the cell as a dict of counts, or an empty dict if it is not one."""
        parsed = parse_cell(value)
        return parsed if isinstance(parsed, dict) else {}

    def top_shares(counts, limit):
        """Return up to ``limit`` (name, percent of total) pairs, largest first."""
        total = sum(counts.values())
        if not total:
            return []
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return [(name, count / total * 100) for name, count in ranked[:limit]]

    def generate_summary_text(skills_counts, education_counts):
        """Build the text panel content (placeholder unless ``show_summary``)."""
        if not show_summary:
            return "Placeholder for future implementation :)"
        top_skills_text = "\n".join(
            f"- {skill}, with {share:.1f} % of the {position_title}s skills we've collected"
            for skill, share in top_shares(skills_counts, 5))
        top_education_text = "\n".join(
            f"- {education}: {share:.1f} % of the education mentions"
            for education, share in top_shares(education_counts, 5))
        summary_text = f"For the position of {position_title}, the top skills are:\n{top_skills_text}\n\n"
        summary_text += f"Also, the top education levels are:\n{top_education_text}"
        return summary_text

    def create_word_cloud(bag_of_words):
        """Draw a word cloud from word counts, leaving out stop words."""
        custom_stopwords = set(STOPWORDS)
        custom_stopwords.update(extra_stopwords)
        # WordCloud ignores its `stopwords` argument when it is fed ready-made
        # frequencies, so the stop words have to be removed here.
        frequencies = {word: count for word, count in bag_of_words.items()
                       if word not in custom_stopwords}
        if not frequencies:
            print(f"No skill words available to draw a word cloud for {position_title}.")
            return
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap='Blues').generate_from_frequencies(frequencies)
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f"Bag of Words - {position_title} Required Skills")
        plt.axis('off')
        plt.show()

    def create_bag_of_words(frame):
        """Count the lower-cased alphabetic words in the text of the skills column."""
        if 'skills' not in frame.columns:
            return Counter()
        all_skills = ' '.join(frame['skills'].astype(str))
        return Counter(re.findall(r'\b[a-zA-Z]+\b', all_skills.lower()))

    position_data = df[df['position'] == position_title]
    if position_data.empty:
        raise ValueError(f"No rows found for position {position_title!r}")
    first_row = position_data.iloc[0]

    # Bound up front so the skills section can build its summary even when the
    # education column is missing.
    education_counts = {}
    if 'education' in position_data.columns:
        education_counts = as_counts(first_row['education'])
        if education_counts and sum(education_counts.values()) > 0:
            ranked_education = sorted(education_counts.items(), key=lambda item: item[1], reverse=True)
            combined_education_counts = dict(ranked_education[:5])
            all_other_count = sum(count for _, count in ranked_education[5:])
            if all_other_count > 0:  # avoid an empty 0 % wedge
                combined_education_counts['All Other'] = all_other_count

            plt.figure(figsize=(8, 6))
            # Lists rather than dict views: not every matplotlib version
            # converts a dict view to an array.
            plt.pie(list(combined_education_counts.values()),
                    labels=list(combined_education_counts.keys()),
                    autopct='%1.1f%%', startangle=140, colors=colors)
            plt.title(f"Educational Levels for {position_title}")
            plt.axis('equal')
            plt.show()

    if 'skills' in position_data.columns:
        skills_counts = as_counts(first_row['skills'])
        # Largest first, then reversed so the biggest bar ends up on top.
        top_skills = top_shares(skills_counts, 15)[::-1]
        if top_skills:
            _, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
            axes[0].barh([skill for skill, _ in top_skills],
                         [share for _, share in top_skills],
                         color='#A3816A', edgecolor='#0A065D', alpha=0.8)
            axes[0].set_xlabel('Percentage (%)')
            axes[0].set_ylabel('Skill')
            axes[0].set_title(f'Top Skills for {position_title} (%)')

            summary_text = generate_summary_text(skills_counts, education_counts)
            # Wrap line by line so the summary's own line breaks survive.
            wrapped_text = "\n".join(textwrap.fill(line, width=max_text_width)
                                     for line in summary_text.split("\n"))
            axes[1].axis('off')
            axes[1].text(0.5, 0.5, wrapped_text, fontsize=12, ha='center', va='center',
                         fontfamily='Arial', fontweight='bold', wrap=True)
            plt.show()

    if 'years_of_experience' in position_data.columns:
        exploded = position_data['years_of_experience'].map(parse_cell).explode()
        years_of_experience = (pd.to_numeric(exploded, errors='coerce')
                               .dropna()
                               .reset_index(drop=True)
                               .rename('years_of_experience'))
        if not years_of_experience.empty:
            _, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
            # `palette` only applies together with `hue`; a single curve takes `color`.
            sns.kdeplot(x=years_of_experience, ax=axes[0], color='#A3816A')
            axes[0].set_title(f"Years of Experience for {position_title}")
            axes[0].set_xlabel("Years of Experience")
            axes[0].set_ylabel("Density")

            # Passing the values as `y` draws a vertical box on every seaborn version.
            sns.boxplot(y=years_of_experience, ax=axes[1], linewidth=1.5, saturation=0.8,
                        medianprops={"color": "black", "linewidth": 1, "linestyle": '--'})
            axes[1].set_title(f"Years of Experience for {position_title}")
            axes[1].set_ylabel("Years of Experience")

            plt.tight_layout()
            plt.show()

    bag_of_words = create_bag_of_words(position_data)
    create_word_cloud(bag_of_words)

In [None]:
# Position to visualise.
user_input = 'Accountant'

# Rows for that position. NOTE(review): `pdf` is not passed to the call below
# (the plotting function filters `entity_match_results` by position itself);
# kept in case later cells read it.
pdf = entity_match_results.loc[entity_match_results['position'] == user_input]

plot_position_data2(entity_match_results, user_input)