# In [None]:
# 03-text-analysis.ipynb
# Purpose: Text mining on job descriptions and criteria

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

# Load cleaned data
# The CSV path is relative, so it is resolved against the current working directory.
df = pd.read_csv("cleaned_jobs.csv")

# Combine text fields for analysis
# Missing titles/descriptions are replaced with empty strings first so the
# concatenation below never produces NaN in the TEXT column.
_title_part = df['TITLE'].fillna('')
_description_part = df['DESCRIPTION'].fillna('')
df['TEXT'] = _title_part + ' ' + _description_part

# --- WordCloud for Job Titles ---
# Figures in this script are written under visuals/. Create the folder before
# the first save so savefig() does not raise FileNotFoundError when it does
# not exist yet; exist_ok=True makes this a no-op on later runs.
Path('visuals').mkdir(parents=True, exist_ok=True)

# Join every non-missing title into a single corpus string for the cloud.
title_text = ' '.join(df['TITLE'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(title_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the cloud is an image; axis ticks carry no information
plt.title('WordCloud of Job Titles')
plt.tight_layout()
plt.savefig('visuals/wordcloud_titles.png')
plt.show()

# --- WordCloud for Job Descriptions ---
# Build one corpus string from all non-missing descriptions.
desc_text = ' '.join(df['DESCRIPTION'].dropna())

# Configure the cloud first, then populate it; generate() fills the same
# object in place, so desc_wc ends up holding the rendered cloud.
desc_wc = WordCloud(width=800, height=400, background_color='white')
desc_wc.generate(desc_text)

# Draw through the Axes object of a fresh 10x5 figure.
plt.figure(figsize=(10, 5))
desc_ax = plt.gca()
desc_ax.imshow(desc_wc, interpolation='bilinear')
desc_ax.axis('off')
desc_ax.set_title('WordCloud of Job Descriptions')

plt.tight_layout()
plt.savefig('visuals/wordcloud_descriptions.png')
plt.show()

# --- TF-IDF to extract top skills ---
# Keep only the 30 highest-ranked vocabulary terms and drop English stop words.
# The vectorizer is fitted on TEXT, i.e. title and description combined.
vectorizer = TfidfVectorizer(max_features=30, stop_words='english')
X = vectorizer.fit_transform(df['TEXT'])

# One row per posting, one column per retained term.
tfidf_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Average each term's score over all postings, highest first.
tfidf_mean = tfidf_df.mean(axis=0).sort_values(ascending=False)

# Horizontal bars: score on x, term on y.
plt.figure(figsize=(12, 6))
tfidf_ax = sns.barplot(
    x=tfidf_mean.to_numpy(),
    y=tfidf_mean.index,
    palette='coolwarm',
)
tfidf_ax.set_title('Top TF-IDF Terms from Descriptions')
tfidf_ax.set_xlabel('TF-IDF Score')
tfidf_ax.set_ylabel('Skill / Term')

plt.tight_layout()
plt.savefig('visuals/top_tfidf_skills.png')
plt.show()

# --- Compare Remote vs Onsite Keyword Frequencies ---
def _cloud_or_none(text):
    """Return a word cloud built from *text*, or None if it has no usable words."""
    try:
        return WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when the text yields zero words
        # (e.g. no posting matched the filter); report that as "no cloud".
        return None


# Match the work-mode label case-insensitively so values such as "Remote" or
# "Onsite" are not silently excluded. Rows with a missing label match neither
# group (na=False). A label containing both keywords lands in both groups.
work_mode = df['ONSITE REMOTE']
remote_mask = work_mode.str.contains('remote', case=False, na=False)
onsite_mask = work_mode.str.contains('onsite|office', case=False, na=False)

remote_text = ' '.join(df.loc[remote_mask, 'TEXT'])
onsite_text = ' '.join(df.loc[onsite_mask, 'TEXT'])

# Either value is None when its group has no words to plot.
remote_wc = _cloud_or_none(remote_text)
onsite_wc = _cloud_or_none(onsite_text)

fig, axs = plt.subplots(1, 2, figsize=(18, 6))
panels = zip(
    axs,
    (remote_wc, onsite_wc),
    ('Remote Job Descriptions', 'Onsite Job Descriptions'),
)
for panel, cloud, panel_title in panels:
    if cloud is not None:
        panel.imshow(cloud, interpolation='bilinear')
    else:
        # Keep the side-by-side layout and make the empty group explicit
        # instead of aborting the whole comparison.
        panel.text(0.5, 0.5, 'No matching job postings',
                   ha='center', va='center', transform=panel.transAxes)
    panel.axis('off')
    panel.set_title(panel_title)

plt.tight_layout()
plt.savefig('visuals/remote_vs_onsite_wordclouds.png')
plt.show()
