# Market Trend Analysis - ML-Powered Job Market Segmentation



## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Deep Learning
from sentence_transformers import SentenceTransformer
import torch

# Machine Learning
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Plot appearance: apply the matplotlib style sheet first, then let seaborn
# set the colour palette (the palette call runs second, as before).
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette(palette="husl")

# Reaching this line means every import above resolved.
print(" All libraries imported successfully")

## 2. Load Job Data

In [None]:
# Load processed job data.
# The path is relative, so it is resolved against the current working directory.
jobs_path = Path("../data/processed/all_jobs_master.csv")
jobs_df = pd.read_csv(jobs_path)

# The status emoji had been stored as mis-decoded bytes ("ðŸ“Š"); restored to
# the intended character so the message prints cleanly.
print(f"📊 Loaded {len(jobs_df)} jobs")
print(f"\nColumns: {jobs_df.columns.tolist()}")

# Display sample (must stay the last expression so the notebook renders it)
jobs_df.head()

## 3. Deep Learning: Generate SBERT Embeddings

**SBERT (Sentence-BERT)** converts text into 384-dimensional vectors that capture semantic meaning.

### Why SBERT?
- Pre-trained on 1 billion sentence pairs
- Captures semantic similarity ("Software Engineer" ≈ "Developer")
- Fast inference (~50ms per job)

In [None]:
# Instantiate the pre-trained sentence encoder by its model name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Report which model was loaded and the vector size it reports.
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model: {model}")
print(f"Embedding dimension: {embedding_dim}")

In [None]:
# Build the text that gets embedded: the job title followed by the first
# 200 characters of the description. Missing values become empty strings.
title_part = jobs_df['title'].fillna('')
description_part = jobs_df['description'].fillna('').str[:200]
jobs_df['combined_text'] = title_part + " " + description_part

# Preview the first row (capped at 200 characters) as a sanity check.
print("Sample combined text:")
first_text = jobs_df['combined_text'].iloc[0]
print(first_text[:200])

In [None]:
# Generate embeddings (this takes ~30 seconds)
print("Encoding jobs into 384-dimensional vectors...")

# Encode every combined title/description string in one call.
# convert_to_tensor=False asks for a non-tensor result; `.shape` is read below.
embeddings = model.encode(
    jobs_df['combined_text'].tolist(),
    show_progress_bar=True,
    convert_to_tensor=False
)

# The check mark and multiplication sign had been stored as mis-decoded bytes
# ("âœ…", "Ã—"); restored to the intended characters.
print(f"\n✅ Generated embeddings: {embeddings.shape}")
print(f"   {embeddings.shape[0]} jobs × {embeddings.shape[1]} dimensions")

## 4. Machine Learning: K-Means Clustering

**K-Means** groups similar jobs into market segments.

### How it works:
1. Randomly initialize K cluster centers
2. Assign each job to nearest center
3. Update centers to mean of assigned jobs
4. Repeat until convergence

In [None]:
# Elbow method: fit K-Means for each candidate K and record the inertia
# (within-cluster sum of squares) so the bend in the curve can be read off.
K_range = range(5, 21)
inertias = []

print("Testing different cluster counts...")
for k in K_range:
    # Same seed and restart count for every K so the runs are comparable.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(embeddings)
    inertia = kmeans.inertia_
    inertias.append(inertia)
    print(f"K={k}: inertia={inertia:.2f}")

# Plot inertia against K.
plt.figure(figsize=(10, 6))
plt.plot(K_range, inertias, 'bo-')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (Within-cluster sum of squares)')
plt.grid(True)
plt.show()

In [None]:
# Final clustering with the chosen K (fixed at 10 here).
n_clusters = 10

print(f"Running K-Means with {n_clusters} clusters...")
# Fit once, then read the per-row assignments off the fitted estimator.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(embeddings)
jobs_df['cluster'] = kmeans.labels_

print("\n Clustering complete!")
print("\nCluster distribution:")
# Number of jobs per cluster, ordered by cluster id.
cluster_distribution = jobs_df['cluster'].value_counts().sort_index()
print(cluster_distribution)

## 5. Visualize Clusters (2D Projection)

Use **PCA** to reduce 384 dimensions → 2 dimensions for visualization

In [None]:
# Reduce dimensions for visualization.
# random_state is pinned to the same seed used for K-Means so the 2D
# projection is reproducible even if scikit-learn picks a randomized SVD
# solver for this input size.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

# Report how much of the original variance the two components keep.
print(f"Explained variance: {pca.explained_variance_ratio_}")
print(f"Total variance captured: {pca.explained_variance_ratio_.sum():.2%}")

In [None]:

# Scatter the 2D projection, one point per job, coloured by cluster id.
pc1 = embeddings_2d[:, 0]
pc2 = embeddings_2d[:, 1]

plt.figure(figsize=(14, 10))
points = plt.scatter(pc1, pc2, c=jobs_df['cluster'], cmap='tab10', s=50, alpha=0.6)

# The colour bar maps colours back to cluster ids.
plt.colorbar(points, label='Cluster ID')
plt.title('Job Market Clusters (PCA Projection)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.grid(True, alpha=0.3)
plt.show()

## 6. NLP: Extract Keywords with TF-IDF

**TF-IDF** (Term Frequency-Inverse Document Frequency) identifies important words in each cluster.

In [None]:
# Analyze each cluster: size, market share, most common titles, and the terms
# with the highest average TF-IDF weight among the cluster's postings.
trend_summary = []
n_keywords = 10

# Fit TF-IDF once on the whole corpus. Previously a vectorizer was fitted per
# cluster with max_features=10, which selects terms by raw corpus frequency and
# returns them alphabetically, so the TF-IDF weights were never used.
# A global fit lets IDF down-weight words that are common across the whole
# market, so each cluster's ranking favours its distinctive terms.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(jobs_df['combined_text'].fillna(''))
vocabulary = vectorizer.get_feature_names_out()

for i in range(n_clusters):
    # Positional mask: TF-IDF rows are in the same order as jobs_df rows.
    in_cluster = (jobs_df['cluster'] == i).to_numpy()
    cluster_data = jobs_df[in_cluster]

    if in_cluster.any():
        # Mean TF-IDF weight of every term over the cluster's postings.
        mean_scores = np.asarray(tfidf_matrix[in_cluster].mean(axis=0)).ravel()
        # Stable sort on the negated scores: highest first, ties deterministic.
        ranked = np.argsort(-mean_scores, kind="stable")[:n_keywords]
        # Drop terms that never occur in this cluster.
        ranked = ranked[mean_scores[ranked] > 0]
        keywords = vocabulary[ranked].tolist()
    else:
        # An empty cluster has no text to rank.
        keywords = []

    # Most common job titles (value_counts ignores missing titles)
    common_titles = cluster_data['title'].value_counts().head(3).index.tolist()

    trend_summary.append({
        "cluster_id": i,
        "size": len(cluster_data),
        "top_titles": common_titles,
        "key_skills": keywords,
        "market_share": round((len(cluster_data) / len(jobs_df)) * 100, 2)
    })

# Convert to DataFrame for better display
trends_df = pd.DataFrame(trend_summary)
trends_df

In [None]:
# Text report: a banner followed by one short paragraph per cluster.
banner = "=" * 60
print(banner)
print("SRI LANKAN IT MARKET TREND REPORT (ML-POWERED)")
print(banner)

for segment in trend_summary:
    cluster_id = segment['cluster_id']
    share = segment['market_share']
    print(f"\n Cluster #{cluster_id} ({share}% of Market)")
    print(f" Size: {segment['size']} jobs")
    # Titles and keywords are lists; flatten them into comma-separated text.
    roles = ', '.join(segment['top_titles'])
    print(f" Typical Roles: {roles}")
    skills = ', '.join(segment['key_skills'])
    print(f" Primary Skills: {skills}")

## 7. Hot Skills Analysis

In [None]:
# Extract all skills from jobs.
# Each non-null `extracted_skills` cell is treated as a comma-separated string.
all_skills = []

if 'extracted_skills' in jobs_df.columns:
    for skills in jobs_df['extracted_skills'].dropna():
        # Normalise and de-duplicate within one posting so a skill listed twice
        # in the same job counts once; the report says "found in N jobs".
        job_skills = {s.strip().lower() for s in str(skills).split(",")}
        job_skills.discard("")
        # Sorted only to keep the accumulation order deterministic.
        all_skills.extend(sorted(job_skills))

# Count skill occurrences (one per job). The explicit dtype keeps an empty
# result well-defined when the column is missing or holds no skills.
skill_counts = pd.Series(all_skills, dtype="object").value_counts()

print(f"\n{'='*60}")
print("TOP 20 HOT SKILLS IN SRI LANKA")
print(f"{'='*60}")

for skill, count in skill_counts.head(20).items():
    print(f"- {skill.upper()}: found in {count} jobs")

In [None]:
# Horizontal bar chart of the 15 most frequent skills.
top_skills = skill_counts.head(15)
bar_rows = range(len(top_skills))
skill_labels = list(map(str.upper, top_skills.index))

plt.figure(figsize=(12, 8))
plt.barh(bar_rows, top_skills.values, color='steelblue')
plt.yticks(bar_rows, skill_labels)
plt.title('Top 15 Most In-Demand Skills in Sri Lanka', fontsize=14, fontweight='bold')
plt.xlabel('Number of Jobs')
# Flip the y axis so the first (largest) entry sits at the top.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)

# Write each count just to the right of its bar.
for row, count in zip(bar_rows, top_skills.values):
    plt.text(count + 5, row, str(count), va='center')

plt.tight_layout()
plt.show()

## 8. Market Share Visualization

In [None]:
# Pie chart of market segments: one wedge per cluster present in the data,
# labelled with the cluster id and its most common job title.
plt.figure(figsize=(12, 8))

cluster_sizes = jobs_df['cluster'].value_counts().sort_index()

# Look titles up by cluster id rather than by row position, and build labels
# from the clusters that actually have rows, so labels and wedges always line up.
titles_by_cluster = dict(zip(trends_df['cluster_id'], trends_df['top_titles']))

labels = []
for cluster_id in cluster_sizes.index:
    titles = titles_by_cluster.get(cluster_id)
    # A cluster whose titles are all missing has an empty list; fall back to a
    # placeholder instead of raising IndexError on [0].
    headline = titles[0] if titles is not None and len(titles) > 0 else "No title"
    labels.append(f"Segment {cluster_id}\n({headline})")

plt.pie(
    cluster_sizes.values,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 9}
)

plt.title('Sri Lankan IT Job Market Distribution', fontsize=14, fontweight='bold')
# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

## 9. Export Results

In [None]:

# Destinations for the two exported files.
output_path = Path("../data/processed/jobs_with_clusters.csv")
trends_output = Path("../data/processed/market_trends.csv")

# Job table (now including the added columns), written without the index.
jobs_df.to_csv(output_path, index=False)
print(f" Saved clustered jobs to: {output_path}")

# Per-cluster summary table, written without the index.
trends_df.to_csv(trends_output, index=False)
print(f" Saved trend summary to: {trends_output}")