In [3]:
import pandas as pd
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from scipy.sparse import save_npz

print("--- Starting Model Pre-computation ---")

# --- 1. Load Cleaned Data ---
print("Loading datasets...")
# Candidate records are read from the previously cleaned CSV export.
candidates_df = pd.read_csv('candidates_cleaned.csv')
# Internship listings are read first, then every missing cell is replaced
# with an empty string so later text processing never sees NaN.
internships_df = pd.read_csv('internships.csv')
internships_df = internships_df.fillna(value='')

# --- 2. Fit and Save TF-IDF Vectorizer ---
print("Fitting TF-IDF Vectorizer...")
# Build one shared corpus from candidate and internship skills so that both
# sides are later vectorized against the same vocabulary.
# The combined series is normalized to plain strings: a missing value in
# either column would otherwise reach the vectorizer as NaN, which
# TfidfVectorizer rejects as an invalid document. fillna runs before astype
# so that missing cells become '' rather than the literal text 'nan'.
corpus = pd.concat(
    [candidates_df['skills_text'], internships_df['skills']],
    ignore_index=True,
).fillna('').astype(str)

# Unigrams and bigrams, lower-cased, with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
tfidf_vectorizer.fit(corpus)

# Persist the fitted vectorizer so it can be reloaded without refitting.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("✅ TF-IDF Vectorizer saved to 'tfidf_vectorizer.pkl'")


# --- 3. Transform and Save Internship TF-IDF Vectors ---
print("\nTransforming internship skills with TF-IDF...")
# Project the internship skills text onto the vocabulary learned by the
# already-fitted vectorizer; the result is a sparse matrix.
internship_tfidf_vectors = tfidf_vectorizer.transform(
    raw_documents=internships_df['skills'],
)

# Write the matrix in SciPy's sparse .npz format instead of densifying it,
# which keeps memory use low.
save_npz(
    file='internship_tfidf_vectors.npz',
    matrix=internship_tfidf_vectors,
)
print("✅ Internship TF-IDF vectors saved to 'internship_tfidf_vectors.npz'")


# --- 4. Pre-compute and Save Internship Embeddings ---
# Column of internships.csv holding the free-text description that is
# embedded below. The name must match the CSV header exactly (it is
# case-sensitive).
description_column = 'ifSkillsorPerksMissingUseThis'
# Validate the column before loading the model, so a wrong header name fails
# immediately with a readable message instead of after the model load.
if description_column not in internships_df.columns:
    raise KeyError(
        f"Column {description_column!r} not found in internships.csv; "
        f"available columns: {list(internships_df.columns)}"
    )

print("\nLoading Sentence Transformer model...")
# Load the pre-trained sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Encoding internship descriptions...")
# Coerce every cell to str so the encoder only ever receives text, even if
# pandas parsed some values in this column as numbers.
internship_embeddings = model.encode(
    internships_df[description_column].astype(str).tolist()
)

# Save the embeddings to a file
np.save('internship_embeddings.npy', internship_embeddings)
print("✅ Internship embeddings saved to 'internship_embeddings.npy'")

print("\n--- Pre-computation Complete! ---")

--- Starting Model Pre-computation ---
Loading datasets...
Fitting TF-IDF Vectorizer...
✅ TF-IDF Vectorizer saved to 'tfidf_vectorizer.pkl'

Transforming internship skills with TF-IDF...
✅ Internship TF-IDF vectors saved to 'internship_tfidf_vectors.npz'

Loading Sentence Transformer model...
Encoding internship descriptions...
✅ Internship embeddings saved to 'internship_embeddings.npy'

--- Pre-computation Complete! ---
