In [1]:
import pickle


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the dataset
filename = 'job.csv'
data = pd.read_csv(filename)

In [4]:

# Clean job descriptions
data['job_description'] = data['job_description'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)


KeyError: 'job_description'

In [5]:

# Clean job descriptions
data['Job Description'] = data['Job Description'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)


In [6]:
# Encode job titles into numeric labels
label_encoder = LabelEncoder()
data['Job Title'] = label_encoder.fit_transform(data['Job Title'])

In [7]:
# Split into features and labels
X = data['Job Description']
y = data['Job Title']

In [8]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Vectorize the job descriptions
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train).toarray()
X_test_vectorized = vectorizer.transform(X_test).toarray()

In [10]:
import numpy as np

def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

In [11]:
def get_neighbors(X_train, y_train, test_row, num_neighbors):
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(test_row, X_train[i])
        distances.append((y_train[i], dist))
    
    # Sort by distance and return the top k neighbors
    distances.sort(key=lambda x: x[1])
    neighbors = distances[:num_neighbors]
    return [neighbor[0] for neighbor in neighbors]

In [12]:
def predict(X_train, y_train, test_row, num_neighbors):
    neighbors = get_neighbors(X_train, y_train, test_row, num_neighbors)
    # Majority vote
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction

In [13]:
# Initialize parameters
num_neighbors = 5
predictions = []

In [14]:
# Make predictions for each test instance
for test_row in X_test_vectorized:
    predicted_label = predict(X_train_vectorized, y_train, test_row, num_neighbors)
    predictions.append(predicted_label)

KeyError: 0

In [15]:
def get_neighbors(X_train, y_train, test_row, num_neighbors):
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(test_row, X_train[i])  # Calculate distance
        distances.append((y_train.iloc[i], dist))  # Access y_train using iloc
    # Sort by distance and return the top k neighbors
    distances.sort(key=lambda x: x[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors

In [16]:
def predict(X_train, y_train, test_row, num_neighbors):
    neighbors = get_neighbors(X_train, y_train, test_row, num_neighbors)
    # Majority vote
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction

In [17]:
# Example usage in prediction
predictions = []
for test_row in X_test_vectorized:
    predicted_label = predict(X_train_vectorized, y_train, test_row, num_neighbors)
    predictions.append(predicted_label)

In [18]:
# Define the model parameters to save
model_data = {
    'vectorizer': vectorizer,
    'label_encoder': label_encoder,
    'X_train': X_train_vectorized,
    'y_train': y_train
}

In [19]:
# Save the model to a .pk file
with open('knn_model.pk', 'wb') as file:
    pickle.dump(model_data, file)

In [20]:
print("Model saved successfully as knn_model.pk")


Model saved successfully as knn_model.pk


In [21]:
# Load the model from the .pk file
with open('knn_model.pk', 'rb') as file:
    loaded_model = pickle.load(file)

In [22]:
# Accessing the loaded components
loaded_vectorizer = loaded_model['vectorizer']
loaded_label_encoder = loaded_model['label_encoder']
loaded_X_train = loaded_model['X_train']
loaded_y_train = loaded_model['y_train']

print("Model loaded successfully.")

Model loaded successfully.


In [23]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np

In [24]:

# Step 1: Load the dataset
data = pd.read_csv('job.csv')  # Replace with your actual CSV file

In [25]:
# Step 2: Clean job descriptions
data['Job Description'] = data['Job Description'].str.lower().str.replace(r'[^a-z\s]', '', regex=True)


In [26]:
# Step 3: Vectorize job descriptions
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Job Description']).toarray()  # Convert to a NumPy array
y = data['Job Title']  # Target variable

In [27]:
# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Step 5: Define the custom k-NN functions
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

def get_neighbors(X_train, y_train, test_row, num_neighbors):
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(test_row, X_train[i])
        distances.append((y_train.iloc[i], dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors

def predict(X_train, y_train, test_row, num_neighbors):
    neighbors = get_neighbors(X_train, y_train, test_row, num_neighbors)
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction

In [29]:
# Step 6: Train the k-NN model and make predictions
num_neighbors = 5
predictions = []
for test_row in X_test:
    predicted_label = predict(X_train, y_train, test_row, num_neighbors)
    predictions.append(predicted_label)

In [30]:
# Step 7: Save the model and vectorizer to pickle files
with open('model.pk', 'wb') as model_file:
    pickle.dump((X_train, y_train), model_file)  # Save training data for prediction use

In [31]:
with open('vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)  # Save the vectorizer

In [32]:
print("Model and vectorizer have been saved successfully.")

Model and vectorizer have been saved successfully.
