In [31]:

import pandas as pd

# Path to the raw, pipe-delimited job export
file_path = 'collection_jobs.csv'


def clean_job_postings(frame):
    """Return a cleaned copy of a raw job-postings frame.

    Cleaning steps, in this order:

    1. Strip leading/trailing whitespace from every string value. Only
       values that are actually ``str`` are touched, so numbers or other
       objects stored in a text column are kept as they are.
    2. Drop every row that has at least one missing value.
    3. Drop exact duplicate rows. This runs after the stripping step so
       that rows differing only by surrounding whitespace count as
       duplicates.

    Parameters
    ----------
    frame : pandas.DataFrame
        The raw postings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows (original index labels kept).
    """
    def strip_text(value):
        # Leave NaN / numbers / other objects alone
        return value.strip() if isinstance(value, str) else value

    cleaned = frame.copy()
    for column in cleaned.columns:
        series = cleaned[column]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            cleaned[column] = series.map(strip_text)

    return cleaned.dropna().drop_duplicates()


# Load the CSV with '|' as the delimiter, skipping malformed lines, then clean and save it.
# Only file/parse problems are reported and swallowed; anything else is a bug and propagates.
try:
    data = pd.read_csv(file_path, delimiter='|', on_bad_lines='skip')
    print("Data loaded successfully.")

    data = clean_job_postings(data)

    # Saving the cleaned data to a new file for full viewing
    cleaned_file_path = 'cleaned_collection_jobs.csv'
    data.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned data has been saved to '{cleaned_file_path}'.")

except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print("An error occurred:", e)




Data loaded successfully.
Cleaned data has been saved to 'cleaned_collection_jobs.csv'.


In [33]:
import pandas as pd

# Read back the cleaned postings file
file_path = 'cleaned_collection_jobs.csv'

# Replace any missing 'description' with an empty string so every row holds text
df = pd.read_csv(file_path).assign(
    description=lambda frame: frame['description'].fillna('')
)


In [35]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

!pip install sentence-transformers faiss-cpu




In [37]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance is passed to retrieve_jobs for queries
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every job description in a single call (one embedding per row of df)
embeddings = model.encode(df.loc[:, 'description'].to_list())


In [38]:
import faiss
import numpy as np

# FAISS works on C-contiguous float32 matrices. Coerce explicitly instead of
# relying on the encoder's output type; this copies only when a conversion is needed.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
embedding_dim = embedding_matrix.shape[1]  # Typically 384 for 'all-MiniLM-L6-v2'

# Exact (brute-force) index using L2 distance: smaller distance = closer match
index = faiss.IndexFlatL2(embedding_dim)

# Row i of the index corresponds to row i of the embedded descriptions
index.add(embedding_matrix)
print(f"Indexed {index.ntotal} descriptions.")


Indexed 2278 descriptions.


In [39]:

def retrieve_jobs(query, model, index, top_n=5, jobs=None):
    """Return the job postings whose embeddings are closest to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query.
    model : object
        Embedding model exposing ``encode(list_of_texts)``; it should be the
        one used to build ``index``.
    index : object
        FAISS-style index exposing ``ntotal`` and ``search(vectors, k)``.
    top_n : int, default 5
        Maximum number of postings to return. Values larger than the number
        of indexed vectors are clamped; values <= 0 give an empty result.
    jobs : pandas.DataFrame, optional
        Frame whose row positions match the vectors in ``index``. Defaults to
        the notebook-level ``df``.
        NOTE(review): assumes the frame has not been reordered or filtered
        since the index was built -- confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description`` and ``distance``, in the order
        returned by the index search.
    """
    if jobs is None:
        jobs = df  # fall back to the notebook-level frame

    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with index -1, which .iloc would read as "last row".
    k = max(0, min(int(top_n), int(index.ntotal)))

    if k > 0:
        # FAISS expects a C-contiguous float32 matrix of query vectors
        query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
        distances, indices = index.search(query_embedding, k)

        # Drop any remaining -1 placeholders
        found = indices[0] >= 0
        positions = indices[0][found]
        hit_distances = distances[0][found]
    else:
        positions = np.empty(0, dtype=np.int64)
        hit_distances = np.empty(0, dtype=np.float32)

    # Fetch the results
    results = jobs.iloc[positions].copy()
    results['distance'] = hit_distances
    return results[['title', 'description', 'distance']]

In [55]:
# Smoke-test the retriever with a sample query
query = "nurse"
results = retrieve_jobs(query, model=model, index=index)

# Header line followed by the matching rows
print("Top job matches for query:", results, sep="\n")


import ollama




Top job matches for query:
                                                 title  \
735            Specialist nurse (m/f/d) skilled worker   
760   Certified nursing specialist (m/f/d) immediately   
905                                 Nurse (m/f/d) care   
2166                                 Nursing assistant   
2165     Nursing specialist on night duty, night watch   

                                            description  distance  
735   Are you looking for a professional reorientati...  1.025185  
760   From craftsmen to production and warehouse emp...  1.025958  
905   Are you looking for a professional reorientati...  1.044787  
2166  The focus of our work at the AWO is always the...  1.108999  
2165  The focus of our work at AWO is always people,...  1.118136  


In [67]:
import ollama
model_name = "llama2"  # or any model name you're using


# Simple prompt to test text generation capability with clearer instructions
test_prompt = "give jobs in thuringia. Do not include any metadata or extra information."

# Initialize output text
test_generated_text = "Test Summary:\n"  # Adding a label to the generated text

# Query the model and collect its answer. The generate call lives inside the
# try block so that a failing request is reported by the handler below
# instead of aborting the cell with a traceback.
try:
    # Use Ollama's `generate` function to query the model.
    test_response = ollama.generate(model=model_name, prompt=test_prompt)

    print("Raw response from Ollama for test prompt:")
    print(f"Type of test_response: {type(test_response)}")  # Check the type of the response

    if isinstance(test_response, dict):
        # Plain-dict responses: check if 'response' key exists in the dictionary
        if 'response' in test_response:
            test_generated_text += test_response['response'].strip() + "\n"
        else:
            print("No 'response' key found in the dictionary.")
    elif isinstance(getattr(test_response, 'response', None), str):
        # Object-style responses that expose the generated text as `.response`
        test_generated_text += test_response.response.strip() + "\n"
    else:
        print("Unexpected response format. Please check the model output.")

except Exception as e:
    # Best-effort demo cell: report the problem and still print the label below
    print(f"An error occurred while generating test text: {e}")

# Output the test generated text
print(test_generated_text)


Raw response from Ollama for test prompt:
Type of test_response: <class 'dict'>
Test Summary:
1. Software Developer
2. Mechanical Engineer
3. Marketing Manager
4. Sales Representative
5. IT Consultant
6. Operations Manager
7. Human Resources Manager
8. Graphic Designer
9. Web Developer
10. Accountant

