In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import pickle

In [5]:
# Load the three CSV inputs in one pass: training X, training y, then test X.
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train/x_train.csv',
        'data/train/y_train.csv',
        'data/test/x_test.csv',
    )
)

In [6]:
import pandas as pd
import ast  # To safely evaluate strings as Python literals

# Score assigned to each action type when building the interaction matrix
action_scores = {'view': 1, 'apply': 5}


def _parse_list_cell(cell):
    """Return a list-valued cell, parsing it first when it is stored as text."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


def process_row(df, scores=None, keep="first"):
    """Build a session x job interaction matrix from per-session action logs.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``session_id``, ``job_ids`` and ``actions``. Each
        ``job_ids`` / ``actions`` cell is a list, or the string repr of a list
        (e.g. ``"[12, 7]"`` and ``"['view', 'apply']"``); the two lists are
        paired element by element, and extra elements of the longer one are
        ignored.
    scores : dict, optional
        Mapping from action name to numeric score. Defaults to the
        module-level ``action_scores``.
    keep : {"first", "max"}, default "first"
        What to do when the same job shows up several times for one session.
        ``"first"`` keeps the score of the first occurrence (so a job that is
        viewed and later applied to keeps the *view* score); ``"max"`` keeps
        the highest score seen.

    Returns
    -------
    pandas.DataFrame
        One row per session that has at least one (job, action) pair, one
        column per job id in sorted order. Cells without an interaction are
        left as NaN (they are not filled with 0).

    Raises
    ------
    ValueError
        If ``keep`` is not ``"first"`` or ``"max"``.
    KeyError
        If an action that needs scoring is missing from ``scores``.
    """
    if keep not in ("first", "max"):
        raise ValueError(f"keep must be 'first' or 'max', got {keep!r}")
    if scores is None:
        scores = action_scores

    interaction_matrix = {}
    # Walk the three columns in lockstep instead of iterrows(), which builds a
    # Series for every row.
    for session_id, raw_job_ids, raw_actions in zip(
        df['session_id'], df['job_ids'], df['actions']
    ):
        job_ids = _parse_list_cell(raw_job_ids)
        actions = _parse_list_cell(raw_actions)

        for job_id, action in zip(job_ids, actions):
            # Created lazily, so sessions with no interactions get no row.
            session_scores = interaction_matrix.setdefault(session_id, {})
            if job_id not in session_scores:
                session_scores[job_id] = scores[action]
            elif keep == "max":
                session_scores[job_id] = max(session_scores[job_id], scores[action])

    # Outer dict keys become the index (sessions), inner keys the columns (jobs).
    interaction_df = pd.DataFrame.from_dict(interaction_matrix, orient='index')

    # Sort the job columns so the column order is deterministic.
    return interaction_df.reindex(sorted(interaction_df.columns), axis=1)

In [7]:
# One interaction matrix per split, built by the same routine.
interaction_df_train, interaction_df_test = map(process_row, (X_train, X_test))

# Stack the train rows above the test rows. pd.concat aligns on column labels
# with an outer join by default, so a job present in only one split shows up
# as NaN for the other split's rows.
merged_df = pd.concat((interaction_df_train, interaction_df_test))
merged_df

Unnamed: 0,0,1,2,3,5,6,7,8,9,10,...,27218,27235,27241,27245,27273,27274,27292,27308,27364,27366
0,,,,,,,,,,,...,,,,,,,,,,
369,,,,,,,,,,,...,,,,,,,,,,
854,,,,,,,,,,,...,,,,,,,,,,
1180,,,,,,,,,,,...,,,,,,,,,,
2627,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1810,,,,,,,,,,,...,,,,,,,,,,
1813,,,,,,,,,,,...,,,,,,,,,,
1814,,,,,,,,,,,...,,,,,,,,,,
1817,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Dense view of the interaction matrix; NaN marks a cell with no interaction.
data = merged_df.to_numpy()
mask = ~np.isnan(data)

# Build the CSR matrix straight from the observed cells instead of first
# materialising a second dense, zero-filled copy of the whole array.
row_idx, col_idx = np.nonzero(mask)
observed_values = data[row_idx, col_idx]
sparse_matrix = sp.csr_matrix((observed_values, (row_idx, col_idx)), shape=data.shape)
# Building from a dense array never stores zeros; drop any explicit ones so the
# result is the same matrix.
sparse_matrix.eliminate_zeros()

# Save sparse matrix
with open("data/sparse_matrix.pkl", "wb") as f:
    pickle.dump(sparse_matrix, f)

# Number of non-NaN values
num_non_nan = np.count_nonzero(mask)

total_elements = data.size

# Distribution of values (count of NaN, 1, and 5).
# Every cell is either NaN or observed, so no second isnan pass is needed, and
# the 1/5 counts only have to look at the observed values.
nan_count = total_elements - num_non_nan
count_1 = np.count_nonzero(observed_values == 1)
count_5 = np.count_nonzero(observed_values == 5)

sparsity = (total_elements - num_non_nan) / total_elements * 100

# Print results
print("Number of non-NaN values:", num_non_nan)
print("NaN count:", nan_count)
print("Count of 1s:", count_1)
print("Count of 5s:", count_5)
print(f"Sparsity percentage: {sparsity:.2f}%")


Number of non-NaN values: 132951
NaN count: 379535798
Count of 1s: 84271
Count of 5s: 48680
Sparsity percentage: 99.96%


In [15]:
import json
from mistralai import Mistral
import os

In [None]:
import json
from mistralai import Mistral
import os

# Read the job listings JSON. The encoding is pinned to UTF-8 so the accented
# text decodes the same way on every platform instead of depending on the
# locale's default encoding.
with open("data/job_listings.json", 'r', encoding="utf-8") as file:
    data = json.load(file)
# The top-level keys are used as job ids by the later cells.
job_ids = list(data.keys())
print(len(data))

21917


In [28]:
# Show listings 300-309, each followed by a line of 50 "@" as a visual separator.
for job_id in job_ids[300:310]:
    print(data[job_id], "@" * 50, sep="\n")

TITLE
Product Owner Digital Workplace F/H

SUMMARY
La tribu Digital Workplace est constituée d'exploitants, administrateurs, experts dans les technologies du poste de travail.
Au sein de cette tribu, vous interviendrez comme Product Owner et vous serez le relai du rendu du service et des décisions liées au produit.
Pour cela, vos missions seront :
-          Contribuer à la définition et à la diffusion de la roadmap produit, à la planification des itérations à venir, travailler avec l'équipe 
-          Être responsable du backlog sur les activités, affiner et prioriser avec les parties prenantes métiers
-          Gérer le contact et l'alignement des parties prenantes internes et externes
-          Vérifier que les livraisons du produit correspondent aux besoins des utilisateurs
-          Agréger, analyser et communiquer sur l'avancement de la réalisation et les résultats obtenus
-          Contribuer a? la définition et la mise en oeuvre des activités de conduit

In [33]:
# NOTE(review): bare attribute reference - nothing is called, so no environment
# variable is read here; the cell only displays the function object.
os.getenv

<function os.getenv(key, default=None)>

In [None]:
from dotenv import load_dotenv
import json
from mistralai import Mistral
import os

# Load variables from the local .env file into the process environment.
load_dotenv(".env")

# UTF-8 is pinned so the listings decode the same way on every platform.
with open("data/job_listings.json", 'r', encoding="utf-8") as file:
    data = json.load(file)
job_ids = list(data.keys())

api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    # Fail before any request is sent rather than on the first API call.
    raise RuntimeError("MISTRAL_API_KEY is not set; add it to .env or the environment")
client = Mistral(api_key=api_key)
# Get job descriptions for each job ID, extract only main information
job_descriptions = {}

# The system message is the same for every listing, so it is built once.
# This is a regular (non-raw) string: the \u00e9 below is an escape sequence,
# so the text sent to the model contains the actual accented character.
system_message = {
    "role": "system",
    "content": """You are a tool designed to extract and summarize key information from job descriptions. Your output will be used for embeddings, so focus on precision and conciseness. Use bullet points instead of complete sentences where appropriate.
                    write the result in french, don't use special character like: \u00e9 or other.
                    For each job description, extract and organize the following information:
                    Job Title:
                    Extract the job title.
                    Job Summary:
                    Summarize the job description in 3-4 concise bullet points.
                    Required Skills:
                    List the key technical and soft skills required for the job.
                    Responsibilities:
                    Highlight the main tasks and responsibilities associated with the role.
                    Qualifications:
                    List the educational background, certifications, and experience required.
                    Company Value Proposition:
                    Highlight any benefits, perks, or unique offerings mentioned by the company.
                    Company Culture:
                    Describe the company culture, if mentioned (e.g., work environment, values, or mission).
                    If any of the above information is not available, skip that section. Prioritize clarity and brevity in your output.
                    """
}

# Position in job_ids at which processing starts; earlier listings are skipped.
# NOTE(review): presumably a resume point after an interrupted run - confirm.
# The output file is opened with 'w', so an existing job_descriptions.json is
# overwritten; keep a copy of earlier results if they are still needed.
START_INDEX = 1968

# Open a JSON file to write the responses
with open('job_descriptions.json', 'w') as json_file:
    json_file.write('[\n')  # Start of JSON array

    try:
        for i, job_id in enumerate(job_ids[START_INDEX:]):
            prompt = [
                system_message,
                {
                    "role": "user",
                    "content": "Here is a job description: " + data[job_id]
                },
            ]

            resp = client.chat.complete(
                model="mistral-large-latest",
                messages=prompt
            )

            summary = resp.choices[0].message.content
            job_descriptions[job_id] = summary

            # The separator is written *before* every record except the first,
            # so the array never ends with a dangling comma - whether the loop
            # finishes, fails, or is interrupted.
            if i > 0:
                json_file.write(',\n')
            json.dump({job_id: summary}, json_file)
            # Push each record to disk right away so an interruption loses
            # nothing that was already paid for.
            json_file.flush()
    finally:
        # Always close the array so the file stays valid JSON even when the
        # run is stopped early (e.g. KeyboardInterrupt).
        json_file.write('\n]')

print(job_descriptions)

KeyboardInterrupt: 

In [11]:
# Spot-check: print the raw listing text stored under the key "1".
print(data["1"])

TITLE
Ingénieur Système

SUMMARY
Nous recherchons un Ingénieur Système pour notre client. Le candidat idéal devra posséder des compétences variées en environnement système, virtualisation, Cloud et DevOps, et être capable de travailler dans des environnements hybrides et publics.
Ce poste offre une opportunité de contribuer activement à la gestion des infrastructures et à l'automatisation des processus au sein d'une équipe dynamique.
Gestion des Environnements Système :
Administrer et maintenir des systèmes Linux et Windows.
Gérer des environnements de virtualisation (VMware, hyperviseurs).
Cloud et Virtualisation :
Travailler avec des environnements Cloud Public (Azure, GCP).
Assurer l’intégration et la gestion des ressources Cloud dans des environnements hybrides.
Middleware et Bases de Données :
Administrer des serveurs middleware (Apache, Tomcat).
Gérer des bases de données SQL Server et assurer leur performance et leur sécurité.
DevOps et Automatisation :
Me