In [19]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Hypothetical refugee data (replace this with actual data)
refugee_data = pd.DataFrame({
    'Refugee_ID': [1, 2, 3],
    'Skills': ['Programming, Data Analysis', 'Teaching, Language Translation', 'Engineering, Project Management'],
    'Language': ['English, French', 'Arabic, Spanish', 'English, German'],
    # Add more relevant columns for education, experience, etc.
})

In [5]:
# Hypothetical job/course/community data (replace this with actual data)
jobs_data = pd.DataFrame({
    'Job_ID': [101, 102, 103],
    'Description': ['Python Developer', 'Language Course in Spanish', 'Engineering Community'],
    'Skills_Required': ['Python, Data Analysis', 'Spanish', 'Engineering'],
    'Language_Required': ['English', 'Spanish', 'German'],
    # Add more relevant columns for location, community features, etc.
})

In [14]:
# Split the data into training and test sets
X = refugee_data[['Skills', 'Language']]  # Features
y = jobs_data[['Description']]   # Labels (jobs, courses, communities)

# Preprocess skills and language columns to convert text data to numerical format
vectorizer_skills = CountVectorizer(tokenizer=lambda x: x.split(', '))
skills_matrix = vectorizer_skills.fit_transform(refugee_data['Skills'])
numerical_skills = pd.DataFrame(skills_matrix.toarray(), columns=vectorizer_skills.get_feature_names_out())

vectorizer_language = CountVectorizer(tokenizer=lambda x: x.split(', '))
language_matrix = vectorizer_language.fit_transform(refugee_data['Language'])
numerical_language = pd.DataFrame(language_matrix.toarray(), columns=vectorizer_language.get_feature_names_out())



In [15]:
# Merge the numerical data
numerical_data = pd.concat([numerical_skills, numerical_language], axis=1)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Use this numerical data for training the model
knn_model = NearestNeighbors(n_neighbors=1)
knn_model.fit(numerical_data)

In [22]:
# TF-IDF Vectorization for text data
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))
refugee_skills_tfidf = tfidf.fit_transform(refugee_data['Skills'])
opportunity_skills_tfidf = tfidf.transform(jobs_data['Skills_Required'])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(refugee_skills_tfidf, opportunity_skills_tfidf)

# Find best matching opportunities for each refugee
for i, row in enumerate(similarity_matrix):
    best_match_index = row.argmax()
    best_opportunity = jobs_data.iloc[best_match_index]
    print(f"Best matching opportunity for Refugee ID {i+1}: {best_opportunity['Description']}")


Best matching opportunity for Refugee ID 1: Python Developer
Best matching opportunity for Refugee ID 2: Python Developer
Best matching opportunity for Refugee ID 3: Engineering Community


