In [2]:
import pandas as pd
import random
import numpy as np
import pycountry
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from sklearn.preprocessing import MultiLabelBinarizer

In [59]:
# Generate a synthetic population, cross-match help seekers with help offerers,
# and persist the result as a CSV.
#
# data "schema" (one dict per person in raw_data, one row per person in df):
#     id; int, position of the person in raw_data
#     age; int, drawn from AGE_MIN..AGE_MAX inclusive
#     interests; string, a single entry drawn from `interests`
#     languages; list of strings
#     intent; 0 -> looking for, 1 -> offering help
#     help_type; string
#     matches; list of ids of the compatible people (see _is_match)

# Fixed seed so a fresh "Restart & Run All" regenerates the same people and CSV.
SEED = 42
N_PEOPLE = 100
AGE_MIN, AGE_MAX = 18, 99

# Dedicated generator: seeding it leaves the global `random` state untouched.
rng = random.Random(SEED)

# Language sets drawn for people with intent 0 (looking for help).
languages_options_0 = [
    ['ukrainian'],
    ['ukrainian'],
    ['english'],
    ['english', 'russian'],
    ['english', 'russian', 'ukrainian'],
    ['english', 'ukrainian'],
    ['russian', 'ukrainian'],
    ['german', 'ukrainian'],
]
# Language sets drawn for people with intent 1 (offering help).
languages_options_1 = [
    ['german'],
    ['german', 'english'],
    ['german', 'english'],
    ['english'],
    ['german', 'ukrainian'],
    ['german', 'russian'],
    ['german', 'english', 'ukrainian'],
    ['english', 'russian', 'ukrainian'],
]

interests = ["football", "basketball", "food", "music", "art", "tours", "community", "languages", "games", "pets", "literature", "cooking", "coding"]
help_types = ["advice", "meet people", "things to do", "hangouts", "market place", "forum"]


def _is_match(person, other_person):
    """Return True when the two people are compatible.

    Compatible means: opposite intent (one offers, one seeks), the same
    help_type, and at least one language in common. A person never matches
    themselves because their own intent is not different from itself.
    """
    if person['intent'] == other_person['intent']:
        return False
    if person['help_type'] != other_person['help_type']:
        return False
    return not set(person['languages']).isdisjoint(other_person['languages'])


raw_data = []

for person_id in range(N_PEOPLE):
    intent = rng.randint(0, 1)
    language_pool = languages_options_1 if intent else languages_options_0
    raw_data.append({
        "id": person_id,
        "age": rng.randint(AGE_MIN, AGE_MAX),
        "interests": rng.choice(interests),
        # Copy the drawn option: choice() returns the list object itself, so
        # without the copy every person with that option would share one list.
        "languages": list(rng.choice(language_pool)),
        "intent": intent,
        "help_type": rng.choice(help_types),
    })

# Compare every person against every other person and record the compatible ids.
for person in raw_data:
    person['matches'] = [
        other_person['id']
        for other_person in raw_data
        if _is_match(person, other_person)
    ]


df = pd.DataFrame(raw_data)
df.to_csv("data_with_matches.csv", index=False)
df


Unnamed: 0,id,age,interests,languages,intent,help_type,matches
0,0,25,games,"[german, english]",1,market place,"[1, 61, 83, 87, 94]"
1,1,91,basketball,"[english, ukrainian]",0,market place,"[0, 9, 13, 18, 24, 50, 53, 75, 81, 88]"
2,2,93,football,[german],1,market place,[]
3,3,21,basketball,"[english, russian, ukrainian]",1,advice,"[10, 28, 51, 52, 77, 79, 86, 93]"
4,4,72,football,[german],1,meet people,[91]
...,...,...,...,...,...,...,...
95,95,60,literature,"[english, russian, ukrainian]",0,forum,"[11, 21, 29, 42, 48, 55, 58, 64, 71, 76, 84]"
96,96,74,football,"[english, russian, ukrainian]",0,hangouts,"[12, 44, 60, 74]"
97,97,63,basketball,"[english, ukrainian]",0,meet people,"[30, 40, 46, 90, 98]"
98,98,85,basketball,"[german, english]",1,meet people,"[22, 26, 36, 47, 70, 91, 97]"


In [60]:
# Turn the list-valued `languages` column into one 0/1 indicator column per language.
mlb = MultiLabelBinarizer()

# One row per row of df (same index), one column per language label found by the binarizer.
language_flags = mlb.fit_transform(df['languages'])
df_langs_separate = pd.DataFrame(language_flags, index=df.index, columns=mlb.classes_)

# Attach every indicator column to df under the language's own name.
for lang_col, flags in df_langs_separate.items():
    df[lang_col] = flags

df

Unnamed: 0,id,age,interests,languages,intent,help_type,matches,english,german,russian,ukrainian
0,0,25,games,"[german, english]",1,market place,"[1, 61, 83, 87, 94]",1,1,0,0
1,1,91,basketball,"[english, ukrainian]",0,market place,"[0, 9, 13, 18, 24, 50, 53, 75, 81, 88]",1,0,0,1
2,2,93,football,[german],1,market place,[],0,1,0,0
3,3,21,basketball,"[english, russian, ukrainian]",1,advice,"[10, 28, 51, 52, 77, 79, 86, 93]",1,0,1,1
4,4,72,football,[german],1,meet people,[91],0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95,95,60,literature,"[english, russian, ukrainian]",0,forum,"[11, 21, 29, 42, 48, 55, 58, 64, 71, 76, 84]",1,0,1,1
96,96,74,football,"[english, russian, ukrainian]",0,hangouts,"[12, 44, 60, 74]",1,0,1,1
97,97,63,basketball,"[english, ukrainian]",0,meet people,"[30, 40, 46, 90, 98]",1,0,0,1
98,98,85,basketball,"[german, english]",1,meet people,"[22, 26, 36, 47, 70, 91, 97]",1,1,0,0


In [61]:
# Save df (including the per-language indicator columns added to it) without the index column.
df.to_csv(path_or_buf="data_with_lang_labels.csv", index=False)