In [1]:
import pandas as pd
import random
import numpy as np
import pycountry
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
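# data "schema" (one record per person):
#     id; int, position of the record in raw_data
#     age; int
#     interests; string (a single interest per person)
#     languages; list of strings
#     intent; 0 -> looking for help, 1 -> offering help
#     help_type; string
#     matches; list of ids of compatible people (opposite intent, same help_type, shared language)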

# Language combinations drawn for people looking for help (intent == 0).
# An entry is picked uniformly at random, so a combination that is listed
# twice is twice as likely to be drawn.
languages_options_0 = [
    ["ukrainian"],
    ["ukrainian"],
    ["english"],
    ["english", "russian"],
    ["english", "russian", "ukrainian"],
    ["english", "ukrainian"],
    ["russian", "ukrainian"],
    ["german", "ukrainian"],
]

# Language combinations drawn for people offering help (intent == 1).
# Same weighting rule as above: repeated entries are drawn more often.
languages_options_1 = [
    ["german"],
    ["german", "english"],
    ["german", "english"],
    ["english"],
    ["german", "ukrainian"],
    ["german", "russian"],
    ["german", "english", "ukrainian"],
    ["english", "russian", "ukrainian"],
]

# Pool from which each person's interest is drawn.
interests = [
    "football",
    "basketball",
    "food",
    "music",
    "art",
    "tours",
    "community",
    "languages",
    "games",
    "pets",
    "literature",
    "cooking",
    "coding",
]

# Kinds of help a person can look for or offer.
help_types = [
    "advice",
    "meet people",
    "things to do",
    "hangouts",
    "market place",
    "forum",
]

# Generate the synthetic population.
#
# A dedicated, seeded generator is used instead of the module-level `random`
# functions so that re-running the notebook from a fresh kernel produces the
# same people every time, without touching the global random state.
SEED = 42
NUM_PEOPLE = 3000
rng = random.Random(SEED)

raw_data = []

for x in range(NUM_PEOPLE):
    # 0 -> looking for help, 1 -> offering help
    intent = rng.randint(0, 1)
    raw_data.append({
        "id": x,
        "age": rng.randint(18, 99),
        # a single interest (string), not a list
        "interests": rng.choice(interests),
        # the pool of language combinations depends on the person's intent
        "languages": rng.choice(languages_options_1 if intent else languages_options_0),
        "intent": intent,
        "help_type": rng.choice(help_types)
    })
    
# Make matches: for every person, collect the ids of all other people who
#   * have the opposite intent (offering vs. needing help),
#   * need/offer the same type of help, and
#   * share at least one language.
# A person never matches themselves, because their own intent is never
# different from itself.
#
# Candidates are indexed by help_type once, with their language sets
# precomputed, so each person is only compared against people with the same
# help_type instead of against the whole population. Buckets keep raw_data
# order, so every matches list has the same content and order as a full scan.
candidates_by_help_type = {}
for person in raw_data:
    candidates_by_help_type.setdefault(person['help_type'], []).append(
        (person['id'], person['intent'], frozenset(person['languages']))
    )

for person in raw_data:
    intent = person['intent']
    help_type = person['help_type']
    languages = set(person['languages'])
    matches = [
        other_id
        for other_id, other_intent, other_languages in candidates_by_help_type[help_type]
        # opposite intent, and at least one language in common
        if intent != other_intent and not languages.isdisjoint(other_languages)
    ]
    person['matches'] = matches


# Turn the list of person records into a table (one row per person) and
# persist it without the index column.
# NOTE: list-valued columns (languages, matches) end up in the CSV as the
# string representation of the list.
matches_csv_path = "data_with_matches.csv"
df = pd.DataFrame(data=raw_data)
df.to_csv(matches_csv_path, index=False)
df


Unnamed: 0,id,age,interests,languages,intent,help_type,matches
0,0,59,art,"[english, russian, ukrainian]",1,market place,"[5, 27, 94, 99, 114, 129, 137, 173, 176, 197, ..."
1,1,71,coding,"[german, ukrainian]",1,forum,"[6, 7, 14, 16, 32, 37, 52, 56, 73, 86, 111, 13..."
2,2,54,music,[german],1,hangouts,"[31, 256, 610, 729, 738, 844, 939, 955, 965, 1..."
3,3,22,tours,"[german, russian]",1,meet people,"[25, 43, 62, 135, 183, 184, 190, 274, 279, 286..."
4,4,68,pets,"[english, russian, ukrainian]",1,meet people,"[9, 25, 43, 59, 62, 97, 135, 157, 175, 180, 18..."
...,...,...,...,...,...,...,...
2995,2995,95,art,[english],0,meet people,"[4, 29, 54, 61, 67, 83, 101, 103, 108, 117, 12..."
2996,2996,61,art,"[german, russian]",1,hangouts,"[13, 31, 78, 102, 139, 192, 209, 253, 256, 280..."
2997,2997,71,cooking,"[german, ukrainian]",1,market place,"[5, 27, 94, 99, 114, 129, 137, 173, 197, 203, ..."
2998,2998,86,coding,"[german, ukrainian]",1,meet people,"[9, 25, 43, 59, 62, 97, 135, 157, 175, 180, 18..."


In [3]:
# Turn each person's list of languages into one 0/1 indicator column per language.
mlb = MultiLabelBinarizer()

language_indicators = mlb.fit_transform(df['languages'])
df_langs_separate = pd.DataFrame(language_indicators, columns=mlb.classes_, index=df.index)

# Copy every indicator column onto df under the language's own name.
# Re-running the cell just overwrites the same columns.
for lang_col, indicator in df_langs_separate.items():
    df[lang_col] = indicator

df

Unnamed: 0,id,age,interests,languages,intent,help_type,matches,english,german,russian,ukrainian
0,0,59,art,"[english, russian, ukrainian]",1,market place,"[5, 27, 94, 99, 114, 129, 137, 173, 176, 197, ...",1,0,1,1
1,1,71,coding,"[german, ukrainian]",1,forum,"[6, 7, 14, 16, 32, 37, 52, 56, 73, 86, 111, 13...",0,1,0,1
2,2,54,music,[german],1,hangouts,"[31, 256, 610, 729, 738, 844, 939, 955, 965, 1...",0,1,0,0
3,3,22,tours,"[german, russian]",1,meet people,"[25, 43, 62, 135, 183, 184, 190, 274, 279, 286...",0,1,1,0
4,4,68,pets,"[english, russian, ukrainian]",1,meet people,"[9, 25, 43, 59, 62, 97, 135, 157, 175, 180, 18...",1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,95,art,[english],0,meet people,"[4, 29, 54, 61, 67, 83, 101, 103, 108, 117, 12...",1,0,0,0
2996,2996,61,art,"[german, russian]",1,hangouts,"[13, 31, 78, 102, 139, 192, 209, 253, 256, 280...",0,1,1,0
2997,2997,71,cooking,"[german, ukrainian]",1,market place,"[5, 27, 94, 99, 114, 129, 137, 173, 197, 203, ...",0,1,0,1
2998,2998,86,coding,"[german, ukrainian]",1,meet people,"[9, 25, 43, 59, 62, 97, 135, 157, 175, 180, 18...",0,1,0,1


In [4]:
# Persist df, which by now also carries the per-language indicator columns,
# without the index column.
lang_labels_csv_path = "data_with_lang_labels.csv"
df.to_csv(lang_labels_csv_path, index=False)