# Librairy

In [1]:
import pandas as pd
import numpy as np
from IPython.display import HTML, display, Markdown, Pretty


In [2]:
def throw_double_answer(rows_to_keep, df, key) :
    throw_id = []
    df_id = df.groupby("id_participant").count()
    for id in df_id.index :
        if df_id.loc[id][key] > rows_to_keep :
            throw_id.append(id)

    for id in throw_id :    
        rows = df[df.id_participant == id].shape[0]
        last_rows = df[df.id_participant == id].tail(rows-rows_to_keep)
        df = df[~df.isin(last_rows)].dropna(how='all')
    return df

def can_be_converted_to_integer(column):
    try:
        column.astype(int)
        return True
    except ValueError:
        return False

def throw_bad_answers(df,out):
    #collect lines corresponding in data
    index = []
    for i in range(len(out)):
        index.append(df.index[df["id_participant"] == out[i]].tolist())
    #drop lines
    for i in range(len(index)):
        for j in range(len(index[i])):
            df.drop(index[i][j], inplace=True)

# Data cleaning

In [3]:
#Import the data
data = pd.read_csv(
    "https://ethicallychoice.alwaysdata.net/wp-content/data.csv", sep=";")
print(data.shape)

#drop the empty line
data = data.dropna()
#convert to int all the datas

print(data.shape)
#throw repeted answers from same id
data = throw_double_answer(27, data, "personnage_1")
print(data.shape)

#convert value to int
for col in data.columns :
    if can_be_converted_to_integer(data[col]) :
        data[col] = data[col].astype(int)

data

(5481, 14)
(5400, 14)
(5076, 14)


Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,category
0,1,5,7,3,4,8,2,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
1,5,9,3,6,2,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
2,5,8,3,8,3,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
3,3,5,10,3,9,0,10,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
4,8,9,8,6,7,5,5,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5422,1,5,15,11,1,5,5,3,1d807577b2a921cef642c0f8c7cc626c,22,Homme,175,Non,student
5423,1,4,15,1,9,5,5,3,1d807577b2a921cef642c0f8c7cc626c,22,Homme,175,Non,student
5424,1,9,15,12,8,5,5,3,1d807577b2a921cef642c0f8c7cc626c,22,Homme,175,Non,student
5425,1,2,15,13,5,6,4,3,1d807577b2a921cef642c0f8c7cc626c,22,Homme,175,Non,student


# Data informations

In [4]:
#we separate the 2 different group of participant
student_data = data[data.category == "student"]
online_data = data[data.category == "online"]
#number of women in each group
nb_woman_student = student_data[student_data.sexe ==
                                'Femme'].id_participant.count()
nb_woman_online = online_data[online_data.sexe ==
                              'Femme'].id_participant.count()
#number of men in each group
nb_man_student = student_data[student_data.sexe ==
                              'Homme'].id_participant.count()
nb_man_online = online_data[online_data.sexe == 'Homme'].id_participant.count()
#number of others in each group
nb_other_student = student_data[student_data.sexe ==
                                'Autre'].id_participant.count()
nb_other_online = online_data[online_data.sexe ==
                              'Autre'].id_participant.count()
#age mean in each group
mean_age_student = student_data.age.mean()
mean_age_online = online_data.age.mean()
#age std in each group
std_dev_student = np.std(student_data.age, ddof=1)
std_dev_online = np.std(online_data.age, ddof=1)

#display result in Markdown
display(
    Markdown(
        "<table><thead><tr><th></th><th>Student group</th><th>Online Group</th></tr></thead>"
        + "<tbody><tr><td># of participant</td><td>" +
        str(student_data.shape[0] // 27) + " (" + str(student_data.shape[0]) +
        ") </td><td>" + str(online_data.shape[0] // 27) + " (" +
        str(online_data.shape[0]) + ") </td></tr>" +
        "<tr><td># of women</td><td>" + str(nb_woman_student // 27) + " (" +
        str(nb_woman_student) + ") </td><td>" + str(nb_woman_online // 27) +
        " (" + str(nb_woman_online) + ") </td></tr>" +
        "<tr><td># of men</td><td>" + str(nb_man_student // 27) + " (" +
        str(nb_man_student) + ") </td><td>" + str(nb_man_online // 27) + " (" +
        str(nb_man_online) + ") </td></tr>" + "<tr><td># of other</td><td>" +
        str(nb_other_student // 27) + " (" + str(nb_other_student) +
        ") </td><td>" + str(nb_other_online // 27) + " (" +
        str(nb_other_online) + ") </td></tr>" + "<tr><td>Mean age</td><td>" +
        str(int(mean_age_student)) + "</td><td>" +
        str(int(mean_age_online)) + "</td></tr>" +
        "<tr><td>STD age</td><td>" + str(round(std_dev_student, 2)) +
        "</td><td>" + str(round(std_dev_online, 2)) +
        "</td></tr></tbody></table>"))

<table><thead><tr><th></th><th>Student group</th><th>Online Group</th></tr></thead><tbody><tr><td># of participant</td><td>47 (1269) </td><td>141 (3807) </td></tr><tr><td># of women</td><td>13 (351) </td><td>70 (1890) </td></tr><tr><td># of men</td><td>33 (891) </td><td>68 (1836) </td></tr><tr><td># of other</td><td>1 (27) </td><td>3 (81) </td></tr><tr><td>Mean age</td><td>22</td><td>29</td></tr><tr><td>STD age</td><td>1.77</td><td>12.55</td></tr></tbody></table>

# Filtering on focus questions

In [5]:
#Import the concentration question
conc = pd.read_csv(
    "https://ethicallychoice.alwaysdata.net/wp-content/conc.csv", sep=";")
conc.rename(columns={'id': 'id_participant'}, inplace=True)
conc = throw_double_answer(1, conc, "q1")
#convert value to int
for col in conc.columns :
    if can_be_converted_to_integer(conc[col]) :
        conc[col] = conc[col].astype(int)
conc

Unnamed: 0,q1,q1_rep,q2,q2_rep,q3,q3_rep,id_participant
0,2,2,6,5,6,6,d60c0832fc30e645ca04f074c44b49eb
1,2,2,6,5,6,6,0b3df09027f409a78932dae68f21d84a
2,2,3,6,5,6,6,49af6504b50e11aab6a2174242d05ca1
3,2,2,6,5,6,6,95d0e71312fa611b6f257271e0205d67
4,2,2,6,5,6,6,01276ee53afbdb3982262250788dfc1f
...,...,...,...,...,...,...,...
215,2,2,5,5,6,6,e12954e890f1701804c8ccd73d74ca4d
216,2,2,5,5,6,6,09d213e947c7af32e69d193c5165f646
218,2,1,5,5,6,6,d617476c80b6823ed202f5b86916156d
219,2,2,5,5,6,6,dd8fa41c53a3dc0c05053c11cbb43d39


In [6]:
#collect people who are bad answering
out = {}
for i in conc.index:
    for j in range(1, 4):
        if conc["q" + str(j)][i] != conc["q" + str(j) + "_rep"][i]:
            if conc.id_participant[i] not in out:
                out[conc.id_participant[i]] = 1
            else:
                out[conc.id_participant[i]] = out[conc.id_participant[i]] + 1
out = [k for (k, v) in out.items() if v > 1]
throw_bad_answers(data,out)
data

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,category
0,1,5,7,3,4,8,2,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
1,5,9,3,6,2,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
2,5,8,3,8,3,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
3,3,5,10,3,9,0,10,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
4,8,9,8,6,7,5,5,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,4,5,1,11,8,5,5,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
5396,1,9,11,14,5,3,7,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
5397,4,5,1,11,2,4,6,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
5398,2,4,11,1,7,5,5,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student


In [7]:
#We make the assumption that changing the order does not influence people
data2 = data.copy(deep=True)

for s in ["personnage", "force", "task"]:
    data2[s + "_1"] = data[s + "_2"]
    data2[s + "_2"] = data[s + "_1"]

data = pd.concat([data, data2]).reset_index(drop=True)

data

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,scenarios,task_1,task_2,repetition_question,id_participant,age,sexe,taille,enfant,category
0,1,5,7,3,4,8,2,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
1,5,9,3,6,2,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
2,5,8,3,8,3,1,9,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
3,3,5,10,3,9,0,10,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
4,8,9,8,6,7,5,5,1,d60c0832fc30e645ca04f074c44b49eb,57,Femme,169,Oui,online
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8959,5,4,11,1,8,5,5,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
8960,9,1,14,11,5,7,3,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
8961,5,4,11,1,2,6,4,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student
8962,4,2,1,11,7,5,5,3,d617476c80b6823ed202f5b86916156d,21,Homme,170,Non,student


# Database arrangement 

In [8]:
data = data[['personnage_1','personnage_2','force_1','force_2','task_1','scenarios','age','sexe','id_participant','repetition_question']]
data

Unnamed: 0,personnage_1,personnage_2,force_1,force_2,task_1,scenarios,age,sexe,id_participant,repetition_question
0,1,5,7,3,8,4,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
1,5,9,3,6,1,2,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
2,5,8,3,8,1,3,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
3,3,5,10,3,0,9,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
4,8,9,8,6,5,7,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
...,...,...,...,...,...,...,...,...,...,...
8959,5,4,11,1,5,8,21,Homme,d617476c80b6823ed202f5b86916156d,3
8960,9,1,14,11,7,5,21,Homme,d617476c80b6823ed202f5b86916156d,3
8961,5,4,11,1,6,2,21,Homme,d617476c80b6823ed202f5b86916156d,3
8962,4,2,1,11,5,7,21,Homme,d617476c80b6823ed202f5b86916156d,3


In [9]:
#attribute the right number to the dataset
for i in range(1,3):
    data["personnage_"+str(i)] = [1 if i == 4 else 2 if i == 1 else 3 if i == 8 else 4 if i == 2 else 5 if i == 9 else 6 if i == 5 else 7 if i == 3 else i for i in data["personnage_"+str(i)]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["personnage_"+str(i)] = [1 if i == 4 else 2 if i == 1 else 3 if i == 8 else 4 if i == 2 else 5 if i == 9 else 6 if i == 5 else 7 if i == 3 else i for i in data["personnage_"+str(i)]]


In [10]:
#attribute the right fetures name to the dataset
features_name = {'personnage_1': 'left_char',
                'personnage_2': 'right_char',
                'force_1': 'strength_left',
                'force_2': 'strength_right',
                'task_1': 'value_left',
                'scenarios': 'scenario',
                'sexe': 'sex'}
data.rename(columns=features_name, inplace=True)
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns=features_name, inplace=True)


Unnamed: 0,left_char,right_char,strength_left,strength_right,value_left,scenario,age,sex,id_participant,repetition_question
0,2,6,7,3,8,4,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
1,6,5,3,6,1,2,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
2,6,3,3,8,1,3,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
3,7,6,10,3,0,9,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
4,3,5,8,6,5,7,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1
...,...,...,...,...,...,...,...,...,...,...
8959,6,1,11,1,5,8,21,Homme,d617476c80b6823ed202f5b86916156d,3
8960,5,2,14,11,7,5,21,Homme,d617476c80b6823ed202f5b86916156d,3
8961,6,1,11,1,6,2,21,Homme,d617476c80b6823ed202f5b86916156d,3
8962,1,4,1,11,5,7,21,Homme,d617476c80b6823ed202f5b86916156d,3


In [11]:
#we add value from repetition 1 & 2
data["value_left_rep1"] = -1
data["value_left_rep2"] = -1

for s in range(1, 10):
        for i in data[data.scenario == s].index:
            if data.loc[i].repetition_question == 1:
                value_left_rep1 = data.loc[i].value_left
            elif data.loc[i].repetition_question == 2:
                data.loc[i, 'value_left_rep1'] = value_left_rep1
                value_left_rep2 = data.loc[i].value_left
            elif data.loc[i].repetition_question == 3:
                data.loc[i, 'value_left_rep1'] = value_left_rep1
                data.loc[i, 'value_left_rep2'] = value_left_rep2
data.id_participant.to_csv('data_id.csv', sep=";", index=False)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["value_left_rep1"] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["value_left_rep2"] = -1


Unnamed: 0,left_char,right_char,strength_left,strength_right,value_left,scenario,age,sex,id_participant,repetition_question,value_left_rep1,value_left_rep2
0,2,6,7,3,8,4,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1,-1,-1
1,6,5,3,6,1,2,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1,-1,-1
2,6,3,3,8,1,3,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1,-1,-1
3,7,6,10,3,0,9,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1,-1,-1
4,3,5,8,6,5,7,57,Femme,d60c0832fc30e645ca04f074c44b49eb,1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
8959,6,1,11,1,5,8,21,Homme,d617476c80b6823ed202f5b86916156d,3,7,8
8960,5,2,14,11,7,5,21,Homme,d617476c80b6823ed202f5b86916156d,3,6,7
8961,6,1,11,1,6,2,21,Homme,d617476c80b6823ed202f5b86916156d,3,9,5
8962,1,4,1,11,5,7,21,Homme,d617476c80b6823ed202f5b86916156d,3,3,2


In [12]:
#we set the good order of features name and throw useless columns
data = data.drop(columns=['repetition_question', 'id_participant'])
data = data[['left_char','right_char','strength_left','strength_right','value_left','scenario','value_left_rep1','value_left_rep2','sex','age']]
data

Unnamed: 0,left_char,right_char,strength_left,strength_right,value_left,scenario,value_left_rep1,value_left_rep2,sex,age
0,2,6,7,3,8,4,-1,-1,Femme,57
1,6,5,3,6,1,2,-1,-1,Femme,57
2,6,3,3,8,1,3,-1,-1,Femme,57
3,7,6,10,3,0,9,-1,-1,Femme,57
4,3,5,8,6,5,7,-1,-1,Femme,57
...,...,...,...,...,...,...,...,...,...,...
8959,6,1,11,1,5,8,7,8,Homme,21
8960,5,2,14,11,7,5,6,7,Homme,21
8961,6,1,11,1,6,2,9,5,Homme,21
8962,1,4,1,11,5,7,3,2,Homme,21


In [13]:
#we associate a number to each gender
data["sex"] = [
    0 if i == "Femme" else 1 if i == "Homme" else 2 for i in data["sex"]
]
data

Unnamed: 0,left_char,right_char,strength_left,strength_right,value_left,scenario,value_left_rep1,value_left_rep2,sex,age
0,2,6,7,3,8,4,-1,-1,0,57
1,6,5,3,6,1,2,-1,-1,0,57
2,6,3,3,8,1,3,-1,-1,0,57
3,7,6,10,3,0,9,-1,-1,0,57
4,3,5,8,6,5,7,-1,-1,0,57
...,...,...,...,...,...,...,...,...,...,...
8959,6,1,11,1,5,8,7,8,1,21
8960,5,2,14,11,7,5,6,7,1,21
8961,6,1,11,1,6,2,9,5,1,21
8962,1,4,1,11,5,7,3,2,1,21


In [14]:
data.to_csv('data.csv', index=False, sep=";")

In [15]:
data.head(4482).to_csv('df.csv', index=False, sep=";")