# Import packages

In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn import linear_model # packages for the logistic regression function to plot the logistic regression 
from sklearn.linear_model import LogisticRegression # packages for the logistic regression function to plot the logistic regression 
from scipy import stats
from scipy.stats.stats import pearsonr # Pearson's correlation
from pandas.core.frame import DataFrame as DF
from copy import copy as copy
import operator as operator
import pylab
import scipy.io as spi

# Show graphs in Notebook
%matplotlib inline

# linking to files in notebooks
from IPython.display import FileLink, FileLinks

# Plotting tools
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%pylab inline
figsize(15, 15);

# Set up interface with R
import rpy2
%reload_ext rpy2.ipython

# GLM in python
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as sm_stats

# Make it easy to set and find values in a multi-index DF
idx = pd.IndexSlice

# Useful to quickly extract csv files
import glob

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


# Homemade functions

In [2]:
def ExtractConf(FullData, Bin, Correct, Response):
    """Count, per participant, the trials that fall in one confidence/response cell.

    Parameters
    ----------
    FullData : DataFrame with the columns 'CorrectKey', 'Conf_Bin',
        'participant' and 'Response'.
    Bin : value of 'Conf_Bin' to select.
    Correct : value of 'CorrectKey' to select.
    Response : value of 'Response' to count.

    Returns
    -------
    Series indexed by participant holding the number of matching trials.
    Participants without any matching trial are absent; when nobody has a
    matching trial the Series is empty (the previous `.loc[slice(None), Response]`
    lookup raised KeyError in that situation).
    """
    selected = (
        (FullData['CorrectKey'] == Correct)
        & (FullData['Conf_Bin'] == Bin)
        & (FullData['Response'] == Response)
    )
    # Filtering on the response first means a response that never occurs
    # simply yields an empty result rather than a failed label lookup.
    return FullData.loc[selected].groupby('participant').size()

def GenConfBins(FullData, Correct):
    """Build the per-participant table of binned response counts for one stimulus type.

    Parameters
    ----------
    FullData : DataFrame with the columns 'participant', 'CorrectKey',
        'Conf_Bin' and 'Response'.
    Correct : value of 'CorrectKey' to select ('left' or 'right' in this notebook).

    Returns
    -------
    list of lists, one row per participant (rows ordered by ascending
    participant id) and ``2 * Bins`` columns, where ``Bins`` is the largest
    'Conf_Bin' value:

    * columns ``0 .. Bins-1``: 'left' responses, highest confidence bin first;
    * columns ``Bins .. 2*Bins-1``: 'right' responses, lowest confidence bin first.

    Every cell starts at 1 and the observed count is added on top of that
    (presumably padding so that no cell is zero for the downstream fit --
    confirm against the MATLAB side).
    """
    Bins = int(FullData['Conf_Bin'].max())

    # Map each participant id to its row. For ids 1..N this is identical to the
    # former `participant - 1` indexing, but it also works for ids with gaps.
    participant_ids = sorted(FullData['participant'].unique())
    row_of = {participant: row for row, participant in enumerate(participant_ids)}

    OutputArray = [[1] * (Bins * 2) for _ in participant_ids]

    for Bin in range(Bins):
        # Left (S1) responses: (Bins - Bin) walks from the highest bin downwards.
        left_counts = ExtractConf(FullData, Bins - Bin, Correct, 'left')
        for participant, count in left_counts.items():
            OutputArray[row_of[participant]][Bin] = 1 + int(count)

        # Right (S2) responses: lowest bin first, stored after the left columns.
        right_counts = ExtractConf(FullData, Bin + 1, Correct, 'right')
        for participant, count in right_counts.items():
            OutputArray[row_of[participant]][Bins + Bin] = 1 + int(count)

    return OutputArray

In [3]:
def zscore(data, old_var):
    """Standardise one column: subtract its mean and divide by its standard deviation.

    The mean and standard deviation come from the column's own ``.mean()`` and
    ``.std()``. Returns the result as a NumPy array in the original row order.
    """
    column = data[old_var]
    centred = column - column.mean()
    scaled = centred / column.std()
    return scaled.values

In [4]:
def participant_zscore(data, old_var, sort_var):
    """Standardise a column within each group defined by ``sort_var``.

    Parameters
    ----------
    data : DataFrame containing ``old_var`` and ``sort_var``.
    old_var : name of the column to standardise.
    sort_var : name of the grouping column (or list of columns).

    Returns
    -------
    NumPy array with one value per row of ``data``, in the same row order as
    ``data``: (value - group mean) / group standard deviation.

    The group statistics are broadcast back with ``transform`` and combined
    positionally. The earlier version relied on index alignment between a
    duplicated index and the per-group statistics, which can reorder the rows,
    so the returned array only matched ``data`` when it was already sorted
    by ``sort_var``.
    """
    grouped = data.groupby(sort_var)[old_var]
    group_mean = grouped.transform('mean').values
    group_std = grouped.transform('std').values
    return (data[old_var].values - group_mean) / group_std

# Making the dataframe

<font size=4>[Skip this section and load pre-cleaned and combined data](#load_data)</font>

## Loading the data

In [5]:
# Root folder of the experiment data; the per-participant CSV files live in
# the "With name" sub-folder.
data_path = "C:\\Users\\user\\Desktop\\실험 데이터\\Data\\Main"

# Build the pattern from data_path (it used to be repeated as a second literal)
# and sort the matches: glob returns files in arbitrary order, while the
# cleaning step below takes some column names from data_frames[0].
file_list = sorted(glob.glob(data_path + "\\With name\\*.csv"))
if not file_list:
    raise FileNotFoundError("No CSV files found in " + data_path + "\\With name")

# One DataFrame per CSV file
data_frames = [pd.read_csv(file, encoding = "ISO-8859-1") for file in file_list]

## Cleaning dataframe

In [6]:
clean = []
for data_frame in data_frames:
    # Columns are selected by position. Index.union replaces the `|` operator,
    # whose use as a set union on Index objects is deprecated in pandas.
    # NOTE(review): two of the slices are taken from data_frames[0] rather than
    # from the current frame -- confirm that this is intentional.
    # Selection used originally:
    #     columns[13:21] | data_frames[0].columns[23:25] | columns[27:32] | columns[32:33]
    columns = (
        data_frame.columns[13:16]
        .union(data_frames[0].columns[21:23])
        .union(data_frame.columns[32:33])
        .union(data_frames[0].columns[23:25])
    )
    print (columns)
    # The main trials start on the row after the first 'space' key press on the
    # fourth instruction screen; .index[0] raises IndexError if there is none.
    main_trials_start = data_frame.loc[data_frame['Instructions4Response.keys'] == 'space'].index[0]+1
    # .loc slices by label, so this assumes the default 0..n-1 row index from read_csv.
    clean.append(data_frame.loc[main_trials_start:, columns].copy())

Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructio

In [7]:
# Stack the cleaned per-file frames into one long table. No ignore_index is
# passed, so every frame keeps its own row labels in the combined frame.
data = pd.concat(clean)
# Quick check of the resulting columns and of the participant ids that were loaded
print (data.columns)
print (data['participant'].unique())

Index(['Confidence', 'Confidence.RT', 'CorrectKey',
       'Instructions4Response.keys', 'Instructions4Response.rt', 'RT',
       'Response', 'participant'],
      dtype='object')
[10 11 12 13 14 15 16 17 18 19  1 20 21 22 23 24 25 26 27 28 29  2 30 31
 32 33  3  4  5  6  7  8  9]


## Preparing the data for matlab

### Binning the data at the participant level

In [8]:
# Order the trials by participant id and show the resulting id order
data = data.sort_values(by='participant', ascending=True)
print(data['participant'].unique())

# Conf_Bin starts out empty and is filled in one participant at a time
data['Conf_Bin'] = np.nan

# Each participant's own confidence ratings are cut into 3 bins labelled 1-3,
# so the bin edges differ between participants.
for participant in data['participant'].unique():
    rows = data['participant'] == participant
    binned = pd.cut(data.loc[rows, 'Confidence'], 3, labels=range(1, 4))
    data.loc[rows, 'Conf_Bin'] = binned.values

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]


In [9]:
# Sanity check: participant ids present and number of trials in each confidence bin
print(data['participant'].unique())
conf_bin_counts = data['Conf_Bin'].value_counts()
print(conf_bin_counts)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]
2.0    2458
3.0    2368
1.0    1774
Name: Conf_Bin, dtype: int64


### Preparing the dataframes

In [10]:
# Disabled step: renumber the participants to consecutive ids 1, 2, 3, ...
# in order of appearance. Left commented out.
# participant_nr = 1
# for participant in data.participant.unique():
#     print (participant)
#     data.loc[data['participant']==participant, 'participant'] = participant_nr
#     participant_nr += 1
# data['participant'].unique()
# Show the participant ids and the prepared frame before binning the counts
print (data['participant'].unique())
print (data)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]
     Confidence  Confidence.RT CorrectKey Instructions4Response.keys  \
19         0.92          2.170      right                        NaN   
152        1.00          1.071      right                        NaN   
153        0.68          2.170      right                        NaN   
154        1.00          1.021       left                        NaN   
155        1.00          1.654       left                        NaN   
156        0.84          3.338       left                        NaN   
157        0.70          2.820      right                        NaN   
158        0.16          2.688       left                        NaN   
159        0.38          1.904       left                        NaN   
160        0.76          1.354      right                        NaN   
161        0.62          1.388      right                        NaN   
151        0.58          1.138    

In [11]:
# Binned response counts for the trials whose correct key was 'left' (S1)
data_S1 = GenConfBins(data, 'left')
# print (data_S1)

In [12]:
# Binned response counts for the trials whose correct key was 'right' (S2)
data_S2 = GenConfBins(data, 'right')
# Display the table: one row per participant
data_S2

[[13, 20, 12, 18, 19, 31],
 [19, 8, 17, 17, 5, 48],
 [1, 11, 3, 6, 82, 10],
 [1, 6, 7, 9, 64, 28],
 [4, 13, 9, 11, 40, 26],
 [8, 21, 20, 9, 38, 15],
 [2, 6, 8, 11, 29, 50],
 [3, 7, 28, 25, 37, 8],
 [6, 37, 8, 13, 25, 18],
 [1, 8, 12, 33, 33, 25],
 [5, 17, 9, 23, 23, 35],
 [19, 2, 32, 40, 4, 15],
 [9, 11, 20, 29, 26, 12],
 [4, 8, 12, 14, 36, 24],
 [6, 5, 7, 15, 37, 30],
 [3, 23, 12, 25, 43, 10],
 [7, 21, 7, 6, 41, 28],
 [18, 6, 21, 19, 9, 34],
 [8, 1, 29, 26, 6, 37],
 [10, 27, 6, 6, 39, 25],
 [1, 8, 13, 21, 55, 4],
 [3, 17, 15, 31, 33, 15],
 [1, 7, 12, 21, 34, 34],
 [20, 16, 5, 7, 31, 40],
 [13, 11, 14, 21, 9, 27],
 [15, 22, 7, 10, 27, 25],
 [8, 4, 15, 8, 14, 52],
 [12, 3, 16, 17, 7, 50],
 [9, 16, 9, 15, 27, 21],
 [4, 9, 33, 37, 17, 16],
 [17, 25, 1, 4, 35, 36],
 [30, 2, 2, 5, 2, 64],
 [5, 8, 3, 4, 25, 65]]

In [13]:
# Bundle both count tables under the keys 'S1' and 'S2'
data_output = {'S1': data_S1, 'S2': data_S2}

### Checking that the output looks sensible

In [14]:
data_output

{'S1': [[47, 20, 16, 4, 8, 4],
  [54, 7, 20, 4, 5, 8],
  [1, 46, 5, 5, 40, 2],
  [5, 33, 9, 13, 32, 5],
  [18, 44, 15, 6, 20, 6],
  [37, 33, 18, 4, 7, 2],
  [13, 32, 16, 15, 13, 17],
  [13, 34, 35, 15, 6, 1],
  [35, 49, 8, 7, 5, 1],
  [14, 28, 20, 20, 13, 5],
  [27, 29, 15, 10, 13, 6],
  [51, 5, 38, 4, 1, 1],
  [23, 28, 35, 9, 7, 3],
  [23, 29, 23, 13, 17, 9],
  [29, 23, 18, 15, 21, 6],
  [13, 29, 30, 11, 11, 2],
  [24, 43, 7, 2, 20, 6],
  [43, 9, 36, 11, 1, 5],
  [44, 6, 30, 11, 3, 11],
  [38, 40, 3, 1, 13, 4],
  [4, 40, 27, 24, 13, 2],
  [13, 36, 19, 17, 9, 4],
  [10, 25, 27, 15, 18, 8],
  [49, 22, 3, 3, 7, 9],
  [55, 14, 22, 13, 6, 7],
  [62, 25, 3, 4, 9, 3],
  [36, 13, 26, 10, 10, 16],
  [45, 7, 31, 12, 7, 5],
  [38, 32, 19, 12, 8, 6],
  [33, 19, 28, 13, 2, 1],
  [39, 34, 2, 2, 7, 10],
  [74, 5, 2, 1, 1, 24],
  [31, 24, 5, 9, 16, 17]],
 'S2': [[13, 20, 12, 18, 19, 31],
  [19, 8, 17, 17, 5, 48],
  [1, 11, 3, 6, 82, 10],
  [1, 6, 7, 9, 64, 28],
  [4, 13, 9, 11, 40, 26],
  [8, 21, 20,

### Saving the output

In [15]:
# Write the 'S1' and 'S2' count tables to a .mat file (scipy.io.savemat stores
# each dictionary key as a variable of that name).
spi.savemat('C:\\Users\\user\\Desktop\\metaExpdata.mat', data_output)
# Next step: open the saved file from MATLAB

## Loading the data from Matlab

In [16]:
# # Loading the matlab files
# meta_conf = spi.loadmat("C:\\Users\\user\\Desktop\\metaExpdata.mat")

# # Extracting variables
# meta_da = meta_conf['fit']['meta_da'].tolist()[0][0][0]
# da = meta_conf['fit']['da'].tolist()[0][0][0]
# mratio = meta_conf['fit']['Mratio'].tolist()[0][0][0]

# #Generating the DF
# meta_data = DF([da, meta_da, mratio])
# meta_data = meta_data.transpose()
# meta_data.columns = ['Da', 'Meta_Da', 'Mratio']
# meta_data['participant'] = np.append(range(1, 31), range(71, 73))
# meta_data['Mrato_Log'] = np.log(meta_data['Mratio'])
# meta_data['Group'] = 'Monolingual'


In [17]:
# meta_data = pd.concat([meta_data], ignore_index=True)