In [100]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [19]:
def process_log(file, sid):
  """
  Split one subject's log file into its significant parts.

  The file is read as ';'-separated, iso-8859-1 encoded text: the first four
  lines are taken as the subject information, the fifth line as the header
  of the event table, and the last line is skipped as a footer.

  Parameters:
  -file: path of the log file
  -sid: subject id, written to a new 'sid' column on every event row

  Returns a dict of dataframes:
  -subject: the four header-less subject-information rows
  -nasa: nasa questionnaire answers, one row per 'tangram nr' and one column
         per questionnaire item
  -final_pos: the rows whose 'time' is 'solution' (final configuration and
              correctness for each puzzle)
  -moves: all remaining rows, i.e. the moves performed by the subject
  """

  entry = {}

  #subject info
  entry['subject'] = pd.read_csv(file, sep=';', header=None, encoding='iso-8859-1', nrows=4)

  # skipfooter is only supported by the python engine; naming it explicitly
  # avoids the ParserWarning pandas emits when it has to fall back from 'c'
  df = pd.read_csv(file, sep=';', header=4, skipfooter=1, encoding='iso-8859-1',
                   engine='python')
  df['sid'] = sid

  #nasa questionnaire
  nasa = ['mental demand', 'performance', 'effort', 'frustration']
  is_nasa = df['time'].isin(nasa)
  nasa_values = df.loc[is_nasa, ['tangram nr', 'time', 'item']].copy()
  # 'item' also holds piece names on the move rows, so the column is read as
  # text; the answers must be numeric before pivot_table can average them
  nasa_values['item'] = pd.to_numeric(nasa_values['item'], errors='coerce')
  entry['nasa'] = pd.pivot_table(nasa_values, values='item', index=['tangram nr'],
                                 columns=['time'])

  #final configuration
  is_solution = df['time'] == 'solution'
  entry['final_pos'] = df[is_solution]

  #moves list: whatever is neither a questionnaire answer nor a solution row
  entry['moves'] = df[~is_nasa & ~is_solution]

  return entry

def create_moves_dataframe(logs):
  """
  Build a single dataframe with the moves of every log.

  The 'moves' frames of all logs are concatenated, 'time' is cast to float,
  rows whose action is 'moved to correct location' are removed, and every
  run of consecutive rows with the same subject, puzzle, piece and action is
  collapsed to its last row. The kept 'position' is therefore the final
  position of that run.

  logs: list of dicts as returned by process_log (only 'moves' is read).
  Returns a frame with the columns 'sid', 'tangram nr', 'item', 'action',
  'position' and 'time' and a fresh 0..n-1 index.
  """

  # NOTE: no initial ('from') position is stored, only the final one
  # TODO might require adding the step-value

  moves_df = pd.concat([d['moves'] for d in logs])
  moves_df = moves_df.astype({'time': float})
  moves_df = moves_df[['sid','tangram nr','item','action','position','time']]
  moves_df = moves_df.loc[moves_df['action'] != 'moved to correct location']

  # A row ends its run when the following row differs in any of these keys.
  # 'sid' is part of the test so that the last run of one subject is never
  # merged into the first run of the next subject. The final row compares
  # against NaN and is always kept.
  run_ends = ((moves_df['action'].shift(-1) != moves_df['action'])
              | (moves_df['tangram nr'].shift(-1) != moves_df['tangram nr'])
              | (moves_df['item'].shift(-1) != moves_df['item'])
              | (moves_df['sid'].shift(-1) != moves_df['sid']))

  return moves_df.loc[run_ends].reset_index(drop=True)

def create_solution_dataframe(logs):
  """
  Gather the 'final_pos' frames of all logs into one solution table.

  'item' is renamed to 'solved' (and cast to int) and 'action' to 'config';
  the result keeps only 'sid', 'tangram nr', 'solved' and 'config' and gets
  a fresh 0..n-1 index.
  """
  final_frames = [log['final_pos'] for log in logs]
  renamed = pd.concat(final_frames).rename(columns={'item': 'solved', 'action': 'config'})
  typed = renamed.astype({'solved': 'int'})
  wanted = ['sid', 'tangram nr', 'solved', 'config']
  return typed[wanted].reset_index(drop=True)

def create_subjects_dataframe(logs):
  """
  Build one row per log from the second column of its 'subject' frame.

  The four values of that column are labelled, in order, 'age', 'gender',
  'experience' and 'background'. Rows follow the order of logs and are
  indexed 0..n-1.
  """
  value_columns = []
  for log in logs:
    raw = pd.DataFrame(log['subject'].values)
    value_columns.append(raw[1])

  # one column per subject -> transpose to one row per subject
  wide = pd.concat(value_columns, axis=1)
  subjects_df = wide.T.reset_index(drop=True)

  labels = {0: 'age', 1: 'gender', 2: 'experience', 3: 'background'}
  return subjects_df.rename(columns=labels)



In [93]:
def baseround(x, base=5):
    """Round x to a multiple of base, using the built-in round() on x/base."""
    multiples = round(x / base)
    return base * multiples

def get_grid_value(x,y,tgn):
  """
  Map a position to a cell of the 4x4 grid laid over the solution area.

  x, y: coordinates of the piece (integers or floats)
  tgn: puzzle number, one of 1, 2, 3, 4; it selects the x and y limits of
       the solution area

  Returns the cell number row*4 + column (0..15, as a float) or -1 when the
  position lies outside the limits. Both limits are inclusive: a point lying
  exactly on the upper limit belongs to the last column / row.
  """
  solution_limits = {1:[(-260,120),(-120,140)],2:[(-280,-20),(-80,200)],3:[(-320,60),(-140,140)],4: [(-280,0),(-200,300)]}
  (xmin, xmax), (ymin, ymax) = solution_limits[tgn]
  xstep = (xmax-xmin)/4
  ystep = (ymax-ymin)/4

  # plain comparisons instead of "in range(...)": a range only contains
  # integers, so every non-integer coordinate used to be reported as outside
  if not (xmin <= x <= xmax and ymin <= y <= ymax):
    return -1

  # the upper limit itself would floor-divide to index 4, which is not a
  # valid column/row and would alias a cell of the next row; clamp it to 3
  xgrid = min((x-xmin) // xstep, 3.0)
  ygrid = min((y-ymin) // ystep, 3.0)

  return ygrid*4 + xgrid

def create_grid_df(df):
  """
  Reduce the moves to one row per handled piece and place it on the grid.

  Rows whose action is 'turned 45 degrees right' are discarded, then only
  the last row of every run of consecutive rows about the same piece (same
  subject and puzzle) is kept. The kept rows are numbered from 1 within each
  subject/puzzle run ('step'), 'position' is split into 'x', 'y' and 'rot',
  and 'grid_val' receives the result of get_grid_value.

  df must provide the columns 'sid', 'tangram nr', 'item', 'action',
  'position' and 'time'; the last three are not part of the result.
  """
  import ast  # local import: only needed to parse the position strings

  step_df = df[df["action"] != 'turned 45 degrees right']

  # last row of each piece run; 'sid' is compared as well so that a run never
  # continues from one subject into the next
  run_ends = ((step_df['item'].shift(-1) != step_df['item'])
              | (step_df['tangram nr'].shift(-1) != step_df['tangram nr'])
              | (step_df['sid'].shift(-1) != step_df['sid']))
  step_df = step_df.loc[run_ends].reset_index(drop=True)

  # the step counter restarts whenever the puzzle or the subject changes
  new_block = ((step_df['tangram nr'] != step_df['tangram nr'].shift(1))
               | (step_df['sid'] != step_df['sid'].shift(1)))
  step_df['step'] = step_df.groupby(new_block.cumsum()).cumcount() + 1

  # NOTE(security): the positions come from log files, so they are parsed with
  # ast.literal_eval (literals only) instead of eval (arbitrary code)
  pos = [ast.literal_eval(p) for p in step_df['position']]
  step_df[['x','y','rot']] = pd.DataFrame(pos, index=step_df.index, columns=['x','y','rot'])
  step_df = step_df.drop(['position','time','action'], axis=1)

  #fix wrong logs: for these subjects y of puzzle 4 is lowered by 70
  diff_logs = [102957,104401,102644,100444,92356,93250]
  needs_fix = (step_df['tangram nr'] == 4) & step_df['sid'].isin(diff_logs)
  step_df['y'] = step_df['y'].where(~needs_fix, step_df['y'] - 70)

  step_df['grid_val'] = [get_grid_value(x, y, tgn)
                         for x, y, tgn in zip(step_df['x'], step_df['y'], step_df['tangram nr'])]
  return step_df

def current_position(df,last_step):
  """
  Return where every piece was last seen up to and including last_step.

  df needs the columns 'step', 'item', 'grid_val' and 'rot'. The result maps
  'small triangle', 'middle triangle', 'big triangle', 'square' and
  'parallelogram' to a list of (grid_val, rot) tuples, one tuple per piece
  that appears in the selected rows. The two big and the two small triangles
  share one entry each.
  """
  # the numbered triangles are interchangeable, so they share a key
  shared_name = {'big triangle 1': 'big triangle', 'big triangle 2': 'big triangle',
                 'small triangle 1': 'small triangle', 'small triangle 2': 'small triangle'}

  # steps start at 1
  seen = df.loc[df['step'] < last_step + 1]

  position = {}
  for kind in ('small triangle', 'middle triangle', 'big triangle', 'square', 'parallelogram'):
    position[kind] = []

  for piece in seen['item'].unique():
    latest = seen.loc[seen['item'] == piece].iloc[-1]
    key = shared_name.get(piece, piece)
    position.get(key).append((latest['grid_val'], latest['rot']))

  return position


In [21]:
dir_logs = './logs'
log_suffix = '_log.csv'
logs = []

# os.listdir returns the names in arbitrary order; sorting makes the order of
# the subjects (and of every table derived from it) the same on every run
for f in sorted(os.listdir(dir_logs)):
  # skip the 'old_logs' entry, the reformatted copies and anything that is not
  # named <sid>_log.csv
  if f == 'old_logs' or 'reformated' in f or not f.endswith(log_suffix):
    continue
  file = os.path.join(dir_logs, f)

  # cut the exact suffix: str.strip("_log.csv") removes any of those
  # *characters* from both ends, which only works by accident for numeric ids
  sid = int(f[:-len(log_suffix)])
  logs.append(process_log(file, sid))

moves_df = create_moves_dataframe(logs)
sol_df = create_solution_dataframe(logs)

  return func(*args, **kwargs)


# states_df #
Position of each piece after moves 4, 8, 12, and 16.

In [94]:
step_df = create_grid_df(moves_df)

puzzles = [1,2,3,4]
players = step_df['sid'].unique()

state_columns = ['sid','tangram nr','step','small triangle','middle triangle','big triangle','square','parallelogram']

# Collect one record per (player, puzzle, snapshot step) and build the frame
# once at the end: DataFrame.append copied the whole frame on every call and
# no longer exists in pandas 2.0
state_rows = []
for player in players:
  for pzn in puzzles:
    partial = step_df.loc[(step_df['tangram nr']==pzn) & (step_df['sid']==player)]
    for step in [4,8,12,16]:
      # pos maps each of the five piece kinds to its list of (grid_val, rot)
      pos = current_position(partial, step)
      state_rows.append({'sid': player, 'tangram nr': pzn, 'step': step, **pos})

states_df = pd.DataFrame(state_rows, columns=state_columns)

# Creating the frequency datasets #


In [101]:
# Rebuild the per-step table (step, x, y, rot, grid_val) from the collapsed moves
grid_df = create_grid_df(moves_df)

In [96]:
# Fold rotations that cannot be told apart: the square repeats every 90
# degrees, the parallelogram every 180
is_square = grid_df['item'] == 'square'
is_parallelogram = grid_df['item'] == 'parallelogram'
grid_df['rot'] = grid_df['rot'].where(~is_square, grid_df['rot'] % 90)
grid_df['rot'] = grid_df['rot'].where(~is_parallelogram, grid_df['rot'] % 180)

# Short labels; the numbered triangles share one label
di = {'small triangle 1':'SMALL-T','small triangle 2':'SMALL-T','middle triangle':'MIDDLE-T',
      'big triangle 1':'BIG-T','big triangle 2':'BIG-T','square':'SQUARE','parallelogram':'PARALL'}
# names without an entry in di are kept as they are, so running this cell a
# second time no longer turns the already renamed items into None
grid_df['item'] = grid_df['item'].map(lambda name: di.get(name, name))

In [97]:
### landmark counts: number of times a certain action has been taken between step 0 and phase ###
# to_csv does not create missing folders, so make sure the output folder exists
os.makedirs('./datasets', exist_ok=True)
grid_df.to_csv('./datasets/steps.csv')

for phase in [4,8,12,16]:
    # one row per (puzzle, piece, cell, rotation) seen up to this phase
    sums = grid_df.loc[grid_df.step<=phase].groupby(['tangram nr','item','grid_val','rot']).size().reset_index(name='counts')
    sums.to_csv(f'./datasets/landmark_counts_{phase}.csv')

In [111]:
### landmark str: relative number of times a certain action has been taken from step phase-4 up to (not including) step phase ###
gdf = pd.read_csv('./datasets/steps.csv')
ignored_cells = gdf.grid_val.isin([-1,0,3,4,12,13])
gdf = gdf[~ignored_cells]
sums=[]
for phase in [5,9,13,17]:
    in_window = (gdf['step'] < phase) & (gdf['step'] >= phase-4)
    window_counts = gdf.loc[in_window].groupby(['tangram nr','item','grid_val','rot']).size()
    rsums = pd.DataFrame({'counts': window_counts}).reset_index()
    # strength = share of the puzzle's counts that falls on this landmark
    puzzle_totals = rsums.groupby('tangram nr').counts.transform('sum')
    rsums['strength'] = rsums.counts/puzzle_totals
    rsums.to_csv(f'./datasets/landmark_str_{phase}.csv')



In [112]:
# rsums holds the table built in the last loop iteration (phase 17)
rsums

Unnamed: 0,tangram nr,item,grid_val,rot,counts,strength
0,1,BIG-T,5.0,45.0,3,0.049180
1,1,BIG-T,5.0,135.0,2,0.032787
2,1,BIG-T,5.0,315.0,2,0.032787
3,1,BIG-T,6.0,225.0,2,0.032787
4,1,BIG-T,9.0,0.0,4,0.065574
...,...,...,...,...,...,...
93,4,SMALL-T,5.0,270.0,1,0.023256
94,4,SMALL-T,8.0,0.0,1,0.023256
95,4,SMALL-T,15.0,0.0,4,0.093023
96,4,SQUARE,2.0,0.0,1,0.023256


In [115]:

pieces = step_df.item.unique()
position_columns = ['sid','tangram nr','step','item','grid_val','rot']
# the numbered triangles are interchangeable and are stored under one name
piece_alias = {'big triangle 1': 'big triangle', 'big triangle 2': 'big triangle',
               'small triangle 1': 'small triangle', 'small triangle 2': 'small triangle'}

# Collect the records in a list and build the frame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas 2.0
position_rows = []
for player in players:
  for pzn in puzzles:
    for step in [4,8,12,16]:
      partial = step_df.loc[(step_df['tangram nr']==pzn) & (step_df['sid']==player) & (step_df['step']<step+1)]

      for p in pieces:
        #find last position
        piece_rows = partial.loc[partial['item']==p]
        if piece_rows.empty:
          # piece not touched yet: outside the grid, no rotation
          grid_val, rot = -1, 0
        else:
          item_pos = piece_rows.iloc[-1]
          grid_val, rot = item_pos['grid_val'], item_pos['rot']
        position_rows.append({'sid': player, 'tangram nr': pzn, 'step': step,
                              'item': piece_alias.get(p, p), 'grid_val': grid_val, 'rot': rot})

last_positions_df = pd.DataFrame(position_rows, columns=position_columns)
cnt = len(position_rows)  # number of rows written, as counted before


In [120]:
# Take an explicit copy of the filtered rows: the 'item' column is rewritten
# below, and assigning into a slice of last_positions_df only passed silently
# because the chained-assignment warning is switched off
gdf = last_positions_df[~last_positions_df.grid_val.isin([-1,0,3,4,12,13])].copy()
di = {'small triangle':'SMALL-T','middle triangle':'MIDDLE-T',
      'big triangle':'BIG-T','square':'SQUARE','parallelogram':'PARALL'}
gdf['item'] = gdf['item'].map(di)
sums=[]
for phase in [5,9,13,17]:
    # rows whose step lies in [phase-4, phase)
    in_window = (gdf['step']>=phase-4) & (gdf['step']<phase)
    rsums = pd.DataFrame({'counts':gdf.loc[in_window].
                        groupby(['tangram nr','item','grid_val','rot']).size()}).reset_index()
    # strength = share of the puzzle's counts that falls on this landmark
    rsums['strength'] = rsums.counts/rsums.groupby('tangram nr').counts.transform('sum')
    rsums.to_csv(f'./datasets/landmark_str_{phase}.csv')

In [121]:
# Ten strongest landmarks of tangram 4 in the current rsums table
rsums.loc[rsums['tangram nr'] ==4].sort_values(by='strength', ascending = False).head(10)

Unnamed: 0,tangram nr,item,grid_val,rot,counts,strength
126,4,SMALL-T,15.0,0.0,29,0.161111
129,4,SQUARE,15.0,90.0,25,0.138889
112,4,BIG-T,10.0,135.0,20,0.111111
115,4,MIDDLE-T,6.0,225.0,19,0.105556
108,4,BIG-T,6.0,45.0,19,0.105556
117,4,PARALL,9.0,90.0,13,0.072222
121,4,SMALL-T,1.0,135.0,13,0.072222
123,4,SMALL-T,2.0,135.0,12,0.066667
119,4,PARALL,9.0,270.0,10,0.055556
113,4,BIG-T,11.0,270.0,3,0.016667
