In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Move two levels up from the notebook's folder so the relative data/output
# paths used in the cells below resolve correctly.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell lands in the same place instead of climbing two more
# levels on every execution.
try:
    _NOTEBOOK_START_DIR
except NameError:
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../../'))
os.getcwd()

'/Users/xehu/Desktop/Team Process Mapping/team-process-map/feature_engine'

In [3]:
# Raw input that the generated chat-level outputs are compared against.
input_data = pd.read_csv(
    "./tpm-data/cleaned_data/test_data/multi_task_TINY_cols_renamed.csv",
    encoding='utf-8',
)

# Package Task 1 Tests
---
## "Case 1": Basic Test
In this test, we simply check that everything still works after the columns are renamed ("Case 1").
Here, we use a test dataset whose conversation ID, speaker ID, message, and timestamp columns
are named differently from the defaults, and ensure that nothing breaks.

## "Case 2": .ngroup() feature
- Group by ["gameId", "roundId", "stageId"] and assert that the number of resulting groups matches
	the number of unique stageId values (which will confirm that it worked)

## "Case 3": Complex hierarchical grouping
- (3a) ID: stageId; cumulative: True; within_task: False
- (3b) ID: stageId; cumulative: True; within_task: True
- (3c) ID: roundId; cumulative: True; within_task: True

## Improper examples:
- grouping keys: ["roundId", "stageId"], ID: "gameId"

# Testing dataset output paths:
1. Case 1 (output/chat/tiny_multi_task_PT1_level_chat.csv)
2. Case 2 (output/chat/tiny_multi_task_case2_level_chat.csv)
3. Case 3a (output/chat/tiny_multi_task_case3a_level_chat.csv)
4. Case 3b (output/chat/tiny_multi_task_case3b_level_chat.csv)
5. Case 3c (output/chat/tiny_multi_task_case3c_level_chat.csv)
6. Improper Case (output/chat/tiny_multi_task_improper_level_chat.csv)

In [4]:
# Load the chat-level output produced for each test case, in the same order
# as the names on the left-hand side.
(
    case1_chatdf,
    case2_chatdf,
    case3a_chatdf,
    case3b_chatdf,
    case3c_chatdf,
    impropercase_chatdf,
) = [
    pd.read_csv(output_path)
    for output_path in (
        "output/chat/tiny_multi_task_PT1_level_chat.csv",
        "output/chat/tiny_multi_task_case2_level_chat.csv",
        "output/chat/tiny_multi_task_case3a_level_chat.csv",
        "output/chat/tiny_multi_task_case3b_level_chat.csv",
        "output/chat/tiny_multi_task_case3c_level_chat.csv",
        "output/chat/tiny_multi_task_improper_level_chat.csv",
    )
]

In [5]:
# Case 1: the output must contain exactly as many rows as the input
assert len(case1_chatdf) == len(input_data)

In [6]:
# Case 2: "conversation_num" and "stageId" must have equally many distinct values.
# dropna=False counts a missing value as one distinct entry, as drop_duplicates() does.
n_conversations = case2_chatdf["conversation_num"].nunique(dropna=False)
n_stages = case2_chatdf["stageId"].nunique(dropna=False)
assert n_conversations == n_stages

In [29]:
# Case 3: build one table of per-stage row counts for 3a, 3b and the raw input

# Raw input: non-null speakerId entries per stageId
input_data_rowcounts = input_data.groupby("stageId")["speakerId"].count().reset_index()

# 3a: rows per conversation, relabelled so the key lines up with the input's stageId
case_3a_rowcounts = (
    case3a_chatdf.groupby("conversation_num")["speakerId"]
    .count()
    .reset_index()
    .rename(columns={"conversation_num": "stageId"})
)

# 3b: same counting for the within_task variant
case_3b_rowcounts = (
    case3b_chatdf.groupby("conversation_num")["speakerId"]
    .count()
    .reset_index()
    .rename(columns={"conversation_num": "stageId"})
)

# 3a next to the original counts; the suffixed count columns get readable names
case3a_orig_comparison = pd.merge(
    case_3a_rowcounts, input_data_rowcounts, how="inner", on="stageId"
).rename(columns={"speakerId_x": "3a", "speakerId_y": "orig"})

# Add 3b; its count column is still called "speakerId" because nothing clashes with it
case3ab_orig_comparison = pd.merge(
    case_3b_rowcounts, case3a_orig_comparison, how="inner", on="stageId"
).rename(columns={"speakerId": "3b"})

In [32]:
# Cumulative grouping may only lengthen a conversation, never shorten it
assert not (case3ab_orig_comparison["3a"] < case3ab_orig_comparison["orig"]).any()
assert not (case3ab_orig_comparison["3b"] < case3ab_orig_comparison["orig"]).any()

# Restricting to within_task (3b) can never yield more rows than not restricting (3a)
assert not (case3ab_orig_comparison["3b"] > case3ab_orig_comparison["3a"]).any()

In [30]:
# Stages whose 3a conversation holds more rows than the stage had in the original data
case3ab_orig_comparison.loc[case3ab_orig_comparison["3a"].gt(case3ab_orig_comparison["orig"])]

Unnamed: 0,stageId,3b,3a,orig
4,2miEeHyhqbADjavqY,3,3,1
9,3KJZW48dsgsmmgFBt,1,3,1
11,3cbhbgcm4TXNg52cv,3,3,2
20,5KnwtyEYMEj2gTPpc,3,3,2
25,6Dhd7u2SdjnkisPAu,3,3,2
26,6STNHx8ibjtjr9tP5,3,3,2
27,6awahG5Rr6cdGDtbz,3,3,1
31,7rS2GuJFNHPGWatr3,2,2,1
35,98Qj2otg5YARvkRrX,3,3,2
36,9AfsZa9t83f8vHkLR,1,3,1


In [14]:
# Raw input rows belonging to one example stage
input_data.loc[input_data["stageId"].eq("yZEpzgxumLFMEaY4T")]

Unnamed: 0,stageId,roundId,gameId,text,speakerId,time,task,complexity,playerCount,score,...,income_min_nanmean,income_min_nanstd,marital_status_nanmean,marital_status_nanstd,political_fiscal_nanmean,political_fiscal_nanstd,political_party_nanmean,political_party_nanstd,political_social_nanmean,political_social_nanstd
246,yZEpzgxumLFMEaY4T,327x9knxnsj4aaPjD,mjehBbrezMnafKtPB,goat on one side and just approve,6gDK5p7bsPRf68Z2L,2023-09-06T20:24:45.733Z,Wolf Goat Cabbage,Low,3,0.0,...,15000.333333,18028.311078,1.333333,2.309401,0.433333,0.057735,0.0,0.0,0.377778,0.03849


In [15]:
# Rows that Case 3a assigned to the conversation named after that same stage
case3a_chatdf.loc[case3a_chatdf["conversation_num"].eq("yZEpzgxumLFMEaY4T")]

Unnamed: 0,stageId,roundId,gameId,text,speakerId,time,task,complexity,playerCount,score,...,num_reddit_users,num_emphasis,num_bullet_points,num_numbered_points,num_line_breaks,num_quotes,num_block_quote_responses,num_ellipses,num_parentheses,num_emoji
292,AdTkoEzcrhAW2kSKN,erxikJNPiBJorke9k,mjehBbrezMnafKtPB,yes,E2sMtM8sH5BzyEa36,2023-09-06 19:51:15.244000+00:00,Guess the Correlation,Low,3,95.0,...,0,0,0,0,1,0,0,0,0,0
293,AdTkoEzcrhAW2kSKN,erxikJNPiBJorke9k,mjehBbrezMnafKtPB,this one close,E2sMtM8sH5BzyEa36,2023-09-06 19:51:56.312000+00:00,Guess the Correlation,Low,3,95.0,...,0,0,0,0,1,0,0,0,0,0
294,yZEpzgxumLFMEaY4T,327x9knxnsj4aaPjD,mjehBbrezMnafKtPB,goat on one side and just approve,6gDK5p7bsPRf68Z2L,2023-09-06 20:24:45.733000+00:00,Wolf Goat Cabbage,Low,3,0.0,...,0,0,0,0,1,0,0,0,0,0


Assert that the within_task flag is working

In [42]:
# Every conversation in Case 3b must contain chats from exactly one task.
# A single groupby replaces re-filtering the whole frame once per conversation.
# dropna=False keeps missing ids/tasks in the count rather than silently skipping them.
tasks_per_conversation = (
    case3b_chatdf.groupby("conversation_num", dropna=False)["task"].nunique(dropna=False)
)
mixed_task_conversations = tasks_per_conversation[tasks_per_conversation != 1]

# Name the offending conversations so a failure can be investigated directly.
assert mixed_task_conversations.empty, (
    "conversations that do not contain exactly one task: "
    f"{mixed_task_conversations.index.tolist()}"
)

Look at Case 3c

In [50]:
# 3c: rows per conversation, relabelled so the key lines up with the input's roundId
case_3c_rowcounts = (
    case3c_chatdf.groupby("conversation_num")["speakerId"]
    .count()
    .reset_index()
    .rename(columns={"conversation_num": "roundId"})
)

# Raw input: non-null speakerId entries per roundId
input_data_rowcounts_by_roundId = input_data.groupby("roundId")["speakerId"].count().reset_index()

# 3c next to the original counts; the suffixed count columns get readable names
case3c_orig_comparison = pd.merge(
    case_3c_rowcounts, input_data_rowcounts_by_roundId, how="inner", on="roundId"
).rename(columns={"speakerId_x": "3c", "speakerId_y": "orig"})

# The output must have been grouped by roundId (the mid-level grouper):
# both sides need the same number of distinct roundId values.
# dropna=False counts a missing value as one distinct entry, as unique() does.
assert case_3c_rowcounts["roundId"].nunique(dropna=False) == input_data["roundId"].nunique(dropna=False)

In [53]:
# Cumulative grouping may only lengthen a conversation, never shorten it
assert not case3c_orig_comparison["3c"].lt(case3c_orig_comparison["orig"]).any()

# Rounds where the cumulative conversation actually picked up rows from before
case3c_orig_comparison.loc[case3c_orig_comparison["3c"].gt(case3c_orig_comparison["orig"])]

Unnamed: 0,roundId,3c,orig
8,327x9knxnsj4aaPjD,3,1
14,3ru6CFsBtsCXGjHEA,3,1
23,4pcwMy5zkgvG45Dfo,3,1
27,5e8D85icsctQgC2xj,3,1
34,7vur2erkL4Pn7pe4C,3,1
39,8wwztN3irnpuroHGF,2,1
81,LTkkZ3dJv8Bq9rC7u,2,1
85,Mi8zemJnNPZkJQe28,3,2
86,Mm9GKS2BR6W35h9FC,3,2
93,SG9ZELW2qakLMFzhi,2,1


In [55]:
# Rows that Case 3c assigned to one example round-level conversation
case3c_chatdf.loc[case3c_chatdf["conversation_num"].eq("327x9knxnsj4aaPjD")]

Unnamed: 0,stageId,roundId,gameId,text,speakerId,time,task,complexity,playerCount,score,...,num_reddit_users,num_emphasis,num_bullet_points,num_numbered_points,num_line_breaks,num_quotes,num_block_quote_responses,num_ellipses,num_parentheses,num_emoji
265,AdTkoEzcrhAW2kSKN,erxikJNPiBJorke9k,mjehBbrezMnafKtPB,yes,E2sMtM8sH5BzyEa36,2023-09-06 19:51:15.244000+00:00,Guess the Correlation,Low,3,95.0,...,0,0,0,0,1,0,0,0,0,0
266,AdTkoEzcrhAW2kSKN,erxikJNPiBJorke9k,mjehBbrezMnafKtPB,this one close,E2sMtM8sH5BzyEa36,2023-09-06 19:51:56.312000+00:00,Guess the Correlation,Low,3,95.0,...,0,0,0,0,1,0,0,0,0,0
267,yZEpzgxumLFMEaY4T,327x9knxnsj4aaPjD,mjehBbrezMnafKtPB,goat on one side and just approve,6gDK5p7bsPRf68Z2L,2023-09-06 20:24:45.733000+00:00,Wolf Goat Cabbage,Low,3,0.0,...,0,0,0,0,1,0,0,0,0,0


Assert that we treat the improper case exactly as we treat Case 2

In [41]:
# The improperly specified run must come out like Case 2:
# same frame dimensions and an identical conversation_num column.
assert case2_chatdf.shape == impropercase_chatdf.shape
assert impropercase_chatdf["conversation_num"].equals(case2_chatdf["conversation_num"])