# Time Chunking Utility Development Python Notebook

In [88]:
import pandas as pd
import numpy as np


In [89]:
juries = pd.read_csv('../data/raw_data/juries_tiny_for_testing.csv')
juries

Unnamed: 0,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes
0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,Hello!,1.0,1,0.333333,3
1,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1.0,1,0.333333,3
2,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,Hello,1.0,1,0.333333,3
3,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,Hi,1.0,1,0.333333,3
4,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3
...,...,...,...,...,...,...,...,...,...,...
92,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:02:55.111Z,I say asshole under stress,0.6,0,0.000000,5
93,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20T19:03:21.819Z,"Yes, she is the asshole... unfortunately. Husb...",0.6,0,0.000000,5
94,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20T19:03:36.308Z,I think she is being presumptuous and acting l...,0.6,0,0.000000,5
95,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20T19:03:53.219Z,"Tha's true, she ins't considering her husband ...",0.6,0,0.000000,5


In [90]:
def create_chunks(df,num_chunks):

    #check if there are timestamps
    final_df = pd.DataFrame(columns=df.columns)

    for index, conv in df.groupby(['batch_num', 'round_num']):

        # Convert timestamp column to DateTime format
        conv['timestamp'] = pd.to_datetime(conv['timestamp'])

        # Calculate the total duration of the conversation
        total_duration = (conv['timestamp'].max() - conv['timestamp'].min()).total_seconds()
        # total_duration = int(df['duration'][0])

        # Calculate the duration of each chunk
        chunk_duration = total_duration / num_chunks

        if chunk_duration == 0:
            chunk_duration = 1

        # Add a new column for chunk number
        conv['chunk'] = -1 

        # Assign the chunk number for each row
        for index, row in conv.iterrows():
            #get the timestamp 
            timestamp = row['timestamp']

            #calculate the chunk number
            chunk_number = int(((timestamp - conv['timestamp'].min())).total_seconds() / chunk_duration)

            #restrict the range of the chunks from 0 to num_chunks - 1
            if chunk_number >= num_chunks:
                conv.at[index, 'chunk'] = num_chunks - 1
            else:
                conv.at[index, 'chunk'] = chunk_number
        final_df = pd.concat([final_df, conv], ignore_index = True)
    
    return final_df


In [91]:
# final_df = pd.DataFrame(columns=juries.columns)

# for index, conv in juries.groupby(['batch_num', 'round_num']):
#     create_chunks(conv, 3)
#     final_df = pd.concat([final_df, conv], ignore_index = True)

In [92]:
df = create_chunks(juries, 3)
df

Unnamed: 0,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,chunk
0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20 18:27:20.125000+00:00,Hello!,1.0,1,0.333333,3,0.0
1,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20 18:27:23.764000+00:00,Hi!,1.0,1,0.333333,3,0.0
2,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20 18:27:27.724000+00:00,Hello,1.0,1,0.333333,3,0.0
3,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20 18:27:30.410000+00:00,Hi,1.0,1,0.333333,3,0.0
4,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20 18:27:35.506000+00:00,hi,1.0,1,0.333333,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...
92,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20 19:02:55.111000+00:00,I say asshole under stress,0.6,0,0.000000,5,2.0
93,0,2,5d6feec65f80ae21f5c5f054,conventionalMonkey,2020-04-20 19:03:21.819000+00:00,"Yes, she is the asshole... unfortunately. Husb...",0.6,0,0.000000,5,2.0
94,0,2,5d482ea421c9be351f762255,newPanda,2020-04-20 19:03:36.308000+00:00,I think she is being presumptuous and acting l...,0.6,0,0.000000,5,2.0
95,0,2,5e7e4f4c31f4e454e196c9c4,newLion,2020-04-20 19:03:53.219000+00:00,"Tha's true, she ins't considering her husband ...",0.6,0,0.000000,5,2.0


In [93]:
df.loc[(df['batch_num'] == 0) & (df['round_num'] == 0)]

Unnamed: 0,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes,chunk
0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20 18:27:20.125000+00:00,Hello!,1.0,1,0.333333,3,0.0
1,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20 18:27:23.764000+00:00,Hi!,1.0,1,0.333333,3,0.0
2,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20 18:27:27.724000+00:00,Hello,1.0,1,0.333333,3,0.0
3,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20 18:27:30.410000+00:00,Hi,1.0,1,0.333333,3,0.0
4,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20 18:27:35.506000+00:00,hi,1.0,1,0.333333,3,0.0
5,0,0,5d482ea621c9be351f762ae5,culturedBear,2020-04-20 18:28:09.451000+00:00,hello,1.0,1,0.333333,3,0.0
6,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20 18:28:20.136000+00:00,I don't think the guy is the a$$hole. Thoughts?,1.0,1,0.333333,3,0.0
7,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20 18:28:28.501000+00:00,So who thinks the guy is an ass for asking his...,1.0,1,0.333333,3,0.0
8,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20 18:28:30.530000+00:00,I think that this person is not an asshole bec...,1.0,1,0.333333,3,0.0
9,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20 18:28:30.637000+00:00,I can see how the family is upset because they...,1.0,1,0.333333,3,0.0
