In [2]:
import pandas as pd
from pathlib import Path
from pprint import pprint
from parse_technical_terms import get_structured_response

In [3]:
# group_rows (below) merges consecutive transcript rows into larger chunks.
# It expects a DataFrame with 'timestamp' and 'text' columns; load one first if needed, e.g.:
# df = pd.read_csv('your_file.csv')

def group_rows(df, group_size=5):
    """Merge consecutive rows of a transcript into fixed-size chunks.

    Rows are chunked by position (the first ``group_size`` rows, then the
    next ``group_size`` rows, ...), so the result does not depend on the
    labels in ``df.index``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'timestamp' and 'text' columns.
    group_size : int, default 5
        Number of consecutive rows per chunk; the last chunk may be shorter.

    Returns
    -------
    pandas.DataFrame
        One row per chunk: 'timestamp' is "<first>-<last>" of the chunk and
        'text' is the chunk's text values joined with single spaces.

    Raises
    ------
    ValueError
        If ``group_size`` is less than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size!r}")

    # Positional chunk ids (0, 0, 0, 0, 0, 1, 1, ...). Built from the row
    # count rather than df.index so filtered or re-labelled frames are still
    # chunked correctly, and kept out of df so the caller's frame is unchanged.
    chunk_ids = pd.RangeIndex(len(df)).to_numpy() // group_size

    grouped = df.groupby(chunk_ids).agg({
        'timestamp': lambda x: f"{x.iloc[0]}-{x.iloc[-1]}",  # first to last timestamp
        'text': ' '.join,  # join all text with spaces
    }).reset_index(drop=True)

    return grouped

In [4]:
# Transcript CSV lives in ./data relative to the directory the notebook runs from.
data_path = Path.cwd().joinpath('data', 'transcript.csv')

In [5]:
# Sanity check: confirm the transcript file is present before trying to read it.
data_path.exists()

True

In [6]:
# Hand the path straight to pandas with an explicit encoding. Opening the file
# ourselves in text mode decodes it with the platform's locale encoding, which
# is not guaranteed to be UTF-8.
df = pd.read_csv(data_path, encoding='utf-8')

In [7]:
# Preview the first rows of the loaded transcript (timestamp + text fragment per row).
df.head()

Unnamed: 0,timestamp,text
0,0:00,what is your main bone of contention with Eric...
1,0:06,nothing but civil he's also extremely nasty so...
2,0:15,explanation for three generations of flavored ...
3,0:20,SU3 plus SU2 SU1 i have read Eric's paper here...
4,0:26,it's worse than you would think how dare you h...


In [8]:
# Collapse the transcript into chunks of five consecutive rows.
grouped = group_rows(df, group_size=5)

In [9]:
# Peek at the first ten text fragments, untruncated.
df['text'].iloc[:10].values

array(['what is your main bone of contention with Eric Weinstein uh you should probably ask Eric that sean has been',
       "nothing but civil he's also extremely nasty so let me say a bunch of things to Dr carol dr carol I'd like to hear your",
       'explanation for three generations of flavored chyro firmians with the observed quantum numbers under the group',
       "SU3 plus SU2 SU1 i have read Eric's paper here it is i actually have it here",
       "it's worse than you would think how dare you how dare I read your paper i highly advise you to spend more time in your",
       "physics department and less time on YouTube so we're not allowed to think about Eric's theory and write a",
       "follow-up paper about it and although you're very much Sean I have my disagreements with string theorists my",
       "agreements with them but I respect it and I think that they're trying their best the first rule of physics fight",
       "club is don't talk about the problems with physics

In [10]:
# Same ten fragments stitched into one string, to see what a merged chunk reads like.
' '.join(df['text'].iloc[:10].tolist())

"what is your main bone of contention with Eric Weinstein uh you should probably ask Eric that sean has been nothing but civil he's also extremely nasty so let me say a bunch of things to Dr carol dr carol I'd like to hear your explanation for three generations of flavored chyro firmians with the observed quantum numbers under the group SU3 plus SU2 SU1 i have read Eric's paper here it is i actually have it here it's worse than you would think how dare you how dare I read your paper i highly advise you to spend more time in your physics department and less time on YouTube so we're not allowed to think about Eric's theory and write a follow-up paper about it and although you're very much Sean I have my disagreements with string theorists my agreements with them but I respect it and I think that they're trying their best the first rule of physics fight club is don't talk about the problems with physics fight club i have found myself in the awkward and unenviable position of defending the

In [16]:
topics = []

# One get_structured_response call per grouped transcript chunk. Each result is
# dumped to a plain dict (assumes a pydantic-style .model_dump() — confirm in
# parse_technical_terms) and tagged with the chunk's text and timestamp range
# so the extracted concepts can be traced back to the transcript.
for row in grouped.itertuples(index=False):
    row_topics = get_structured_response(text_to_analyze=row.text)
    topics.append({
        **row_topics.model_dump(),
        'text': row.text,
        'timestamp': row.timestamp,
    })

In [17]:
# Inspect the first record: the model's 'response' list plus the chunk's text and timestamp.
topics[0]

{'response': [{'concept_name': 'SU3 group',
   'explanation_of_concept': 'A Lie group used in particle physics to describe the strong nuclear force, specifically the symmetry of quarks and gluons.',
   'education_level': 'Graduate'},
  {'concept_name': 'SU2 SU1 groups',
   'explanation_of_concept': 'Subgroups of the SU3 group, describing the electroweak force and the strong nuclear force respectively.',
   'education_level': 'Graduate'},
  {'concept_name': 'Quantum numbers',
   'explanation_of_concept': 'Properties used to describe the behavior of subatomic particles, such as spin, charge, and isospin.',
   'education_level': 'Undergraduate'}],
 'text': "what is your main bone of contention with Eric Weinstein uh you should probably ask Eric that sean has been nothing but civil he's also extremely nasty so let me say a bunch of things to Dr carol dr carol I'd like to hear your explanation for three generations of flavored chyro firmians with the observed quantum numbers under the group

In [25]:
# One row per transcript chunk; 'response' still holds the list of concept dicts.
topics_df = pd.DataFrame(topics)

In [28]:
# Preview the per-chunk frame before the 'response' lists are expanded.
topics_df.head()

Unnamed: 0,response,text,timestamp
0,"[{'concept_name': 'SU3 group', 'explanation_of...",what is your main bone of contention with Eric...,0:00-0:26
1,"[{'concept_name': 'String Theory', 'explanatio...",physics department and less time on YouTube so...,0:32-0:56
2,"[{'concept_name': 'Materialism vs. Idealism', ...",uh aspect reminds me of you as the Maran Twine...,1:02-1:28
3,"[{'concept_name': 'Theory of Relativity', 'exp...",measured and predicted and then came Einstein ...,1:33-1:59
4,"[{'concept_name': 'Simulation Hypothesis', 'ex...",universes do we live in a matrix style simulat...,2:04-2:30


In [37]:
# Expand to one row per extracted concept. drop=True discards the old row
# labels instead of inserting them as an 'index' column: that column is never
# used afterwards, and inserting it makes an immediate re-run of this cell fail
# because 'index' already exists.
topics_df = topics_df.explode("response").reset_index(drop=True)

In [43]:
# Split each concept dict in 'response' into its own columns and place them
# next to the chunk's text and timestamp.
chunk_columns = topics_df[["text", "timestamp"]]
concept_columns = topics_df["response"].apply(pd.Series)
topics_df = pd.concat([chunk_columns, concept_columns], axis=1)

In [46]:
# Remove the stray column labelled 0 left over from expanding 'response'
# (presumably produced by rows whose response was missing rather than a dict —
# verify). errors='ignore' keeps this cell from raising KeyError when no such
# column exists, e.g. when every chunk returned concepts or the cell is re-run.
topics_df = topics_df.drop(columns=[0], errors='ignore')

In [47]:
# Preview the flattened frame: one row per concept with its source text and timestamp range.
topics_df.head()

Unnamed: 0,text,timestamp,concept_name,explanation_of_concept,education_level
0,what is your main bone of contention with Eric...,0:00-0:26,SU3 group,A Lie group used in particle physics to descri...,Graduate
1,what is your main bone of contention with Eric...,0:00-0:26,SU2 SU1 groups,"Subgroups of the SU3 group, describing the ele...",Graduate
2,what is your main bone of contention with Eric...,0:00-0:26,Quantum numbers,Properties used to describe the behavior of su...,Undergraduate
3,physics department and less time on YouTube so...,0:32-0:56,String Theory,A theoretical framework in physics that attemp...,Undergraduate
4,physics department and less time on YouTube so...,0:32-0:56,General Relativity,A theory of gravitation developed by Albert Ei...,High School


In [48]:
# Distribution of the education level assigned to each concept.
# NOTE(review): the recorded output contains compound labels such as
# 'High School/Undergraduate' and 'High School, Undergraduate'; consider
# normalising these before using the column for grouping or filtering.
topics_df['education_level'].value_counts()

education_level
Undergraduate                 112
Graduate                       77
High School                    69
High School/Undergraduate       2
Undergraduate/Graduate          2
Graduate/Professional           1
High School, Undergraduate      1
Name: count, dtype: int64

In [51]:
# Write the processed table next to the input transcript.
out_path = data_path.with_name('processed_topics.csv')

In [52]:
# Let pandas open the file itself. A handle opened in text mode without
# newline='' can produce doubled line endings on Windows, and without an
# explicit encoding the output depends on the platform locale.
topics_df.to_csv(out_path, index=False, encoding='utf-8')