Data Analysis For NCTE

In [11]:
import pandas as pd
import os

In [6]:
ncte_single_utterances = pd.read_csv('data/ncte_single_utterances.csv')  # ['speaker', 'text', 'year', 'OBSID', 'video_id', 'cleaned_text', 'num_words', 'turn_idx', 'comb_idx']
paired = pd.read_csv('data/paired_annotations.csv') # ['exchange_idx', 'OBSID', 'student_text', 'teacher_text', 'student_on_task', 'teacher_on_task', 'high_uptake', 'focusing_question']
reasoning = pd.read_csv('data/student_reasoning.csv') #['comb_idx', 'OBSID', 'NCTETID', 'text', 'student_reasoning', 'annotator_comment']

# This code will take a df that contains all the utterances from the same classroom session, sort the utterances by a column indicating utterance order, and print each utterance as: [speaker_name]: [utterance]\n.
def construct_transcript_from_df(transcript_df, sort_by, speaker_column, text_column, print_transcript=False):
    transcript_text = ""
    # Sort the transcript by the sort_by column
    transcript_df = transcript_df.sort_values(by=sort_by)
    # Add to the transcript_text as `speaker: text` for each row in the dataframe
    for index, row in transcript_df.iterrows():
        transcript_text += row[speaker_column] + ": " + row[text_column] + "\n"
    # Print the transcript text if print_transcript is True
    if print_transcript:
        print(transcript_text)
    return transcript_text

transcript_id = sorted(list(ncte_single_utterances['OBSID'].unique()))[0] # 3transcript_id = sorted(list(ncte_single_utterances['OBSID'].unique()))[0] # 3 

transcript_df = ncte_single_utterances[ncte_single_utterances.OBSID==transcript_id]
transcript_text = construct_transcript_from_df(
    transcript_df=transcript_df,
    sort_by='turn_idx',
    speaker_column='speaker',
    text_column='text',
    print_transcript=False
)

print(f"Transcript:\n{transcript_text}\n")

Transcript:
teacher: Okay.  Math should be out.  Everything else, please put away. I need Student E, Student H over here.  Student B, on the floor.  You can have your math packets.  And I need Student J over here and Student O over here.  [Inaudible.] All right.    [Inaudible] no glasses.  Student D, I need you to completely turn around a little since you’re not used to sitting there like that, in that seat.  Okay. Let’s turn to page 53.  Today we’re going to work on seeing what happens, determine the effect.  See what happens to the product when a factor is doubled or it’s halved.  We’re going to talk some more about that like we did last week. And we’re going to develop more strategies for multiplying that involves breaking numbers [inaudible].  When we do that, of course we’re always going to be looking at demonstrating, explaining your strategies for when you’re multiplying or dividing.  Okay.  So math, page 53.  We have this problem on that page.  16 times 3 and 16 times 6. Let’s 

In [34]:
# Before performing any analysis on the transcript text, we perform some analysis based on the column information already provided in the dataframe! This is good practice to get a sense of numbers and scale in a new dataset.

# For example, we can look at the number of total transcripts, and number of transcripts per school year..
# Count total number of transcripts

import matplotlib.pyplot as plt
total_transcripts = len(ncte_single_utterances['OBSID'].unique())
print(f"Total number of transcripts: {total_transcripts}")
# Count number of unique `OBSID`s per `year`
transcripts_per_year = ncte_single_utterances.groupby('year')['OBSID'].nunique()
print(f"Transcripts per year:\n{transcripts_per_year}")
# Average number of utterance turns `turn_idx` across transcripts `OBSID`
avg_turns = ncte_single_utterances.groupby('OBSID')['turn_idx'].max().mean()
print(f"Average number of utterance turns across transcripts: {avg_turns:.2f}")



# Average number of utterance turns `turn_idx` per speaker `speaker` across all transcripts `OBSID`:
unique_speakers = ncte_single_utterances['speaker'].unique()
for speaker in unique_speakers:
    if pd.isnull(speaker):
        continue
    speaker_df = ncte_single_utterances[ncte_single_utterances['speaker'] == speaker]
    # Count number of turns per transcript
    turns_df = speaker_df.groupby('OBSID').size()
    # Average the number of turns across transcripts
    avg_turns_per_speaker = turns_df.mean()
    print(f"Average # turns for speaker {speaker}: {avg_turns_per_speaker}, percentage of turns: {round(avg_turns_per_speaker/avg_turns * 100, 2)} %")


avg_obs_words = ncte_single_utterances.groupby('OBSID')['num_words'].sum().mean()
print(f"Average number of words across transcripts: {avg_words:.2f}")



# Average number of words per speaker
for speaker in unique_speakers:
    if pd.isnull(speaker):
        continue
    speaker_df = ncte_single_utterances[ncte_single_utterances['speaker'] == speaker]
    turns_df = speaker_df.groupby('OBSID')['num_words'].sum()
    print(f"speaker: {speaker}, average words: {turns_df.mean()}, percentage of words: {round(turns_df.mean()/avg_obs_words * 100, 2)} %")
    avg_words = speaker_df['num_words'].mean()
    print(f"Average # words for {speaker}: {avg_words:.2f}")


Total number of transcripts: 1660
Transcripts per year:
year
1    697
2    616
3    347
Name: OBSID, dtype: int64
Average number of utterance turns across transcripts: 348.64
Average # turns for speaker teacher: 172.62710843373495, percentage of turns: 49.51 %
Average # turns for speaker student: 154.05431502715751, percentage of turns: 44.19 %
Average # turns for speaker multiple students: 26.52407152682256, percentage of turns: 7.61 %
Average number of words across transcripts: 1.61
speaker: teacher, average words: 5025.433132530121, percentage of words: 87.64 %
Average # words for teacher: 29.11
speaker: student, average words: 672.4315027157513, percentage of words: 11.73 %
Average # words for student: 4.36
speaker: multiple students, average words: 42.59697386519945, percentage of words: 0.74 %
Average # words for multiple students: 1.61


Things to think about:

The "average # turns" cell shows that the teacher and students have similar number of turns across transcripts, i.e., ~239 teacher utterances compared to ~211 student utterances.
However, the average # of words indicates a noticeable difference in their utterance lengths, i.e., ~29 words in the teachers' utterances compared to ~4 words in the students' utterances.


Manually checking the (quality of the) text
Before even running any kind of "smart" analysis methods on the text, it's always good to manually look at examples. Previously we printed out an example of a transcript. Here, we are going to print out a few examples of the teacher utterances and student utterances.


In [9]:
# Helpful utility function!
def print_line_separated(df, speaker_column, text_column):
    for speaker, text in zip(df[speaker_column], df[text_column]):
        print(f"{speaker}: {text}")
NUM_SAMPLES = 15
# Seed the random number generator to get the same results
import numpy as np
np.random.seed(42)

# Get NUM_SAMPLES of the teacher utterances
teacher_df = ncte_single_utterances[ncte_single_utterances['speaker'] == 'teacher'].sample(NUM_SAMPLES)
print("Teacher utterances:")
print_line_separated(df=teacher_df, speaker_column='speaker', text_column='text')
print()

# Get NUM_SAMPLES of the student utterances
student_df = ncte_single_utterances[ncte_single_utterances['speaker'] == 'student'].sample(NUM_SAMPLES)
print("Student utterances:")
print_line_separated(df=student_df, speaker_column='speaker', text_column='text')

Teacher utterances:
teacher: Okay, I can double it by doing 15 times 4?  Does everybody agree with that?  So this is doubling it?
teacher: Okay, go get paper, pencils.  This group, you're gonna work over here together.
teacher: How they've shown it on here, every part of what they did they're able to show on here and you're able to see, right. That's the way it needs to be when you're showing your work. Let's see.  Student A, come here.  You can come up.  Can you tell us how this side works?
teacher: Because that’s not really modeling the tenth.  It’s all got to be [inaudible].  Nope.
teacher: Student B's answering.
teacher: All your pages are filled up?
teacher: I love what you just said.  Did everyone hear that?
teacher: Keep going.
teacher: Add a 0.  Now let's look at the numbers.  The percent – does that make sense?  Are these two numbers pretty close to each other?
teacher: To ten, right?
teacher: That’s a good strategy.  So you’re imagining – one second. You’re imagining the two 

It's always good to make observations about the type of texts or patterns in the data.

Things to think about: Manually looking at the teacher utterances, we notice a few things.

1. Some of the utterances are math content related (e.g., "Add a 0. Now let's look at the numbers. The percent – does that make sense? Are these two numbers pretty close to each other?") and others are not (e.g., "Okay, go get paper, pencils. This group, you're gonna work over here together.")
2. Some of the utterances are about class management such as calling everyone's attention, "I love what you just said. Did everyone hear that?"
3. Some of the utterances are about supporting the student's thinking e.g., "Okay, I can double it by doing 15 times 4? Does everybody agree with that? So this is doubling it?"


Things to think about: Manually looking at the student utterances, we notice a few things.

1. The utterances are shorter than the teacher's utterances.
2. The utterances seem to be short answers to questions e.g., "14.", "Yes.", "Subtract."

# Exploring the metadata

Now let's switch gears to explore the metadata folder data/ICPSR_36095! This folder contains many subfolders DS00##, each of which contains a tsv file and PDF documenting the contents of the tsv file. Each subfolder contains different types of metadata. For example, DS0001 contains the metadata and documentation for class observations, a rubric on the teacher's class management and behavior. Or, DS0006 contains metadata and documentation on a teacher background questionnaire.

In this section we're going to use DS0006 and compute the general statistics on the teacher's questionnaire metadata.

In particular, we are going to determine:

Number of teachers
- % Male
- % Black
- % Asian
- % Hispanic
- % White
- Avg number of years of teaching experience
- % BA in education?

In [22]:
# Load metadata
fpath = os.path.join('/Users/mobvoi/Downloads/ICPSR_36095/DS0006/36095-0006-Data.tsv')
teacher_metadata = pd.read_csv(fpath, sep='\t')

# Number of teachers
num_teachers = teacher_metadata['NCTETID'].nunique()
print(f"Number of teachers = {num_teachers}")

# Percentage Male
# Drop non 0/1 values
teacher_metadata = teacher_metadata[teacher_metadata['MALE'].isin([0,1])]
num_male = teacher_metadata['MALE'].sum()
perc_male = ( num_male / num_teachers ) * 100
# Round
perc_male = round(perc_male, 2)
print(f"Percentage male = {perc_male}")

# Percentage Black
num_black = teacher_metadata['BLACK'].sum()
perc_black = ( num_black / num_teachers ) * 100
perc_black = round(perc_black, 2)
print(f"Percentage black = {perc_black}")

# Percentage Asian
num_asian = teacher_metadata['ASIAN'].sum()
perc_asian = ( num_asian / num_teachers ) * 100
perc_asian = round(perc_asian, 2)
print(f"Percentage asian = {perc_asian}")

# Percentage Hispanic
num_hispanic = teacher_metadata['HISP'].sum()
perc_hispanic = ( num_hispanic / num_teachers ) * 100
perc_hispanic = round(perc_hispanic, 2)
print(f"Percentage hispanic = {perc_hispanic}")

# Percentage White
num_white = teacher_metadata['WHITE'].sum()
perc_white = ( num_white / num_teachers ) * 100
perc_white = round(perc_white, 2)
print(f"Percentage white = {perc_white}")

# Average number of years of teaching experience
# Drop empty strings and ' '
teacher_metadata = teacher_metadata[teacher_metadata['EXPERIENCE'] != ' ']
# Make sure it's a float
teacher_metadata['EXPERIENCE'] = teacher_metadata['EXPERIENCE'].astype(float)
avg_years = teacher_metadata['EXPERIENCE'].mean()
avg_years = round(avg_years, 2)
print(f"Average number of years of teaching experience = {avg_years}")

# BA in education
num_ba = teacher_metadata['EDBACHELORS'].sum()
perc_ba = ( num_ba / num_teachers ) * 100
perc_ba = round(perc_ba, 2)
print(f"Percentage with BA in education = {perc_ba}")


# MS in education
num_ms = teacher_metadata['MASTERS'].sum()
perc_ms = ( num_ms / num_teachers ) * 100
perc_ms = round(perc_ms, 2)
print(f"Percentage with MS in education = {perc_ms}")


Number of teachers = 313
Percentage male = 16.29
Percentage black = 22.04
Percentage asian = 2.56
Percentage hispanic = 2.88
Percentage white = 64.22
Average number of years of teaching experience = 10.09
Percentage with BA in education = 52.08
Percentage with MS in education = 74.12


In [21]:
# Get the dataset of D5, 
ds5_file="/Users/mobvoi/Downloads/ICPSR_36095/DS0005/36095-0005-Data.tsv"
import pandas as pd
student_metadata = pd.read_csv(ds5_file, sep='\t', low_memory=False)


# Number of students
num_students = student_metadata['NCTESID'].nunique()
print(f"Number of unique students = {num_students}")
# TEST_GRADE precentage
for grade in student_metadata['TEST_GRADE'].unique():
    # print(student_metadata[student_metadata['TEST_GRADE'] == grade].shape[0])
    print(f"Percentage of {grade} = {student_metadata[student_metadata['TEST_GRADE'] == grade].shape[0] / len(student_metadata) * 100}")

# Percentage Male
# Drop non 0/1 values
# print(student_metadata['S_MALE'].value_counts())
student_metadata =  student_metadata[student_metadata['S_MALE'] != ' ']
student_metadata['S_MALE'] = student_metadata['S_MALE'].astype(int)
student_metadata = student_metadata[student_metadata['S_MALE'].isin([0,1])]
print(f'Value counts of student_metadata = {len(student_metadata)}')
num_male = student_metadata['S_MALE'].sum()
perc_male = ( num_male / len(student_metadata) ) * 100
# Round
perc_male = round(perc_male, 2)
print(f"Percentage male = {perc_male}")

# Percentage African American
student_metadata['S_AFAM'] = student_metadata['S_AFAM'].astype(int)
num_african_american = student_metadata['S_AFAM'].sum()
perc_african_american = ( num_african_american / len(student_metadata) ) * 100
perc_african_american = round(perc_african_american, 2)
print(f"Percentage african american = {perc_african_american}")

# Percentage Asian
student_metadata['S_ASIAN'] = student_metadata['S_ASIAN'].astype(int)
num_asian = student_metadata['S_ASIAN'].sum()
perc_asian = ( num_asian / len(student_metadata) ) * 100
perc_asian = round(perc_asian, 2)
print(f"Percentage asian = {perc_asian}")

# Percentage Hispanic/Latinx
student_metadata['S_HISP'] = student_metadata['S_HISP'].astype(int)
num_hispanic = student_metadata['S_HISP'].sum()
perc_hispanic = ( num_hispanic / len(student_metadata) ) * 100
perc_hispanic = round(perc_hispanic, 2)
print(f"Percentage hispanic = {perc_hispanic}")

# Percentage White
student_metadata['S_WHITE'] = student_metadata['S_WHITE'].astype(int)
num_white = student_metadata['S_WHITE'].sum()
perc_white = ( num_white / len(student_metadata) ) * 100
perc_white = round(perc_white, 2)
print(f"Percentage white = {perc_white}")

# Percentage Free/Reduced Lunch
student_metadata =  student_metadata[student_metadata['S_FRPL'] != ' ']
print(f'Value counts of student_metadata = {len(student_metadata)}')

student_metadata['S_FRPL'] = student_metadata['S_FRPL'].astype(int)
num_free_reduced_lunch = student_metadata['S_FRPL'].sum()
perc_free_reduced_lunch = ( num_free_reduced_lunch / len(student_metadata) ) * 100
perc_free_reduced_lunch = round(perc_free_reduced_lunch, 2)
print(f"Percentage free/reduced lunch = {perc_free_reduced_lunch}")
# Percentage Special Education
student_metadata['S_SPED'] = student_metadata['S_SPED'].astype(int)
print(f'Value counts of student_metadata = {len(student_metadata)}')
print(student_metadata['S_SPED'].value_counts())
num_special_education = student_metadata['S_SPED'].sum()
perc_special_education = ( num_special_education / len(student_metadata) ) * 100
perc_special_education = round(perc_special_education, 2)
print(f"Percentage special education = {perc_special_education}")

# Percentage Limited English Proficiency
student_metadata['S_LEP'] = student_metadata['S_LEP'].astype(int)
num_limited_english_proficiency = student_metadata['S_LEP'].sum()
perc_limited_english_proficiency = ( num_limited_english_proficiency / len(student_metadata) ) * 100
perc_limited_english_proficiency = round(perc_limited_english_proficiency, 2)
print(f"Percentage limited english proficiency = {perc_limited_english_proficiency}")




Number of unique students = 10955
Percentage of 4 = 49.89337335123608
Percentage of   = 2.574836110891715
Percentage of 5 = 47.531790537872205
Value counts of student_metadata = 12535
Percentage male = 49.92
Percentage african american = 42.77
Percentage asian = 7.49
Percentage hispanic = 22.86
Percentage white = 22.9
Value counts of student_metadata = 12514
Percentage free/reduced lunch = 65.67
Value counts of student_metadata = 12514
S_SPED
0    10952
1     1562
Name: count, dtype: int64
Percentage special education = 12.48
Percentage limited english proficiency = 19.97
