<a href="https://colab.research.google.com/github/Selinalkan/GitPractice/blob/main/Collocations_UPDATED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports and Functions**

In [1]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.collocations import *
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
import spacy
import string
stop = stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

#function to clean and lemmatize the text
def clean_text(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [4]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in stop:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

# Interviewer with Context

In [5]:
#Importing ER_c_Transcript Dataset and dropping the Unnamed column
df = pd.read_csv("ER_c_transcript.csv").drop("Unnamed: 0", axis=1)

##########################################################################

#Preprocessing
#Sorting the data by "speaker_trustworthy" values
df_sorted1 = df.sort_values("speaker_trustworthy", ascending=False)
#Sorting the data by "speaker_trusts conversational partner" values
df_sorted2 = df.sort_values("speaker_trusts conversational partner", ascending=False)

#extract only ER_text1
#ER_text1 = df_sorted1['ER_text']
ER_text1 = df_sorted1['ER_text'][:10]
ER_text1 = ER_text1.astype('str')

#extract only ER_text2
ER_text2 = df_sorted2['ER_text'][:10]
ER_text2 = ER_text2.astype('str')

# ER_text1
#remove non-ascii characters
ER_text1 = ER_text1.map(lambda x: _removeNonAscii(x))

#drop duplicates
ER_text1.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
ER_lemmatized1 = ER_text1.map(clean_text)

#turn all comments' tokens into one single list
unlist_ER1 = [item for items in ER_lemmatized1 for item in items]

##########################################################################

#ER_text2
#remove non-ascii characters
ER_text2 = ER_text2.map(lambda x: _removeNonAscii(x))

#drop duplicates
ER_text2.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
ER_lemmatized2 = ER_text2.map(clean_text)

#turn all comments' tokens into one single list
unlist_ER2 = [item for items in ER_lemmatized2 for item in items]

### **"speaker_trustworthy"**

**Bigrams**

In [6]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# ER w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_ER1)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_ER1 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_ER1_c = filtered_bi_ER1[:20].bigram.values

#print the filtered bigrams
filtered_bi_ER1.head(10)

Unnamed: 0,bigram,freq
323,"(AirB, B)",2
346,"(time, mitigate)",1
271,"(guest, arrival)",1
294,"(worth, thirty)",1
283,"(AirB, Bs)",1
286,"(New, Year)",1
305,"(last, week)",1
434,"(political, correctness)",1
411,"(show, call)",1
366,"(school, life)",1


**PMI**

In [7]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_ER1_c = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI


### **"speaker_trusts conversational partner"**

In [8]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# ER w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_ER2)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_ER2 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_ER2_c = filtered_bi_ER2[:20].bigram.values

#print the filtered bigrams
filtered_bi_ER2.head(10)

Unnamed: 0,bigram,freq
310,"(U, S)",2
303,"(Climate, Agreement)",2
100,"(AirB, B)",2
302,"(Paris, Climate)",2
357,"(minute, play)",1
366,"(good, thing)",1
239,"(regular, way)",1
307,"(Trump, pull)",1
306,"(President, Trump)",1
337,"(insane, breakthrough)",1


**PMI**

In [9]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_ER2_c = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI


# Interviewer without Context

In [10]:
#Importing ER_nc_Transcript Dataset and dropping the Unnamed column
df = pd.read_csv("ER_nc_transcript.csv").drop("Unnamed: 0", axis=1)

##########################################################################

#Preprocessing
#Sorting the data by "speaker_trustworthy" values
df_sorted1 = df.sort_values("speaker_trustworthy", ascending=False)
#Sorting the data by "speaker_trusts conversational partner" values
df_sorted2 = df.sort_values("speaker_trusts conversational partner", ascending=False)

#extract only ER_text1
#ER_text1 = df_sorted1['ER_text']
ER_text1 = df_sorted1['ER_text'][:10]
ER_text1 = ER_text1.astype('str')

#extract only ER_text2
ER_text2 = df_sorted2['ER_text'][:10]
ER_text2 = ER_text2.astype('str')

# ER_text1
#remove non-ascii characters
ER_text1 = ER_text1.map(lambda x: _removeNonAscii(x))

#drop duplicates
ER_text1.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
ER_lemmatized1 = ER_text1.map(clean_text)

#turn all comments' tokens into one single list
unlist_ER1 = [item for items in ER_lemmatized1 for item in items]

##########################################################################

#ER_text2
#remove non-ascii characters
ER_text2 = ER_text2.map(lambda x: _removeNonAscii(x))

#drop duplicates
ER_text2.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
ER_lemmatized2 = ER_text2.map(clean_text)

#turn all comments' tokens into one single list
unlist_ER2 = [item for items in ER_lemmatized2 for item in items]

### **"speaker trustworthy"**

**Bigrams**

In [11]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# ER w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_ER1)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_ER1 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_ER1_nc = filtered_bi_ER1[:20].bigram.values

#print the filtered bigrams
filtered_bi_ER1.head(10)

Unnamed: 0,bigram,freq
188,"(live, action)",3
306,"(overarch, narrative)",2
109,"(anime, series)",2
275,"(action, series)",2
518,"(carbon, dioxide)",1
519,"(dioxide, increase)",1
543,"(high, profile)",1
457,"(kool, aid)",1
495,"(brilliant, thinker)",1
487,"(Steve, Jobs)",1


**PMI**

In [12]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_ER1_nc = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"( , and)",2.281355
1,"( , I)",1.861024


###**"speaker_trusts conversational partner"**

In [13]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# ER w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_ER2)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_ER2 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_ER2_nc = filtered_bi_ER2[:20].bigram.values

#print the filtered bigrams
filtered_bi_ER2.head(10)

Unnamed: 0,bigram,freq
132,"(live, action)",3
51,"(anime, series)",2
254,"(overarch, narrative)",2
222,"(action, series)",2
523,"(show, call)",1
545,"(political, correctness)",1
473,"(time, mitigate)",1
462,"(make, sense)",1
452,"(traditional, bank)",1
492,"(school, life)",1


**PMI**

In [14]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_ER2_nc = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"( , so)",2.375867
1,"( , and)",2.233263
2,"( , I)",1.976771


# **Interviewee with Context**

In [15]:
#Importing EE_c_Transcript data set and dropping the Unnamed column
df = pd.read_csv("EE_c_transcript.csv").drop("Unnamed: 0", axis=1)

##########################################################################

#Preprocessing
#Sorting the data by "speaker_trustworthy" values
df_sorted1 = df.sort_values("speaker_trustworthy", ascending=False)
#Sorting the data by "speaker_trusted by conversational partner" values
df_sorted2 = df.sort_values("speaker_trusted by conversational partner", ascending=False)

#extract only ER_text1
#EE_text1 = df_sorted1['ER_text']
EE_text1 = df_sorted1['ER_text'][:10]
EE_text1 = EE_text1.astype('str')

#extract only EE_text2
EE_text2 = df_sorted2['ER_text'][:10]
EE_text2 = EE_text2.astype('str')

# EE_text1
#remove non-ascii characters
EE_text1 = EE_text1.map(lambda x: _removeNonAscii(x))

#drop duplicates
EE_text1.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
EE_lemmatized1 = EE_text1.map(clean_text)

#turn all comments' tokens into one single list
unlist_EE1 = [item for items in EE_lemmatized1 for item in items]

##########################################################################

#EE_text2
#remove non-ascii characters
EE_text2 = EE_text2.map(lambda x: _removeNonAscii(x))

#drop duplicates
EE_text2.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
EE_lemmatized2 = EE_text2.map(clean_text)

#turn all comments' tokens into one single list
unlist_EE2 = [item for items in EE_lemmatized2 for item in items]

### **"speaker_trustworthy"**

In [16]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# EE w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_EE1)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_EE1 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_EE1_c = filtered_bi_EE1[:20].bigram.values

#print the filtered bigrams
filtered_bi_EE1.head(10)

Unnamed: 0,bigram,freq
175,"(many, people)",2
624,"(federal, government)",2
254,"(element, )",2
502,"(grocery, store)",2
110,"( , n)",2
803,"(old, legacy)",1
804,"(legacy, system)",1
862,"(specific, solution)",1
654,"(big, funding)",1
793,"(understand, identity)",1


In [17]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_EE1_c = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"( , but)",2.666793
1,"( , and)",2.495966
2,"( , you)",1.929827
3,"( , I)",1.919774


### **"speaker_trusted by conversational partner"**

In [18]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# EE w/ context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_EE2)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_EE2 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_EE2_c = filtered_bi_EE2[:20].bigram.values

#print the filtered bigrams
filtered_bi_EE2.head(10)

Unnamed: 0,bigram,freq
1477,"(San, Francisco)",5
12,"(interest, rate)",3
726,"(net, zero)",3
817,"(vaccine, injury)",3
818,"(injury, compensation)",3
819,"(compensation, fund)",3
143,"(next, year)",2
141,"(rate, hike)",2
1609,"(conference, website)",2
1601,"(design, conference)",2


In [19]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_EE2_c = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"(you, know)",5.241712
1,"(go, to)",5.003632
2,"(if, you)",4.916617
3,"(you, re)",4.270625
4,"(there, be)",3.737009
5,"(have, a)",3.460103
6,"(in, the)",3.326247
7,"(and, I)",2.531895
8,"(know, )",2.453511
9,"( , but)",2.395795


# **Interviewee without Context**

In [20]:
#Importing EE_nc_Transcript data set and dropping the Unnamed column
df = pd.read_csv("EE_nc_transcript.csv").drop("Unnamed: 0", axis=1)

##########################################################################

#Preprocessing
#Sorting the data by "speaker_trustworthy" values
df_sorted1 = df.sort_values("speaker_trustworthy", ascending=False)
#Sorting the data by "speaker_trusted by conversational partner" values
df_sorted2 = df.sort_values("speaker_trusted by conversational partner", ascending=False)

#extract only ER_text1
#EE_text1 = df_sorted1['ER_text']
EE_text1 = df_sorted1['ER_text'][:10]
EE_text1 = EE_text1.astype('str')

#extract only EE_text2
EE_text2 = df_sorted2['ER_text'][:10]
EE_text2 = EE_text2.astype('str')

# EE_text1
#remove non-ascii characters
EE_text1 = EE_text1.map(lambda x: _removeNonAscii(x))

#drop duplicates
EE_text1.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
EE_lemmatized1 = EE_text1.map(clean_text)

#turn all comments' tokens into one single list
unlist_EE1 = [item for items in EE_lemmatized1 for item in items]

##########################################################################

#EE_text2
#remove non-ascii characters
EE_text2 = EE_text2.map(lambda x: _removeNonAscii(x))

#drop duplicates
EE_text2.drop_duplicates(inplace=True)

#apply function to clean and lemmatize comments
EE_lemmatized2 = EE_text2.map(clean_text)

#turn all comments' tokens into one single list
unlist_EE2 = [item for items in EE_lemmatized2 for item in items]

### **"speaker_trustworthy"**

In [21]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# EE w/o context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_EE1)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_EE1 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_EE1_nc = filtered_bi_EE1[:20].bigram.values

#print the filtered bigrams
filtered_bi_EE1.head(10)

Unnamed: 0,bigram,freq
479,"(grocery, store)",4
250,"(federal, government)",2
209,"(element, )",2
62,"( , n)",2
130,"(many, people)",2
695,"(old, legacy)",1
696,"(legacy, system)",1
769,"(different, form)",1
778,"(third, law)",1
758,"(specific, solution)",1


In [22]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_EE1_nc = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"( , there)",2.287891
1,"( , and)",2.259876
2,"( , so)",2.256182
3,"( , I)",2.010357
4,"( , you)",1.840432


### **"speaker_trusted by conversational partner"**

In [23]:
# Initialize NLTK's Bigrams Finder
bigrams = nltk.collocations.BigramAssocMeasures()

# EE w/o context
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_EE2)

bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

#filter bigrams
filtered_bi_EE2 = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

# to compare
freq_bi_EE2_nc = filtered_bi_EE2[:20].bigram.values

#print the filtered bigrams
filtered_bi_EE2.head(10)

Unnamed: 0,bigram,freq
129,"(grocery, store)",4
387,"(element, )",2
247,"( , n)",2
686,"(federal, government)",2
311,"(many, people)",2
767,"(financial, year)",1
716,"(funding, decision)",1
715,"(big, funding)",1
736,"(different, infrastructure)",1
737,"(infrastructure, project)",1


In [24]:
#filter for only those with more than 10 occurences
bigramFinder.apply_freq_filter(10)
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

# to compare
pmi_bi_EE2_nc = bigramPMITable[:20].bigram.values

#print the values
bigramPMITable

Unnamed: 0,bigram,PMI
0,"( , and)",2.235047
1,"( , there)",2.142397
2,"( , I)",2.12729
3,"( , so)",2.12729
4,"( , you)",1.772941


**Comparison**

In [25]:
#Bigram comparison of ER with Freq and PMI
bigramsCompare_ER = pd.DataFrame([freq_bi_ER1_c, freq_bi_ER2_c, freq_bi_ER1_nc, freq_bi_ER2_nc, pmi_bi_ER1_c, pmi_bi_ER2_c, pmi_bi_ER1_nc, pmi_bi_ER2_nc]).T
bigramsCompare_ER.columns = ['Filtered Freq on ER w/ C on trustworthy', 'Filtered Freq on ER w/ C on trusts', 'Filtered Freq on ER w/o C on trustworthy', 'Filtered Freq on ER w/o C on trusts', 'PMI on ER w/ C on trustworthy', 'PMI on ER w/ C on trusts', 'PMI on ER w/o C on trustworthy', 'PMI on ER w/o C on trusts']

bigramsCompare_ER

Unnamed: 0,Filtered Freq on ER w/ C on trustworthy,Filtered Freq on ER w/ C on trusts,Filtered Freq on ER w/o C on trustworthy,Filtered Freq on ER w/o C on trusts,PMI on ER w/ C on trustworthy,PMI on ER w/ C on trusts,PMI on ER w/o C on trustworthy,PMI on ER w/o C on trusts
0,"(AirB, B)","(U, S)","(live, action)","(live, action)",,,"( , and)","( , so)"
1,"(time, mitigate)","(Climate, Agreement)","(overarch, narrative)","(anime, series)",,,"( , I)","( , and)"
2,"(guest, arrival)","(AirB, B)","(anime, series)","(overarch, narrative)",,,,"( , I)"
3,"(worth, thirty)","(Paris, Climate)","(action, series)","(action, series)",,,,
4,"(AirB, Bs)","(minute, play)","(carbon, dioxide)","(show, call)",,,,
5,"(New, Year)","(good, thing)","(dioxide, increase)","(political, correctness)",,,,
6,"(last, week)","(regular, way)","(high, profile)","(time, mitigate)",,,,
7,"(political, correctness)","(Trump, pull)","(kool, aid)","(make, sense)",,,,
8,"(show, call)","(President, Trump)","(brilliant, thinker)","(traditional, bank)",,,,
9,"(school, life)","(insane, breakthrough)","(Steve, Jobs)","(school, life)",,,,


In [27]:
#Bigram comparison of EE with Freq and PMI
bigramsCompare_EE = pd.DataFrame([freq_bi_EE1_c, freq_bi_EE2_c, freq_bi_EE1_nc, freq_bi_EE2_nc, pmi_bi_EE1_c, pmi_bi_EE2_c, pmi_bi_EE1_nc, pmi_bi_EE2_nc]).T
bigramsCompare_EE.columns = ['Filtered Freq on EE w/ C on trustworthy', 'Filtered Freq on EE w/ C on trusted', 'Filtered Freq on EE w/o C on trustworthy', 'Filtered Freq on EE w/o C on trusted', 'PMI on EE w/ C on trustworthy', 'PMI on EE w/ C on trusted', 'PMI on EE w/o C on trustworthy', 'PMI on EE w/o C on trusted']

bigramsCompare_EE

Unnamed: 0,Filtered Freq on EE w/ C on trustworthy,Filtered Freq on EE w/ C on trusted,Filtered Freq on EE w/o C on trustworthy,Filtered Freq on EE w/o C on trusted,PMI on EE w/ C on trustworthy,PMI on EE w/ C on trusted,PMI on EE w/o C on trustworthy,PMI on EE w/o C on trusted
0,"(many, people)","(San, Francisco)","(grocery, store)","(grocery, store)","( , but)","(you, know)","( , there)","( , and)"
1,"(federal, government)","(interest, rate)","(federal, government)","(element, )","( , and)","(go, to)","( , and)","( , there)"
2,"(element, )","(net, zero)","(element, )","( , n)","( , you)","(if, you)","( , so)","( , I)"
3,"(grocery, store)","(vaccine, injury)","( , n)","(federal, government)","( , I)","(you, re)","( , I)","( , so)"
4,"( , n)","(injury, compensation)","(many, people)","(many, people)",,"(there, be)","( , you)","( , you)"
5,"(old, legacy)","(compensation, fund)","(old, legacy)","(financial, year)",,"(have, a)",,
6,"(legacy, system)","(next, year)","(legacy, system)","(funding, decision)",,"(in, the)",,
7,"(specific, solution)","(rate, hike)","(different, form)","(big, funding)",,"(and, I)",,
8,"(big, funding)","(conference, website)","(third, law)","(different, infrastructure)",,"(know, )",,
9,"(understand, identity)","(design, conference)","(specific, solution)","(infrastructure, project)",,"( , but)",,


In [None]:
# import textstat

# def features_generator(raw_text,ee):
#   syllable_count, reading_time, flesch_kincaid_grade, letter_count, sentence_count = [], [], [],[],[]
#   for sent in raw_text.iloc[:,-1]:
#     flesch_kincaid = textstat.flesch_kincaid_grade(sent)
#     time = textstat.reading_time(sent)
#     syllable = textstat.syllable_count(sent)
#     l_count = textstat.letter_count(sent, ignore_spaces=True)
#     text_count = textstat.sentence_count(sent)
#     flesch_kincaid_grade.append(flesch_kincaid)
#     reading_time.append(time)
#     syllable_count.append(syllable)
#     letter_count.append(l_count)
#     sentence_count.append(text_count)
#   df = pd.DataFrame([flesch_kincaid_grade,reading_time,syllable_count,letter_count, sentence_count])
#   df = df.T
#   df['speaker_trustworthy'] = raw_text['speaker_trustworthy'].astype(int)
#   if ee:
#     df['speaker_trusted by conversational partner'] = raw_text['speaker_trusted by conversational partner'].astype(int)
#     df.columns = ["flesch_kincaid_grade","reading_time","syllable_count","letter_count", "sentence_count",'speaker_trustworthy','speaker_trusted by conversational partner']
#   else:
#     df['speaker_trusts conversational partner'] = raw_text['speaker_trusts conversational partner'].astype(int)
#     df.columns = ["flesch_kincaid_grade","reading_time","syllable_count","letter_count", "sentence_count",'speaker_trustworthy','speaker_trusts conversational partner']
#   return df
