# Import packages

In [1]:
import nltk
import pandas as pd
import os
import re
import string

# Read-in dataset



In [2]:
df = pd.read_excel('Cleaned/final/all_sessions.xlsx')

In [3]:
# Brief glance at data

df.head()

Unnamed: 0,Date,Question,Response
0,2024-03-19,Have you heard of the crisis of reproducibility?:,"Yes, I think so."
1,2024-03-19,Have you heard of the crisis of reproducibility?:,"Yes, I think so."
2,2024-03-19,Have you heard of the crisis of reproducibility?:,"No, I don't think so."
3,2024-03-19,Have you heard of the crisis of reproducibility?:,"No, I don't think so."
4,2024-03-19,Have you heard of the crisis of reproducibility?:,"Yes, and I have read all about it."


In [4]:
# Quick glance at questions - need to narrow down ones we want in wordcloud

df.Question.unique()

array(['Have you heard of the crisis of reproducibility?:',
       'Have you tried to reproduce any research before?:',
       'What did you try (or want to try) to repdroduce?:',
       'Can you name some factors behind this crisis?:',
       'Does the reproducibility crisis affect some researhers more than others?:',
       'What impacts matter most to you?:',
       'Why do these impacts affect some researchers more than others?:',
       'What "soft fraud" or questionable research practicises have you seen/done/heard of?:',
       'What makes researchers feel (especially) vulnerable:',
       'What might reduce feelings of researcher vulnerability?:',
       'Which of these reactions makes more sense to you?:',
       'Do these proposed solutions make the vulnerability better or worse?:',
       'Any additional thoughts on reproducibility in the "soft sciences"?:',
       'Have you heard of GTD?:',
       'What capture tools have you used or wanted to use?:',
       'What can you t

# Clean data

In [5]:
# Get rid of numerical answers

df = df[~df['Response'].astype(str).str.match(r'^\s*\d+\.?\d*\s*$')]

In [6]:
# Manually identified questions that don't include freetext responses
# Removed from dataset

df.Question.unique()

array(['Have you heard of the crisis of reproducibility?:',
       'Have you tried to reproduce any research before?:',
       'What did you try (or want to try) to repdroduce?:',
       'Can you name some factors behind this crisis?:',
       'Does the reproducibility crisis affect some researhers more than others?:',
       'What impacts matter most to you?:',
       'Why do these impacts affect some researchers more than others?:',
       'What "soft fraud" or questionable research practicises have you seen/done/heard of?:',
       'What makes researchers feel (especially) vulnerable:',
       'What might reduce feelings of researcher vulnerability?:',
       'Which of these reactions makes more sense to you?:',
       'Do these proposed solutions make the vulnerability better or worse?:',
       'Any additional thoughts on reproducibility in the "soft sciences"?:',
       'Have you heard of GTD?:',
       'What capture tools have you used or wanted to use?:',
       'What can you t

In [7]:
# Specify unwanted questions

unwanted_qs = ['Have you heard of the crisis of reproducibility?:', 'Have you tried to reproduce any research before?:', 'What did you try (or want to try) to repdroduce?:', 'Which of these reactions makes more sense to you?:', 'Have you heard of GTD?:', 'What do you think is harder?:', 'Do your projects have a legacy plan?:', 'Have you ever asked a colleague to see any of their formal documents?:', 'Have you used git?:']

In [8]:
# Subset based on unwanted questions - use tilde to specify we DON'T want this data

df = df[~df.Question.isin(unwanted_qs)].reset_index(drop = True)

In [9]:
df.Question.unique()

array(['Can you name some factors behind this crisis?:',
       'Does the reproducibility crisis affect some researhers more than others?:',
       'What impacts matter most to you?:',
       'Why do these impacts affect some researchers more than others?:',
       'What "soft fraud" or questionable research practicises have you seen/done/heard of?:',
       'What makes researchers feel (especially) vulnerable:',
       'What might reduce feelings of researcher vulnerability?:',
       'Do these proposed solutions make the vulnerability better or worse?:',
       'Any additional thoughts on reproducibility in the "soft sciences"?:',
       'What capture tools have you used or wanted to use?:',
       'What can you tell us about how you combine and/or sort?:',
       'What motivates or incentivises you?:',
       'Which of these has frustrated you?:',
       'Share some of collab wins and/or fails!:',
       'Any more ideas about documentation?:',
       'What tools do you like to use f

# Pre-process data

In [10]:
# First, let's lowercase our string entries in response column
# We'll overwrite response column as this isn't a major change

df['Lower_cased'] = df['Response'].str.lower()

In [11]:
# Check it's worked..

df.head()

Unnamed: 0,Date,Question,Response,Lower_cased
0,2024-03-19,Can you name some factors behind this crisis?:,Code Data Methods,code data methods
1,2024-03-19,Can you name some factors behind this crisis?:,Ignorance,ignorance
2,2024-03-19,Can you name some factors behind this crisis?:,pressure_to_publish,pressure_to_publish
3,2024-03-19,Can you name some factors behind this crisis?:,Data,data
4,2024-03-19,Does the reproducibility crisis affect some re...,Less emphasis in certain fields about document...,less emphasis in certain fields about document...


In [12]:
# Replace "/" and "_" with " "
df['Cleaned_Response'] = df['Lower_cased'].str.replace('/', ' ', regex=False).str.replace('_', ' ', regex=False)

In [13]:
# Tokenise strings in response column
# Now, let's create a new column to preserve initial string entries

df['Tokenised'] = df.apply(lambda row: nltk.word_tokenize(row['Cleaned_Response']), axis=1)

In [14]:
df.head()

Unnamed: 0,Date,Question,Response,Lower_cased,Cleaned_Response,Tokenised
0,2024-03-19,Can you name some factors behind this crisis?:,Code Data Methods,code data methods,code data methods,"[code, data, methods]"
1,2024-03-19,Can you name some factors behind this crisis?:,Ignorance,ignorance,ignorance,[ignorance]
2,2024-03-19,Can you name some factors behind this crisis?:,pressure_to_publish,pressure_to_publish,pressure to publish,"[pressure, to, publish]"
3,2024-03-19,Can you name some factors behind this crisis?:,Data,data,data,[data]
4,2024-03-19,Does the reproducibility crisis affect some re...,Less emphasis in certain fields about document...,less emphasis in certain fields about document...,less emphasis in certain fields about document...,"[less, emphasis, in, certain, fields, about, d..."


In [15]:
# Remove punctuation by defining translation table and applying it to tokenised lists in column
translator = str.maketrans('', '', string.punctuation)

df['Remove_punct'] = df['Tokenised'].apply(lambda tokens: [token.translate(translator) for token in tokens])

In [16]:
df.head(50)

Unnamed: 0,Date,Question,Response,Lower_cased,Cleaned_Response,Tokenised,Remove_punct
0,2024-03-19,Can you name some factors behind this crisis?:,Code Data Methods,code data methods,code data methods,"[code, data, methods]","[code, data, methods]"
1,2024-03-19,Can you name some factors behind this crisis?:,Ignorance,ignorance,ignorance,[ignorance],[ignorance]
2,2024-03-19,Can you name some factors behind this crisis?:,pressure_to_publish,pressure_to_publish,pressure to publish,"[pressure, to, publish]","[pressure, to, publish]"
3,2024-03-19,Can you name some factors behind this crisis?:,Data,data,data,[data],[data]
4,2024-03-19,Does the reproducibility crisis affect some re...,Less emphasis in certain fields about document...,less emphasis in certain fields about document...,less emphasis in certain fields about document...,"[less, emphasis, in, certain, fields, about, d...","[less, emphasis, in, certain, fields, about, d..."
5,2024-03-19,Does the reproducibility crisis affect some re...,Lack of training,lack of training,lack of training,"[lack, of, training]","[lack, of, training]"
6,2024-03-19,Does the reproducibility crisis affect some re...,Less technological fields: literature for example,less technological fields: literature for example,less technological fields: literature for example,"[less, technological, fields, :, literature, f...","[less, technological, fields, , literature, fo..."
7,2024-03-19,Does the reproducibility crisis affect some re...,Fear of research being taken,fear of research being taken,fear of research being taken,"[fear, of, research, being, taken]","[fear, of, research, being, taken]"
8,2024-03-19,Does the reproducibility crisis affect some re...,The field of the research,the field of the research,the field of the research,"[the, field, of, the, research]","[the, field, of, the, research]"
9,2024-03-19,Does the reproducibility crisis affect some re...,"Type of work; article, poster, paper, workshop...","type of work; article, poster, paper, workshop...","type of work; article, poster, paper, workshop...","[type, of, work, ;, article, ,, poster, ,, pap...","[type, of, work, , article, , poster, , paper,..."


In [17]:
# Special phrases and their possible variations
special_phrases = ["imposter syndrome", "to do list", "meta data", "p-hacking", "share point"]


import spacy
from spacy.matcher import Matcher

# Load SpaCy's English-language model
nlp = spacy.load("en_core_web_sm")

# Create the Matcher object and add the special phrases as patterns
matcher = Matcher(nlp.vocab)
for phrase in special_phrases:
    pattern = [{"LOWER": word} for word in phrase.split()]  # Create a pattern for the current phrase
    # Create a pattern for the current phrase
    matcher.add(phrase.upper().replace(" ", "_"), [pattern])  # Add the pattern to the matcher

    
def merge_phrases(tokens):
    # Join tokens to form a single string (necessary for SpaCy processing)
    text = " ".join(tokens)
    doc = nlp(text)
    matches = matcher(doc)
    with doc.retokenize() as retokenizer:
        for _, start, end in matches:
            retokenizer.merge(doc[start:end])

    # Extract the merged tokens from the doc
    merged_tokens = [token.text for token in doc]
    return merged_tokens

In [18]:
df['Phrases_pres'] = df['Remove_punct'].apply(merge_phrases)

In [19]:
df

Unnamed: 0,Date,Question,Response,Lower_cased,Cleaned_Response,Tokenised,Remove_punct,Phrases_pres
0,2024-03-19,Can you name some factors behind this crisis?:,Code Data Methods,code data methods,code data methods,"[code, data, methods]","[code, data, methods]","[code, data, methods]"
1,2024-03-19,Can you name some factors behind this crisis?:,Ignorance,ignorance,ignorance,[ignorance],[ignorance],[ignorance]
2,2024-03-19,Can you name some factors behind this crisis?:,pressure_to_publish,pressure_to_publish,pressure to publish,"[pressure, to, publish]","[pressure, to, publish]","[pressure, to, publish]"
3,2024-03-19,Can you name some factors behind this crisis?:,Data,data,data,[data],[data],[data]
4,2024-03-19,Does the reproducibility crisis affect some re...,Less emphasis in certain fields about document...,less emphasis in certain fields about document...,less emphasis in certain fields about document...,"[less, emphasis, in, certain, fields, about, d...","[less, emphasis, in, certain, fields, about, d...","[less, emphasis, in, certain, fields, about, d..."
...,...,...,...,...,...,...,...,...
100,2024-03-19,"If you can't share your data, describe it. Wha...",Data collection method. Sample size. Variable ...,data collection method. sample size. variable ...,data collection method. sample size. variable ...,"[data, collection, method, ., sample, size, .,...","[data, collection, method, , sample, size, , v...","[data, collection, method, , sample, size, ,..."
101,2024-03-19,"If you can't share your data, describe it. Wha...","dimensions, type of variables, size","dimensions, type of variables, size","dimensions, type of variables, size","[dimensions, ,, type, of, variables, ,, size]","[dimensions, , type, of, variables, , size]","[dimensions, , type, of, variables, , size]"
102,2024-03-19,"If you can't share your data, describe it. Wha...","Type, scope, collection methods, visualisations?","type, scope, collection methods, visualisations?","type, scope, collection methods, visualisations?","[type, ,, scope, ,, collection, methods, ,, vi...","[type, , scope, , collection, methods, , visua...","[type, , scope, , collection, methods, , vi..."
103,2024-03-19,"If you can't share your data, describe it. Wha...",We try to include a data template &/or data di...,we try to include a data template &/or data di...,we try to include a data template & or data di...,"[we, try, to, include, a, data, template, &, o...","[we, try, to, include, a, data, template, , or...","[we, try, to, include, a, data, template, , o..."


In [20]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet resources from NLTK
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLTK's lemmatizer and stop words set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def process_text(word_list):
    # Remove stop words and lemmatize the remaining words
    return [lemmatizer.lemmatize(word) for word in word_list if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/loucap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/loucap/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
df['Rem_stop_words'] = df['Phrases_pres'].apply(process_text)

In [22]:
tokens = df['Rem_stop_words'].explode()

# Get value counts of each token
token_counts = tokens.value_counts()

print(token_counts)

Rem_stop_words
               135
data            12
code            10
research         9
time             8
              ... 
sticker          1
planner          1
reward           1
feel             1
aggregation      1
Name: count, Length: 428, dtype: int64


In [23]:
tokens = df['Rem_stop_words'].explode()

# Get value counts of each token
token_counts = tokens.value_counts()

print(token_counts)

Rem_stop_words
               135
data            12
code            10
research         9
time             8
              ... 
sticker          1
planner          1
reward           1
feel             1
aggregation      1
Name: count, Length: 428, dtype: int64


In [29]:
# Filter out rows with empty or whitespace-only strings, then explode
tokens = df['Rem_stop_words'][df['Rem_stop_words'].str.strip() != ''].explode()

# Get value counts of each token
token_counts = tokens.value_counts()

final = token_counts[1:,]

In [33]:
final.nlargest(40)

Rem_stop_words
data             12
code             10
research          9
time              8
week              7
variable          7
project           6
note              6
documentation     6
different         5
etc               5
using             5
colleague         5
use               5
list              5
method            5
field             5
good              4
make              4
set               4
team              4
task              4
email             4
notion            4
document          4
used              4
git               4
result            3
journal           3
reporting         3
back              3
phacking          3
wait              3
concern           3
template          3
version           3
’                 3
wrong             3
try               3
old               3
Name: count, dtype: int64

In [25]:
df.to_excel('test.xlsx', index = False)