# Preprocessing the Metadata

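This notebook loads the cleaned cohort metadata, builds a de-duplicated list of author names (first name followed by surname), and derives the publication dates used to search for related news articles.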

## Setup 

#### Dependencies

In [1]:
# ----- GENERAL DEPENDENCIES ----- #
#operating system 
import os 
import sys
sys.path.append(os.path.join(".."))

#File paths and time-keeping 
import glob
from tqdm import tqdm

#Text processing and plotting 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Paths 

In [2]:
# --- Output directory ---
output_directory = "../out/"

# exist_ok=True makes this a no-op when the directory is already there, so no
# separate existence check is needed (and there is no window between the check
# and the creation in which another process could create the directory).
os.makedirs(output_directory, exist_ok=True)

In [3]:
# --- Data path ---
# Directory prefix for the metadata CSV. The trailing slash matters: the file
# name is appended to this value with plain string concatenation.
data_path = '../data/meta_data/'

#### Load in the Metadata

In [4]:
# Load the cleaned cohort metadata; the first CSV column is used as the row index.
metadata = pd.read_csv(
    filepath_or_buffer=data_path + "full_cohort_meta_cleaned.csv",
    index_col=0,
)

In [5]:
# Row count first, then a peek at the first five records
print(metadata.shape[0])
metadata.head(5)

84


Unnamed: 0,Authors,Author.Full.Names,Article.Title,Source.Title,Language,Document.Type,Author.Keywords,Keywords.Plus,Abstract,Email.Addresses,...,Publication.Date,Publication.Year,DOI,Book.DOI,Pubmed.Id,Open.Access.Designations,Highly.Cited.Status,Hot.Paper.Status,Publication.Month,Date
1,"Griffiths, RR; Johnson, MW; Carducci, MA; Umbr...","Griffiths, Roland R.; Johnson, Matthew W.; Car...",Psilocybin produces substantial and sustained ...,JOURNAL OF PSYCHOPHARMACOLOGY,English,Article,Psilocybin; hallucinogen; cancer; anxiety; dep...,QUALITY-OF-LIFE; MYSTICAL EXPERIENCE QUESTIONN...,"Cancer patients often develop chronic, clinica...",rgriff@jhmi.edu,...,12,2016,10.1177/0269881116675513,,27909165.0,"hybrid, Green Published",Y,N,Dec,2016-12-01
2,"Grob, CS; Danforth, AL; Chopra, GS; Hagerty, M...","Grob, Charles S.; Danforth, Alicia L.; Chopra,...",Pilot Study of Psilocybin Treatment for Anxiet...,ARCHIVES OF GENERAL PSYCHIATRY,English,Article,,PSYCHOTHERAPY,Context: Researchers conducted extensive inves...,cgrob@labiomed.org,...,1,2011,10.1001/archgenpsychiatry.2010.116,,20819978.0,Bronze,Y,N,Jan,2011-01-01
3,"Nichols, DE","Nichols, David E.",Psychedelics,PHARMACOLOGICAL REVIEWS,English,Review,,LYSERGIC-ACID DIETHYLAMIDE; SEROTONIN 5-HT2A R...,Psychedelics (serotonergic hallucinogens) are ...,drdave@purdue.edu,...,4,2016,10.1124/pr.115.011478,,26841800.0,"Bronze, Green Published",Y,N,Apr,2016-04-01
4,"Carhart-Harris, RL; Bolstridge, M; Rucker, J; ...","Carhart-Harris, Robin L.; Bolstridge, Mark; Ru...",Psilocybin with psychological support for trea...,LANCET PSYCHIATRY,English,Article,,LYSERGIC-ACID DIETHYLAMIDE; 5-HT2A RECEPTOR; LSD,Background Psilocybin is a serotonin receptor ...,r.carhart-harris@imperial.ac.uk,...,7,2016,10.1016/S2215-0366(16)30065-7,,27210031.0,"Green Published, Green Submitted, hybrid",Y,N,Jul,2016-07-01
5,"Carhart-Harris, RL; Erritzoe, D; Williams, T; ...","Carhart-Harris, Robin L.; Erritzoe, David; Wil...",Neural correlates of the psychedelic state as ...,PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCE...,English,Article,default mode network; hallucinogens; serotonin...,MEDIAL PREFRONTAL CORTEX; MYSTICAL-TYPE EXPERI...,Psychedelic drugs have a long history of use i...,d.nutt@imperial.ac.uk,...,2,2012,10.1073/pnas.1119598109,,22308440.0,"Bronze, Green Published",Y,N,Feb,2012-02-01


## Extract list of author names

#### Get the author names into a list of individual authors 

In [6]:
# --- Pull out the column holding each paper's full author names ---
authors = metadata.loc[:, 'Author.Full.Names']

In [None]:
# --- report the container type and the number of entries, one per line ---
print(type(authors), len(authors), sep='\n')

In [None]:
# --- convert series to list ---
# Read straight from the dataframe instead of from `authors` itself. The
# self-referential form (`authors = authors.values.tolist()`) only works on the
# first run: afterwards `authors` is a plain list, which has no `.values`
# attribute, so re-running the cell raised AttributeError.
authors = metadata['Author.Full.Names'].tolist()
print(type(authors))
print(len(authors))

In [None]:
# --- inspect authors of one paper ---
# Shows the first element of `authors` (index 0).
authors[0]

In [None]:
# --- split into individual authors ---
# Every entry is broken on ';' and the pieces are flattened into one list,
# keeping paper order and the order of authors within each paper.
authors_split = [
    piece
    for entry in authors
    for piece in entry.split(';')
]

In [None]:
# Report how many author entries were found (message typo "withint" corrected),
# then show the first ten entries.
print(f"There are {len(authors_split)} authors within the research papers.\n")
authors_split[:10]

#### Reverse the author names

_We're doing this because news articles typically give the first name followed by the surname._

In [None]:
# --- reverse the author names (to have first name, then surname) ---
# Entries are split on ',' and the first two parts are swapped. Indexing the
# split result with [1] unconditionally raises IndexError for any entry that
# has no comma (for example a blank fragment left behind by a trailing ';'),
# so such entries are handled explicitly.
author_names = []

for author in authors_split:
    name_parts = author.split(',')
    if len(name_parts) < 2:
        # No comma means there is nothing to reverse: keep a non-blank entry
        # unchanged and drop blank ones.
        if author.strip():
            author_names.append(author)
        continue
    name_reversed = name_parts[1] + ' ' + name_parts[0]
    author_names.append(name_reversed)

print(len(author_names))

In [None]:
# inspect the first five entries of `author_names`
author_names[:5]

#### Standardise spacing 

In [None]:
# --- Standardise spacing before and after names ---
# strip() trims whitespace from both ends of each name in one call
names_standardised_space = [raw_name.strip() for raw_name in author_names]

print(len(names_standardised_space))

#### Remove initials

In [None]:
# --- Remove initials ---
def _strip_initials(name):
    """Return `name` with every "X." style initial removed.

    For each '.' in the name, the dot and the single character immediately
    before it are removed. The loop repeats until no dot is left, so names
    with any number of initials are handled in this one pass.

    Parameters
    ----------
    name : str
        An author name, possibly containing initials such as "R.".

    Returns
    -------
    str
        The name without dots or the characters that directly preceded them.
        Inner spacing is left untouched.
    """
    while '.' in name:
        stop = name.find('.')
        # Clamp at 0: with a leading dot, `stop - 1` would be -1 and the slice
        # `name[0:-1]` would keep almost the whole string instead of nothing.
        start = max(stop - 1, 0)
        name = name[:start] + name[stop + 1:]
    return name


names_remove_initials = [_strip_initials(name) for name in names_standardised_space]

print(names_remove_initials[:5])
print(f"There are {len(names_remove_initials)} authors.")

In [None]:
# --- remove second layer of initials ---
# Names without a dot pass through unchanged; otherwise the first dot and the
# character directly before it are cut out.
names_remove_initials2 = []

for current in names_remove_initials:
    dot_at = current.find('.')
    if dot_at == -1:
        names_remove_initials2.append(current)
        continue
    names_remove_initials2.append(current[:dot_at - 1] + current[dot_at + 1:])

print(f"There are {len(names_remove_initials2)} authors.")



In [None]:
# --- remove third layer of initials ---
# Same rule as the earlier passes: drop the first dot together with the one
# character in front of it; names with no dot are kept as they are.
names_remove_initials3 = [
    entry[:entry.find('.') - 1] + entry[entry.find('.') + 1:] if '.' in entry else entry
    for entry in names_remove_initials2
]

print(f"There are {len(names_remove_initials3)} authors.")



#### Standardise spacing between names 

In [None]:
# --- Fix spacing between names ---
# Splitting on whitespace and re-joining with ' ' collapses every run of
# whitespace inside a name to a single space.
names_one_space_between = [' '.join(name.split()) for name in names_remove_initials3]

names_one_space_between[:5]

#### Centre names with space before and after 

In [None]:
# --- Add space before and after name to enable easier searching for them ---
# One space is placed on each side of every name.
names_of_authors = [' ' + name + ' ' for name in names_one_space_between]

names_of_authors[:5]

#### Finally, remove duplicates 

In [None]:
# --- Remove duplicates, keeping the first occurrence of each name in order ---
print(f"There were originally {len(names_of_authors)} names in the author list.")

already_seen = set()
names_of_authors_no_dups = []
for candidate in names_of_authors:
    if candidate not in already_seen:
        already_seen.add(candidate)
        names_of_authors_no_dups.append(candidate)

print(f"With duplicates removed, there are now {len(names_of_authors_no_dups)} names in the author list.")

In [None]:
# --- inspect author list ---
# NOTE(review): this rebinds `author_names`, a name that an earlier cell filled
# with the reversed (not yet cleaned) names. Re-running the spacing cell that
# reads `author_names` without first re-running the reversal cell will operate
# on this de-duplicated list instead.
author_names = names_of_authors_no_dups

author_names

#### Save the list of author names 

In [None]:
# Write the author names to a text file, one name per line.
output_path = '../data/preprocessed_data/author_names.txt'

# None of the cells visible in this notebook creates the target folder, so
# make sure it exists before opening the file for writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# `with` closes the file even if one of the writes raises.
with open(output_path, 'w') as output_file:
    for name in author_names:
        output_file.write(name + '\n')

## Extracting the Dates to Search

Here, we want to find the news articles that may be related to each of these research papers. To do so, we set up a search window for every paper: it starts at the paper's publication date (`Date`) and ends three months later (`DateShift`).

In [7]:
# --- Make a new dataframe for working with dates ---
# .copy() produces an independent frame. A plain `date_df = metadata` only adds
# a second name for the same object, so the column edits made to `date_df`
# would silently change `metadata` as well.
date_df = metadata.copy()

# --- Make date column into date_time format ---
date_df['Date'] = pd.to_datetime(date_df['Date'], format="%Y-%m-%d")
date_df[:3]

Unnamed: 0,Authors,Author.Full.Names,Article.Title,Source.Title,Language,Document.Type,Author.Keywords,Keywords.Plus,Abstract,Email.Addresses,...,Publication.Date,Publication.Year,DOI,Book.DOI,Pubmed.Id,Open.Access.Designations,Highly.Cited.Status,Hot.Paper.Status,Publication.Month,Date
1,"Griffiths, RR; Johnson, MW; Carducci, MA; Umbr...","Griffiths, Roland R.; Johnson, Matthew W.; Car...",Psilocybin produces substantial and sustained ...,JOURNAL OF PSYCHOPHARMACOLOGY,English,Article,Psilocybin; hallucinogen; cancer; anxiety; dep...,QUALITY-OF-LIFE; MYSTICAL EXPERIENCE QUESTIONN...,"Cancer patients often develop chronic, clinica...",rgriff@jhmi.edu,...,12,2016,10.1177/0269881116675513,,27909165.0,"hybrid, Green Published",Y,N,Dec,2016-12-01
2,"Grob, CS; Danforth, AL; Chopra, GS; Hagerty, M...","Grob, Charles S.; Danforth, Alicia L.; Chopra,...",Pilot Study of Psilocybin Treatment for Anxiet...,ARCHIVES OF GENERAL PSYCHIATRY,English,Article,,PSYCHOTHERAPY,Context: Researchers conducted extensive inves...,cgrob@labiomed.org,...,1,2011,10.1001/archgenpsychiatry.2010.116,,20819978.0,Bronze,Y,N,Jan,2011-01-01
3,"Nichols, DE","Nichols, David E.",Psychedelics,PHARMACOLOGICAL REVIEWS,English,Review,,LYSERGIC-ACID DIETHYLAMIDE; SEROTONIN 5-HT2A R...,Psychedelics (serotonergic hallucinogens) are ...,drdave@purdue.edu,...,4,2016,10.1124/pr.115.011478,,26841800.0,"Bronze, Green Published",Y,N,Apr,2016-04-01


In [8]:
# --- Add a column holding each date moved three months forward ---
three_months = pd.DateOffset(months=3)
date_df['DateShift'] = pd.DatetimeIndex(date_df['Date']) + three_months

In [9]:
# Show the first three rows of `date_df`
date_df[:3]

Unnamed: 0,Authors,Author.Full.Names,Article.Title,Source.Title,Language,Document.Type,Author.Keywords,Keywords.Plus,Abstract,Email.Addresses,...,Publication.Year,DOI,Book.DOI,Pubmed.Id,Open.Access.Designations,Highly.Cited.Status,Hot.Paper.Status,Publication.Month,Date,DateShift
1,"Griffiths, RR; Johnson, MW; Carducci, MA; Umbr...","Griffiths, Roland R.; Johnson, Matthew W.; Car...",Psilocybin produces substantial and sustained ...,JOURNAL OF PSYCHOPHARMACOLOGY,English,Article,Psilocybin; hallucinogen; cancer; anxiety; dep...,QUALITY-OF-LIFE; MYSTICAL EXPERIENCE QUESTIONN...,"Cancer patients often develop chronic, clinica...",rgriff@jhmi.edu,...,2016,10.1177/0269881116675513,,27909165.0,"hybrid, Green Published",Y,N,Dec,2016-12-01,2017-03-01
2,"Grob, CS; Danforth, AL; Chopra, GS; Hagerty, M...","Grob, Charles S.; Danforth, Alicia L.; Chopra,...",Pilot Study of Psilocybin Treatment for Anxiet...,ARCHIVES OF GENERAL PSYCHIATRY,English,Article,,PSYCHOTHERAPY,Context: Researchers conducted extensive inves...,cgrob@labiomed.org,...,2011,10.1001/archgenpsychiatry.2010.116,,20819978.0,Bronze,Y,N,Jan,2011-01-01,2011-04-01
3,"Nichols, DE","Nichols, David E.",Psychedelics,PHARMACOLOGICAL REVIEWS,English,Review,,LYSERGIC-ACID DIETHYLAMIDE; SEROTONIN 5-HT2A R...,Psychedelics (serotonergic hallucinogens) are ...,drdave@purdue.edu,...,2016,10.1124/pr.115.011478,,26841800.0,"Bronze, Green Published",Y,N,Apr,2016-04-01,2016-07-01


In [10]:
# Keep only the publication date and its shifted counterpart, then display them
dates_to_unzip = date_df.loc[:, ['Date', 'DateShift']]
dates_to_unzip

Unnamed: 0,Date,DateShift
1,2016-12-01,2017-03-01
2,2011-01-01,2011-04-01
3,2016-04-01,2016-07-01
4,2016-07-01,2016-10-01
5,2012-02-01,2012-05-01
...,...,...
80,2021-02-01,2021-05-01
81,2020-01-01,2020-04-01
82,2020-11-01,2021-02-01
83,2020-04-01,2020-07-01


In [11]:
# Second name for the same two-column date frame (an alias, not a copy)
test = dates_to_unzip 

In [14]:
# Sort the publication dates in ascending order. Reading from `dates_to_unzip`
# (rather than from `test` itself) keeps this cell re-runnable: `test` becomes
# a plain list here, and a list cannot be indexed with 'Date' on a second run.
test = sorted(dates_to_unzip['Date'])

In [18]:
# Display the current value of `test`
test

[Timestamp('2011-01-01 00:00:00'),
 Timestamp('2012-02-01 00:00:00'),
 Timestamp('2012-12-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2014-02-01 00:00:00'),
 Timestamp('2015-02-01 00:00:00'),
 Timestamp('2015-03-01 00:00:00'),
 Timestamp('2015-08-01 00:00:00'),
 Timestamp('2015-10-01 00:00:00'),
 Timestamp('2016-02-01 00:00:00'),
 Timestamp('2016-02-01 00:00:00'),
 Timestamp('2016-03-01 00:00:00'),
 Timestamp('2016-04-01 00:00:00'),
 Timestamp('2016-04-01 00:00:00'),
 Timestamp('2016-06-01 00:00:00'),
 Timestamp('2016-07-01 00:00:00'),
 Timestamp('2016-09-01 00:00:00'),
 Timestamp('2016-09-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2017-01-01 00:00:00'),
 Timestamp('2017-02-01 00:00:00'),
 Timestamp('2017-04-01 00:00:00'),
 Timestamp('2017-07-01 00:00:00'),
 Timestamp('2017-07-

In [20]:
# remove duplicates 
test = list(dict.fromkeys(test))
print(len(test))

48


In [23]:
# Sort a fresh copy of the dates in ascending order and display it
test = list(test)
test.sort()
test

[Timestamp('2011-01-01 00:00:00'),
 Timestamp('2012-02-01 00:00:00'),
 Timestamp('2012-12-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2014-02-01 00:00:00'),
 Timestamp('2015-02-01 00:00:00'),
 Timestamp('2015-03-01 00:00:00'),
 Timestamp('2015-08-01 00:00:00'),
 Timestamp('2015-10-01 00:00:00'),
 Timestamp('2016-02-01 00:00:00'),
 Timestamp('2016-03-01 00:00:00'),
 Timestamp('2016-04-01 00:00:00'),
 Timestamp('2016-06-01 00:00:00'),
 Timestamp('2016-07-01 00:00:00'),
 Timestamp('2016-09-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2017-01-01 00:00:00'),
 Timestamp('2017-02-01 00:00:00'),
 Timestamp('2017-04-01 00:00:00'),
 Timestamp('2017-07-01 00:00:00'),
 Timestamp('2017-08-01 00:00:00'),
 Timestamp('2017-09-01 00:00:00'),
 Timestamp('2017-10-01 00:00:00'),
 Timestamp('2017-11-01 00:00:00'),
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2018-02-01 00:00:00'),
 Timestamp('2018-03-01 00:00:00'),
 Timestamp('2018-06-01 00:00:00'),
 Timestamp('2018-07-