<a href="https://colab.research.google.com/github/rgranit/academix-ydata-project/blob/master/code/1_clean_NIH_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Outline</h3>

From the NIH merged database, take `PI_NAMEs` and create a new column `cleaned_name` which:
1. Removes any parenthesized or extra information from names (this includes possible alternative last names for a researcher).
2. Turns TUAN, ROCKY S -> Tuan, RS
3. If there are several names in a cell (separated by ';'), every cleaned name is kept and later split out into its own row, with the other columns duplicated.

## Initializations

In [0]:
import os, urllib, glob, sys
import subprocess
import urllib.parse  # `import urllib` alone does not guarantee the `parse` submodule is loaded
from getpass import getpass

# Clone the private project repository into ./AYP using credentials typed by the user.
user = input('User name: ')
password = getpass('Password: ')
# Percent-encode both credentials with safe='' so that reserved characters
# (including '/', which quote() leaves untouched by default) cannot break the URL.
user = urllib.parse.quote(user, safe='')
password = urllib.parse.quote(password, safe='')
clone_url = "https://{0}:{1}@github.com/rgranit/academix-ydata-project".format(user, password)

# Call git with an argument list (no shell), so nothing typed by the user is
# interpreted as shell syntax. Only the return code is kept: the CompletedProcess
# object would otherwise hold on to the URL that embeds the password.
return_code = subprocess.run(["git", "clone", clone_url, "AYP"]).returncode
cmd_string, clone_url, password = "", "", "" # removing the password from the variables
# NOTE(review): git records the remote URL of a clone in AYP/.git/config; confirm
# whether the embedded credentials should be stripped from it after cloning.

if return_code != 0:
    print("git clone failed with exit code {0}".format(return_code))

User name: ShaulSolomon
Password: ··········


In [0]:
% cd '/content/AYP/code'

/content/AYP/code


In [0]:
!pip install boto



In [0]:
''' Initializations '''
import re
import pandas as pd
import s3_functions
import numpy as np

## Implementation

In [0]:
# Fetch the pre-cleaning NIH table from S3, then discard the 'Unnamed: 0' column.
df = s3_functions.get_dataframe_from_s3(file='NIH_precleaning.csv')
df = df.drop(columns=['Unnamed: 0'])

In [307]:
# Show the first rows followed by the (rows, columns) shape of the table.
print(df.head(), df.shape, sep="\n")

       PMID PROJECT_NUMBER    FY    PI_IDS        PI_NAMEs
0  19415686    ZIAAR041131  2009  1858712;  TUAN, ROCKY S;
1  19650110    ZIAAR041131  2009  1858712;  TUAN, ROCKY S;
2  19283731    ZIAAR041131  2009  1858712;  TUAN, ROCKY S;
3  19274753    ZIAAR041131  2009  1858712;  TUAN, ROCKY S;
4  19479830    ZIAAR041131  2009  1858712;  TUAN, ROCKY S;
(5273791, 5)


In [0]:
def separate_names(names):
  '''
  Format a single investigator name as "Lastname, <initials>".

  Input: a name "LASTNAME, FIRSTNAME MIDDLENAME|MIDDLE_INITIAL" ex: TUAN, ROCKY S
  Return: "Lastname, Firstinitial_Middleinitial" ex: Tuan, RS
          np.nan when the input is not a string, or when it holds fewer than
          two whitespace-separated tokens (e.g. only a last name). Such data
          is considered unfit and is meant to be removed by the caller.
  '''
  if not isinstance(names, str):
    # Keep the diagnostic print of the offending value, but flag it as unfit
    # instead of returning an empty "cleaned" name.
    print(names)
    return np.nan

  # split() without an argument ignores repeated/leading/trailing whitespace,
  # so no empty tokens can appear.
  each_name = names.strip().replace(', ', ' ').split()
  if len(each_name) < 2:
    #Only has a last name (or nothing at all)
    return np.nan

  # NOTE: capitalize() lower-cases everything after the first character,
  # so "GLASNER-EDWARDS" becomes "Glasner-edwards".
  last_name = each_name[0].capitalize()
  initials = each_name[1][0].upper()
  if len(each_name) > 2:
    # A middle initial and a full middle name both contribute their first letter.
    initials += each_name[2][0].upper()
  return last_name + ", " + initials

def clean_name(name):
  '''
  Clean one cell of df["PI_NAMEs"]; a cell can list several principal
  investigators separated by ';'.

  input: A cell of df["PI_NAMEs"]
  output:
    - np.nan if the cell is not a string (e.g. a missing value)
    - "ERROR WITH NAME" if no name is left after cleaning
    - the result of separate_names() if exactly one name is left
    - otherwise a list of one-element lists, e.g. [["Tuan, RS"], ["Woody, GE"]],
      with one entry per name that separate_names() returned as a non-empty string
  '''
  if not isinstance(name, str):
    # Missing/non-text cells cannot be parsed; mark them as unfit.
    return np.nan

  name = name.lower()
  names = name.split(';')

  #remove contact names in database (the whole entry containing "(contact)" is dropped)
  names = [re.sub(r'.*\(contact\).*',"",entry) for entry in names]
  #remove optional other last name from name
  names = [re.sub(r' \(.*\)',"",entry) for entry in names]
  #remove extra spaces
  names = [re.sub(r' +',' ',entry) for entry in names]
  #for the few cases where have " , " instead of ", "
  names = [re.sub(r' , ',', ',entry) for entry in names]
  #remove empty strings; stripping first so whitespace-only fragments count as empty
  names = [entry.strip() for entry in names]
  names = [entry for entry in names if entry != ""]

  if len(names) == 0:
    print("ERROR WITH NAME: ", name)
    return "ERROR WITH NAME"

  if len(names) == 1:
    return separate_names(names[0])

  new_name = []
  for entry in names:
    added_name = separate_names(entry)
    # Unfit names come back as a non-string (NaN); with several people in the
    # cell we just skip that person instead of discarding the whole cell.
    if isinstance(added_name, str) and added_name != "":
      new_name.append([added_name])

  return new_name

In [0]:
# [clean_name(name) for name in df[df.index == 5273788].PI_NAMEs]

In [0]:
# Run clean_name over every PI_NAMEs cell and store the results in a new column.
df['cleaned_name'] = list(map(clean_name, df['PI_NAMEs']))

In [331]:
# Display the first five rows, now including the cleaned_name column.
df.head(n=5)

Unnamed: 0,PMID,PROJECT_NUMBER,FY,PI_IDS,PI_NAMEs,cleaned_name
0,19415686,ZIAAR041131,2009,1858712;,"TUAN, ROCKY S;","Tuan, RS"
1,19650110,ZIAAR041131,2009,1858712;,"TUAN, ROCKY S;","Tuan, RS"
2,19283731,ZIAAR041131,2009,1858712;,"TUAN, ROCKY S;","Tuan, RS"
3,19274753,ZIAAR041131,2009,1858712;,"TUAN, ROCKY S;","Tuan, RS"
4,19479830,ZIAAR041131,2009,1858712;,"TUAN, ROCKY S;","Tuan, RS"


In [0]:
#Get rid of NaN data (dropna() removes rows with a missing value in ANY column)
df.dropna(inplace=True)
# Sanity check that no unfit (NaN) cleaned names survived the drop.
assert not df['cleaned_name'].isna().any(), "cleaned_name still contains NaN after dropna()"

#Weirdly, the PI_IDS themselves were unclean, with empty spaces at the end, .strip() cleans it
df['PI_IDS'] = [x.strip() for x in df['PI_IDS']]

In [0]:
def split_data_frame_list(df, 
                       target_column,
                      output_type=float):
    ''' 
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: callable applied to every value taken out of a list
                 (values that were not inside a list are copied unchanged)
    returns: a new dataframe (with a fresh default index) in which each element
    of a list found in the target column is moved into its own row.
    The values in the other columns are duplicated across the newly divided rows.

    List elements may be wrapped in a one-element list/tuple (e.g. [["a"], ["b"]]),
    in which case the first item of the wrapper is used, or be plain values
    (e.g. ["a", "b"]), which are used as they are.
    '''
    row_accumulator = []

    for row in df.to_dict('records'):
        cell = row[target_column]
        if isinstance(cell, list):
            for element in cell:
                # Unwrap only real containers; indexing a plain string with [0]
                # would silently keep just its first character.
                value = element[0] if isinstance(element, (list, tuple)) else element
                new_row = dict(row)
                new_row[target_column] = output_type(value)
                row_accumulator.append(new_row)
        else:
            row_accumulator.append(row)

    # Passing the columns keeps the original layout even when no rows were produced.
    return pd.DataFrame(row_accumulator, columns=df.columns)

In [0]:
# Give every entry of a multi-name 'cleaned_name' list its own row; single values are left as they are.
df = split_data_frame_list(df, "cleaned_name", str)

In [0]:
# Keep only the first row for each (PMID, PROJECT_NUMBER, PI_IDS, cleaned_name) combination.
dedupe_columns = ['PMID','PROJECT_NUMBER','PI_IDS','cleaned_name']
df = df.drop_duplicates(subset=dedupe_columns)

In [374]:
#test example that I used to prove effectiveness
df.loc[df["PMID"] == 25747927]

Unnamed: 0,PMID,PROJECT_NUMBER,FY,PI_IDS,PI_NAMEs,cleaned_name
948400,25747927,K05DA017009,2009,1871862;,"WOODY, GEORGE EDWARD;","Woody, GE"
1419220,25747927,U10DA013043,2015,1871862;,"WOODY, GEORGE EDWARD;","Woody, GE"
2043325,25747927,R01DA027633,2009,6572943;,"RAWSON, RICHARD A;","Rawson, RA"
2043337,25747927,D43TW009102,2015,6572943;,"RAWSON, RICHARD A;","Rawson, RA"
2056218,25747927,R01DA032733,2015,8253026; 2152680 (contact); 6572943;,"GLASNER-EDWARDS, SUZETTE V; KARNO, MITCHELL P ...","Glasner-edwards, SV"
2056219,25747927,R01DA032733,2015,8253026; 2152680 (contact); 6572943;,"GLASNER-EDWARDS, SUZETTE V; KARNO, MITCHELL P ...","Rawson, RA"


# UPLOAD TO S3 NOT WORKING

In [375]:
# Write the cleaned table to a local CSV and hand the open file to the project's S3 helper.
df.to_csv("./NIH_postcleaning.csv")
# The context manager closes the file even when the upload raises.
with open("./NIH_postcleaning.csv", 'r+', encoding='utf-8') as file:
    s3_functions.upload_to_s3(file=file,key = 'NIH_postcleaning.csv')

NameError: ignored

In [0]:

import boto
def upload_to_s3(aws_access_key_id, aws_secret_access_key, file, bucket, key, callback=None, md5=None, reduced_redundancy=False, content_type=None):
    """
    Uploads the given file to the AWS S3
    bucket and key specified.
    
    aws_access_key_id / aws_secret_access_key: credentials passed to boto.connect_s3.
    file: an open file object; it is rewound to position 0 before returning.
    bucket: name of the target bucket (looked up with validate=True).
    key: name under which the object is stored in the bucket.
    content_type: optional value stored as the 'Content-Type' metadata.
    md5, reduced_redundancy: forwarded unchanged to set_contents_from_file.
    
    callback is a function of the form:
    
    def callback(complete, total)
    
    The callback should accept two integer parameters,
    the first representing the number of bytes that
    have been successfully transmitted to S3 and the
    second representing the size of the to be transmitted
    object.
    
    Returns boolean indicating success/failure of upload
    (True when the number of bytes reported as sent equals the file size).
    """
    # Key lives in a boto submodule that `import boto` alone does not expose
    # under the bare name used below.
    from boto.s3.key import Key

    try:
        size = os.fstat(file.fileno()).st_size
    except (AttributeError, OSError):
        # Not all file objects implement fileno(),
        # so we fall back on this
        file.seek(0, os.SEEK_END)
        size = file.tell()
    
    conn = boto.connect_s3(aws_access_key_id, aws_secret_access_key)
    bucket = conn.get_bucket(bucket, validate=True)
    k = Key(bucket)
    k.key = key
    if content_type:
        k.set_metadata('Content-Type', content_type)
    sent = k.set_contents_from_file(file, cb=callback, md5=md5, reduced_redundancy=reduced_redundancy, rewind=True)
    
    # Rewind for later use
    file.seek(0)
    
    return sent == size

In [380]:
# Credentials must never be stored in the notebook: they are read from the
# environment, with an interactive prompt as fallback.
# SECURITY: the key pair that used to be hard-coded in this cell has been exposed
# through the notebook and must be treated as compromised (revoke and rotate it).
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID') or getpass('AWS access key id: ')
AWS_ACCESS_SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY') or getpass('AWS secret access key: ')

bucket = 'ayp-data'

# The context manager closes the file even when the upload raises.
with open('./NIH_postcleaning.csv', 'r+') as file:
    # NOTE(review): file.name keeps the leading './' in the object key, unlike the
    # 'NIH_postcleaning.csv' key used with s3_functions; confirm which one readers expect.
    key = file.name
    uploaded = upload_to_s3(AWS_ACCESS_KEY, AWS_ACCESS_SECRET_KEY, file, bucket, key)

if uploaded:
    print('It worked!')
else:
    print ('The upload failed...')


It worked!
