In [None]:
import json
import nltk
import pandas as pd

from pymetamap import MetaMap

In [None]:
# Read the complete list of semantic types from the pipe-delimited text file
# and build a lookup keyed by the 'shortSem' column with 'LongSem' as value.
semTypesList = pd.read_csv('SemanticTypes_2013AA.txt', sep='|')
semList = semTypesList.set_index('shortSem')['LongSem'].to_dict()

In [None]:
# Text Normalization Functions
# Tokenization

def identify_tokens(row):
    """Tokenize a row's review text and keep only purely alphabetic tokens.

    ``row['review']`` is passed through ``str()`` before tokenizing, so a
    non-string value is tokenized via its string form. Tokens that contain
    anything other than letters (punctuation, digits) are dropped.
    """
    text = str(row['review'])
    return list(filter(str.isalpha, nltk.word_tokenize(text)))


# Stemming

from nltk.stem import PorterStemmer
stemming = PorterStemmer()

def stem_list(row):
    """Return the Porter stem of every token in ``row['words']``, in order."""
    return list(map(stemming.stem, row['words']))


# Remove Stop Words

from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

def remove_stops(row):
    """Return the tokens of ``row['words']`` that are not English stop words."""
    kept = []
    for token in row['words']:
        if token not in stops:
            kept.append(token)
    return kept


# Re-join words

def rejoin_words(row):
    """Join the tokens in ``row['stem_meaningful']`` into one space-separated string."""
    return " ".join(row['stem_meaningful'])

# Stop Words
# NOTE(review): `stopwords` was already imported earlier in this cell, and
# `stop` holds the same English stop-word list that `stops` wraps in a set.
# Nothing in the cells visible here reads `stop` — confirm before removing.

from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
# Read data
# NOTE(review): absolute, machine-specific path — this cell only runs where
# the workbook exists at exactly this location.

file = '/Users/sammie/JMIR_p1/All_Data_Drugs.com&WebMD_Methadone&Suboxone.xlsx'

# Load the workbook into a DataFrame; later cells read its 'review' column.
raw_df = pd.read_excel(file)

In [None]:
# Sanity check: number of rows loaded, then the review text of the first row.
print(raw_df.shape[0])
raw_df['review'].iloc[0]

In [None]:
# Obtain the MetaMap wrapper for the metamap18 binary at this location.
# NOTE(review): absolute, machine-specific path.
mm = MetaMap.get_instance('/Users/sammie/JMIR_p1/public_mm/bin/metamap18')

In [None]:
# Normalizing reviews
#
# Lower-case the review text in place, then derive each new column from the
# rows of the frame, in this order: 'words' (alphabetic tokens),
# 'stem_meaningful' (tokens without stop words), 'processed' (tokens re-joined
# into one string). Stemming (stem_list) is not applied here.

raw_df['review'] = raw_df['review'].str.lower()

for target_column, row_function in (
    ('words', identify_tokens),
    ('stem_meaningful', remove_stops),
    ('processed', rejoin_words),
):
    raw_df[target_column] = raw_df.apply(row_function, axis=1)


# Replace every character outside string.printable with a space

from string import printable

st = set(printable)


def _blank_non_printable(text):
    """Return ``text`` with each character not in ``st`` replaced by a space."""
    return ''.join(ch if ch in st else " " for ch in text)


raw_df["processed"] = raw_df["processed"].apply(_blank_non_printable)

In [None]:
# Display the frame to inspect the columns added by the normalization cell.
raw_df

In [None]:
# Empty result tables sharing one column layout. The extraction loop fills
# df1 with 'mobd' candidates, df2 with 'dsyn' and df3 with 'orch' / 'clnd'.
result_columns = [
    'Normalized Text',
    'Identified Phrase',
    'Mapped Standard Phrase',
    'Semantic Types',
    'Short Semantic Types',
]
df1, df2, df3 = (pd.DataFrame(columns=result_columns) for _ in range(3))

In [None]:
# Run MetaMap on every normalized review and record, per semantic-type group,
# the first mapping candidate found for each distinct phrase text.
#
# Rows are buffered in lists and added to df1/df2/df3 in one step:
# DataFrame.append no longer exists in pandas 2.0+, and growing a frame one
# row at a time re-copies the frame on every append.


def _iter_mapping_candidates(metamap_json):
    """Yield ``(phrase_text, candidate)`` for every mapping candidate.

    Walks the nesting AllDocuments -> Document -> Utterances -> Phrases ->
    Mappings -> MappingCandidates of the parsed MetaMap JSON output.
    """
    for document in metamap_json['AllDocuments']:
        for utterance in document['Document']['Utterances']:
            for phrase in utterance['Phrases']:
                for mapping in phrase['Mappings']:
                    for candidate in mapping['MappingCandidates']:
                        yield phrase['PhraseText'], candidate


def _candidate_row(phrase_text, candidate):
    """Build one result row, keyed by the result-frame column names."""
    sem_types = candidate['SemTypes']
    return {
        'Normalized Text': phrase_text,
        'Short Semantic Types': sem_types,
        'Identified Phrase': candidate['CandidateMatched'],
        # Fall back to the short code when the semantic-types table has no
        # entry for it, rather than aborting the whole run on a KeyError.
        'Semantic Types': ', '.join([semList.get(code, code) for code in sem_types]),
        'Mapped Standard Phrase': candidate['CandidatePreferred'],
    }


def _with_new_rows(frame, rows):
    """Return ``frame`` followed by ``rows``, keeping the frame's column order."""
    if not rows:
        return frame
    addition = pd.DataFrame(rows, columns=frame.columns)
    if frame.empty:
        return addition
    return pd.concat([frame, addition], ignore_index=True)


# One entry per result frame: (semantic-type codes that select a candidate,
# buffered new rows, phrase texts already recorded). Seeding the "seen" sets
# from the existing frames means re-running this cell adds no duplicates.
# To collect another semantic type, add an entry here and a frame for it.
_groups = [
    (('mobd',), [], set(df1['Normalized Text'])),
    (('dsyn',), [], set(df2['Normalized Text'])),
    (('orch', 'clnd'), [], set(df3['Normalized Text'])),
]

try:
    for indx, review in enumerate(raw_df['processed']):
        print(indx)
        concepts, error = mm.extract_mapping_candidates([review], mm_data_version='USAbase')
        if error is not None:
            print('error in', indx)
            continue

        # The output may carry text before the JSON document; locate its start.
        j = concepts.find('{"AllDocuments"')
        if j < 0:
            # find() returns -1 when the marker is absent; slicing from -1
            # would hand json.loads a one-character fragment and crash.
            print('no MetaMap JSON output for', indx)
            continue
        data = json.loads(concepts[j:])

        for phrase_text, candidate in _iter_mapping_candidates(data):
            semTypes = candidate['SemTypes']
            for codes, rows, seen in _groups:
                if phrase_text in seen:
                    continue
                if not any(code in semTypes for code in codes):
                    continue
                rows.append(_candidate_row(phrase_text, candidate))
                seen.add(phrase_text)
finally:
    # Flush in a finally block so an interrupted or failed run still keeps the
    # rows gathered up to that point.
    df1 = _with_new_rows(df1, _groups[0][1])
    df2 = _with_new_rows(df2, _groups[1][1])
    df3 = _with_new_rows(df3, _groups[2][1])

In [None]:
# Write each result table to its own CSV file.
for result_frame, csv_path in (
    (df1, '/Users/sammie/JMIR_p1/MetaMap_Results/result_mobd.csv'),
    (df2, '/Users/sammie/JMIR_p1/MetaMap_Results/result_dsyn.csv'),
    (df3, '/Users/sammie/JMIR_p1/MetaMap_Results/result_orch_clnd.csv'),
):
    result_frame.to_csv(csv_path)