In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os

import nltk
import regex as re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import string
import spacy

In [4]:
# Locations of the tab-separated train / test files.
path = '../BDS_project'
train_path = f'{path}/drug_train.tsv'
test_path = f'{path}/drug_test.tsv'

# Read each split and drop its 'Unnamed: 0' column.
data_train = pd.read_csv(train_path, sep='\t').drop(columns=['Unnamed: 0'])
data_test = pd.read_csv(test_path, sep='\t').drop(columns=['Unnamed: 0'])

# Stack both splits into one frame with a fresh 0..n-1 index.
data = pd.concat([data_train, data_test], ignore_index=True)

# Binary label: 1 when the rating is above 5, otherwise 0.
data['review_sentiment'] = (data['rating'] > 5).astype('int64')

In [None]:
# From Perfume Recommendation and Health Recommendation notebooks

# (pattern, replacement) pairs, applied in this order. The two irregular forms
# come first so the generic "n't" rule cannot turn them into "wo not" /
# "ca not". Matching ignores case so that sentence-initial "Won't" / "Can't"
# (and upper-case text) are expanded as well.
_CONTRACTION_RULES = [
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
]

# Every character listed here is replaced by a single space.
_SPACE_TABLE = str.maketrans({ch: ' ' for ch in '\n\r\t-/>"?'})


def preprocess_text(text):
    """Expand English contractions and blank out a few separator characters.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        ``text`` with contractions such as "won't", "can't", "n't", "'re",
        "'s", "'d", "'ll", "'ve" and "'m" expanded (case-insensitively; the
        inserted replacement words are lower-case), and with newline, carriage
        return, tab, '-', '/', '>', '"' and '?' each replaced by a space.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text.translate(_SPACE_TABLE)

In [None]:
import nltk

# Objects shared with nlp_preprocessing: the Snowball stemmer and the English
# stop-word set with 'no' taken out, so that 'no' survives stop-word removal.
# NOTE(review): NLTK's English list presumably still contains 'not', in which
# case the " not" produced by preprocess_text is filtered out again - confirm
# whether that is intended.
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
stop_words.remove('no')

def nlp_preprocessing(review):
    """Normalise one review into a string of stemmed, non-stop-word tokens.

    Steps: expand contractions with ``preprocess_text``, replace every
    non-letter with a space, collapse whitespace, lower-case, drop words that
    are in the module-level ``stop_words`` set and stem the remaining words
    with the module-level ``stemmer``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str or None
        The kept stems, each followed by one space (so a non-empty result ends
        with a trailing space and a review with no kept words yields "").
        ``None`` when ``review`` is not a string (for example an int or a NaN
        float) instead of failing inside the regex calls.
    """
    if not isinstance(review, str):
        return None

    review = preprocess_text(review)
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    review = re.sub(r'\s+', ' ', review)
    review = review.lower()

    stems = [stemmer.stem(word) for word in review.split() if word not in stop_words]
    # Keep the historical output format: every stem is followed by a space.
    return "".join(stem + " " for stem in stems)

In [None]:
# Stemmed, stop-word-free version of every review.
data['cleaned_review'] = data['review'].apply(nlp_preprocessing)

# Lower-case the two text label columns. The .str accessor passes missing
# values through as NaN, whereas calling x.lower() on a NaN float raises
# AttributeError - and rows with missing values are only dropped in a later cell.
data['drugName'] = data['drugName'].str.lower()
data['condition'] = data['condition'].str.lower()

In [None]:
sid = SentimentIntensityAnalyzer()


def _compound(text):
    """Return the 'compound' entry of the VADER polarity scores for ``text``."""
    return sid.polarity_scores(text)['compound']


# Compound sentiment of the raw review and of its cleaned counterpart.
data['sentiment_score'] = list(map(_compound, data['review']))
data['sentiment_score_clean'] = list(map(_compound, data['cleaned_review']))

In [None]:
# Keep only rows without missing values, renumber them 0..n-1, parse the
# 'date' column into datetimes and derive the calendar year from it.
data = (
    data
    .dropna(axis=0)
    .reset_index(drop=True)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(year=lambda frame: frame['date'].dt.year)
)

In [None]:
# Full English stop-word list, used only for counting stop words in the raw
# text. It gets its own name on purpose: nlp_preprocessing reads the global
# `stop_words` (the set with 'no' removed), and rebinding that name here would
# silently change what the cleaner does if its cell is run again.
english_stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)


def _mean_word_len(text):
    """Average token length of ``text``; NaN when it contains no tokens."""
    lengths = [len(word) for word in str(text).split()]
    # Explicit guard instead of relying on np.mean([]) -> nan plus a warning.
    return np.mean(lengths) if lengths else np.nan


# Statistics of the cleaned text.
data['word_count'] = data['cleaned_review'].apply(lambda x: len(str(x).split()))
data['unique_word_count'] = data['cleaned_review'].apply(lambda x: len(set(str(x).split())))
data['char_length'] = data['cleaned_review'].apply(lambda x: len(str(x)))
data['mean_word_len'] = data['cleaned_review'].apply(_mean_word_len)

# Statistics of the raw text.
data['count_punctuations'] = data['review'].apply(
    lambda x: sum(1 for c in str(x) if c in punctuation_chars)
)
data['stopword_count'] = data['review'].apply(
    lambda x: sum(1 for w in str(x).lower().split() if w in english_stop_words)
)

In [None]:
# Load the spaCy "en_core_web_sm" pipeline once; subj_obj_count and ner both
# call this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
def subj_obj_count(review):
    """Count the distinct subject and direct-object tokens of a review.

    The text is parsed with the module-level ``nlp`` pipeline. Tokens are
    collected by their string form, so a word that occurs several times in
    the same role is counted once.

    Returns
    -------
    tuple of (int, int)
        Number of distinct token strings whose dependency label is "nsubj",
        and number of distinct token strings whose label is "dobj".
    """
    subjects, objects = set(), set()
    for token in nlp(review):
        if token.dep_ == "nsubj":
            subjects.add(str(token))
        elif token.dep_ == "dobj":
            objects.add(str(token))
    return len(subjects), len(objects)

from tqdm import tqdm

# One (subject count, object count) pair per raw review, in row order.
count = [subj_obj_count(r) for r in tqdm(data['review'])]

sub_obj = pd.DataFrame(count, columns=['subj_count', 'obj_count'])

In [None]:
# Labels of the pipeline's 'ner' component; ner() uses them to start every
# label's count at zero.
ner_lst = nlp.pipe_labels['ner']

def ner(review):
    """Tally the named entities of a review by entity label.

    The text is parsed with the module-level ``nlp`` pipeline.

    Returns
    -------
    dict
        One key per label in ``ner_lst`` (in that order); each value is the
        number of entities in the parsed document carrying that label, 0 when
        there are none.
    """
    counts = dict.fromkeys(ner_lst, 0)
    for ent in nlp(review).ents:
        counts[ent.label_] = counts[ent.label_] + 1
    return counts
# Each review's label -> count dict becomes one row of the entity frame.
# NOTE(review): entities are extracted from the stemmed, lower-cased
# 'cleaned_review' text, whereas subj_obj_count is fed the raw 'review'
# column - confirm that this difference is intentional.
entity = pd.DataFrame(list(map(ner, tqdm(data['cleaned_review']))))

In [None]:
import gensim

NUM_TOPICS = 20  # number of LDA topics == number of columns in `topics`
LDA_SEED = 42    # fixed seed so the fitted topics are the same on every run

corpus = data['cleaned_review']

# Tokenise every cleaned review into unigrams.
# The loop variable is deliberately not named `string`: that name is the
# standard-library module imported at the top of the notebook and used for
# string.punctuation, and rebinding it here breaks that cell on a re-run.
lst_corpus = [review_text.split() for review_text in tqdm(corpus)]

# Vocabulary and bag-of-words representation of each review.
id2word = gensim.corpora.Dictionary(lst_corpus)
dic_corpus = [id2word.doc2bow(tokens) for tokens in lst_corpus]

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=dic_corpus,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
    random_state=LDA_SEED,
)

# Dense topic-probability vector per review. Looking topics up by id (rather
# than by position in the returned list) keeps the columns aligned even if a
# topic is missing from the result for some document; such a topic gets 0.0.
train_vecs = []
for bow in dic_corpus:
    doc_topics = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    train_vecs.append([doc_topics.get(topic_id, 0.0) for topic_id in range(NUM_TOPICS)])
topics = pd.DataFrame(train_vecs)

In [None]:
# Put the subject/object counts, entity counts and topic probabilities next to
# the existing columns (column-wise concat, aligned on the index), then
# write the combined table to CSV without the index.
feature_frames = [data, sub_obj, entity, topics]
data = pd.concat(feature_frames, axis=1)
data.to_csv('final_new_data_processed.csv', index=False)