### Importing Necessary Libraries and Modules

In [1]:
import pandas as pd
import warnings
from bs4 import MarkupResemblesLocatorWarning, BeautifulSoup
from sklearn.exceptions import ConvergenceWarning
import re

### Suppressing MarkupResemblesLocatorWarning, FutureWarning and ConvergenceWarning

In [2]:
# Ignore three warning categories for the rest of the kernel session so they
# do not clutter the cell outputs below.
warnings.filterwarnings(action='ignore', category=MarkupResemblesLocatorWarning)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

### Importing Text Processing Library NLTK

In [3]:
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Data Preprocessing

##### Reading the datasets

In [4]:
# Explicit per-column dtypes handed to the TSV parser for both files.
dtypes = {
    'Unnamed: 0': 'int32',
    'drugName': 'category',
    'condition': 'category',
    'review': 'category',
    'rating': 'float16',
    'date': 'category',
    'usefulCount': 'int16',
}

# NOTE(review): both paths are absolute and specific to one machine; the
# notebook will not run elsewhere until they are adjusted.
train_df = pd.read_csv(
    'C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets\\drugsComTrain_raw.tsv',
    sep='\t',
    dtype=dtypes,
)
test_df = pd.read_csv(
    "C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets\\drugsComTest_raw.tsv",
    sep='\t',
    dtype=dtypes,
)

##### Decoding HTML-encoded characters and removing special characters

In [5]:
def decode_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.get_text()

def remove_special_characters(text):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with all other characters (punctuation, symbols, accented
        letters, ...) deleted. Whitespace is kept untouched.
    """
    # Raw string: in a normal literal '\s' is an invalid escape sequence, which
    # Python reports with a warning and plans to turn into an error.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Clean the review text of each frame in two chained passes: first reduce the
# markup to plain text, then strip everything except letters, digits and
# whitespace.
train_df['review'] = train_df['review'].apply(decode_html).apply(remove_special_characters)
test_df['review'] = test_df['review'].apply(decode_html).apply(remove_special_characters)

In [6]:
# Show the first rows of the training frame to inspect the cleaned review text.
train_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,"November 27, 2016",37


In [7]:
# Show the first rows of the test frame to inspect the cleaned review text.
test_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,Ive tried a few antidepressants over the years...,10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance",My son has Crohns disease and has done very we...,8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,Quick reduction of symptoms,9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,Contrave combines drugs that were used for alc...,9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,I have been on this birth control for one cycl...,9.0,"October 22, 2015",4


##### Dropping unnamed and date columns from both datasets

In [8]:
# Remove the 'Unnamed: 0' and 'date' columns from both frames, mutating each
# frame in place.
train_df.drop(columns=['Unnamed: 0', 'date'], inplace=True)
test_df.drop(columns=['Unnamed: 0', 'date'], inplace=True)

In [9]:
# Confirm that 'Unnamed: 0' and 'date' are gone from the training frame.
train_df.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount
0,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,27
1,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,192
2,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,17
3,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,10
4,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,37


In [10]:
# Confirm that 'Unnamed: 0' and 'date' are gone from the test frame.
test_df.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount
0,Mirtazapine,Depression,Ive tried a few antidepressants over the years...,10.0,22
1,Mesalamine,"Crohn's Disease, Maintenance",My son has Crohns disease and has done very we...,8.0,17
2,Bactrim,Urinary Tract Infection,Quick reduction of symptoms,9.0,3
3,Contrave,Weight Loss,Contrave combines drugs that were used for alc...,9.0,35
4,Cyclafem 1 / 35,Birth Control,I have been on this birth control for one cycl...,9.0,4


##### Adding four new sentiment-analysis columns (pos, neg, neu, comp), i.e., the Positive, Negative, Neutral and Compound scores respectively

In [11]:
# Create the four sentiment columns in both frames, filled with the float
# placeholder 0.0 in every row.
for _column in ('pos', 'neg', 'neu', 'comp'):
    train_df[_column] = 0.0
    test_df[_column] = 0.0

In [12]:
# Check that the four placeholder sentiment columns exist in the training frame.
train_df.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount,pos,neg,neu,comp
0,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,27,0.0,0.0,0.0,0.0
1,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,192,0.0,0.0,0.0,0.0
2,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,17,0.0,0.0,0.0,0.0
3,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,10,0.0,0.0,0.0,0.0
4,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,37,0.0,0.0,0.0,0.0


In [13]:
# Check that the four placeholder sentiment columns exist in the test frame.
test_df.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount,pos,neg,neu,comp
0,Mirtazapine,Depression,Ive tried a few antidepressants over the years...,10.0,22,0.0,0.0,0.0,0.0
1,Mesalamine,"Crohn's Disease, Maintenance",My son has Crohns disease and has done very we...,8.0,17,0.0,0.0,0.0,0.0
2,Bactrim,Urinary Tract Infection,Quick reduction of symptoms,9.0,3,0.0,0.0,0.0,0.0
3,Contrave,Weight Loss,Contrave combines drugs that were used for alc...,9.0,35,0.0,0.0,0.0,0.0
4,Cyclafem 1 / 35,Birth Control,I have been on this birth control for one cycl...,9.0,4,0.0,0.0,0.0,0.0


##### Sentiment Analysis using 'nltk' library

In [14]:
# Module-level NLP helpers, built once and reused by the functions below.
stop_words = set(stopwords.words('english'))  # English stop words as a set for fast membership tests
lemmatizer = WordNetLemmatizer()
analyzer = SentimentIntensityAnalyzer()
# NOTE(review): `spell` is not used by any live code visible in this part of
# the notebook; confirm whether it is still needed.
spell = SpellChecker()

def preprocess_text(text):
    """Normalise a review string before it is scored for sentiment.

    Steps, in order: lowercase the text, delete anything that looks like an
    HTML tag, delete tokens starting with ``http``, delete the remaining
    punctuation, tokenize, drop English stop words, lemmatize each token and
    join the result with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, space-separated lemmas.
    """
    text = text.lower()

    # Tags and URLs are removed BEFORE punctuation: once '<', '>', ':' and '/'
    # have been stripped, the tag pattern can no longer match at all and URLs
    # are only caught by accident.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

def classify_sentiment(text):
    """Preprocess ``text`` and return its sentiment scores.

    The text is passed through ``preprocess_text`` and then scored with the
    module-level ``analyzer``.

    Returns
    -------
    tuple
        ``(pos, neg, neu, compound)`` taken from the analyzer's score mapping.
    """
    scores = analyzer.polarity_scores(preprocess_text(text))
    return scores['pos'], scores['neg'], scores['neu'], scores['compound']

SENTIMENT_COLUMNS = ['pos', 'neg', 'neu', 'comp']


def add_sentiment_columns(frame, label, progress_every=1000):
    """Score every review in ``frame`` and replace the text with its scores.

    Each value of ``frame['review']`` is passed to ``classify_sentiment``; the
    four resulting scores are written to the 'pos', 'neg', 'neu' and 'comp'
    columns, after which the 'review' column is dropped. ``frame`` is modified
    in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a 'review' column of text.
    label : str
        Name used in the progress and completion messages.
    progress_every : int, default 1000
        Print a progress line every this many rows (row position, which equals
        the index label for a default RangeIndex). Pass 0 to silence progress.
    """
    # Collect all scores first and assign whole columns once, instead of
    # iterating with iterrows() and writing cell by cell through `.at`.
    scores = []
    for position, review in enumerate(frame['review']):
        scores.append(classify_sentiment(review))
        if progress_every and position % progress_every == 0:
            print(label + ": ", position)

    score_frame = pd.DataFrame(scores, columns=SENTIMENT_COLUMNS, dtype=float)
    for column in SENTIMENT_COLUMNS:
        # Positional assignment: independent of the frame's index labels.
        frame[column] = score_frame[column].to_numpy()

    print(label + " completed!")

    frame.drop(['review'], axis=1, inplace=True)


add_sentiment_columns(train_df, "train_df")
add_sentiment_columns(test_df, "test_df")

train_df:  0
train_df:  1000
train_df:  2000
train_df:  3000
train_df:  4000
train_df:  5000
train_df:  6000
train_df:  7000
train_df:  8000
train_df:  9000
train_df:  10000
train_df:  11000
train_df:  12000
train_df:  13000
train_df:  14000
train_df:  15000
train_df:  16000
train_df:  17000
train_df:  18000
train_df:  19000
train_df:  20000
train_df:  21000
train_df:  22000
train_df:  23000
train_df:  24000
train_df:  25000
train_df:  26000
train_df:  27000
train_df:  28000
train_df:  29000
train_df:  30000
train_df:  31000
train_df:  32000
train_df:  33000
train_df:  34000
train_df:  35000
train_df:  36000
train_df:  37000
train_df:  38000
train_df:  39000
train_df:  40000
train_df:  41000
train_df:  42000
train_df:  43000
train_df:  44000
train_df:  45000
train_df:  46000
train_df:  47000
train_df:  48000
train_df:  49000
train_df:  50000
train_df:  51000
train_df:  52000
train_df:  53000
train_df:  54000
train_df:  55000
train_df:  56000
train_df:  57000
train_df:  58000
train_df: 

In [15]:
# Inspect the computed sentiment scores in the training frame.
train_df.head()

Unnamed: 0,drugName,condition,rating,usefulCount,pos,neg,neu,comp
0,Valsartan,Left Ventricular Dysfunction,9.0,27,0.0,0.0,1.0,0.0
1,Guanfacine,ADHD,8.0,192,0.215,0.035,0.75,0.9169
2,Lybrel,Birth Control,5.0,17,0.133,0.039,0.828,0.8176
3,Ortho Evra,Birth Control,8.0,10,0.161,0.113,0.726,0.5267
4,Buprenorphine / naloxone,Opiate Dependence,9.0,37,0.291,0.105,0.604,0.9403


In [16]:
# Inspect the computed sentiment scores in the test frame.
test_df.head()

Unnamed: 0,drugName,condition,rating,usefulCount,pos,neg,neu,comp
0,Mirtazapine,Depression,10.0,22,0.281,0.159,0.56,0.7623
1,Mesalamine,"Crohn's Disease, Maintenance",8.0,17,0.193,0.073,0.733,0.5574
2,Bactrim,Urinary Tract Infection,9.0,3,0.0,0.0,1.0,0.0
3,Contrave,Weight Loss,9.0,35,0.177,0.099,0.723,0.6548
4,Cyclafem 1 / 35,Birth Control,9.0,4,0.25,0.027,0.723,0.9529


##### Label encoding the drugName and condition columns (each distinct value is mapped to an integer index)

In [17]:
# Build one shared mapping from drug name to integer code. Codes are handed out
# in order of first appearance: all training names first, then any names that
# occur only in the test frame.
drugNametoIndex = {}
for _frame in (train_df, test_df):
    for drugName in _frame['drugName'].unique():
        if drugName not in drugNametoIndex:
            # The next free code is simply the current size of the mapping.
            drugNametoIndex[drugName] = len(drugNametoIndex)

# Replace the names with their integer codes in both frames.
train_df['drugName'] = train_df['drugName'].apply(lambda name: drugNametoIndex[name])
test_df['drugName'] = test_df['drugName'].apply(lambda name: drugNametoIndex[name])

In [18]:
# Build one shared mapping from condition to integer code. Codes are handed out
# in order of first appearance: all training conditions first, then any
# conditions that occur only in the test frame.
condition_to_int = {}
for _frame in (train_df, test_df):
    for condition in _frame['condition'].unique():
        if condition not in condition_to_int:
            # The next free code is simply the current size of the mapping.
            condition_to_int[condition] = len(condition_to_int)

# Replace the conditions with their integer codes in both frames.
train_df['condition'] = train_df['condition'].apply(lambda value: condition_to_int[value])
test_df['condition'] = test_df['condition'].apply(lambda value: condition_to_int[value])

In [19]:
# Inspect the training frame after drugName and condition were replaced by integer codes.
train_df.head()

Unnamed: 0,drugName,condition,rating,usefulCount,pos,neg,neu,comp
0,0,0,9.0,27,0.0,0.0,1.0,0.0
1,1,1,8.0,192,0.215,0.035,0.75,0.9169
2,2,2,5.0,17,0.133,0.039,0.828,0.8176
3,3,2,8.0,10,0.161,0.113,0.726,0.5267
4,4,3,9.0,37,0.291,0.105,0.604,0.9403


In [20]:
# Inspect the test frame after drugName and condition were replaced by integer codes.
test_df.head()

Unnamed: 0,drugName,condition,rating,usefulCount,pos,neg,neu,comp
0,91,9,10.0,22,0.281,0.159,0.56,0.7623
1,425,240,8.0,17,0.193,0.073,0.733,0.5574
2,351,13,9.0,3,0.0,0.0,1.0,0.0
3,39,74,9.0,35,0.177,0.099,0.723,0.6548
4,1339,2,9.0,4,0.25,0.027,0.723,0.9529


##### Saving the modified DataFrames to CSV files

In [21]:
# Write both processed frames to CSV without the index column.
# NOTE(review): the target paths are absolute and specific to one machine.
train_df.to_csv(
    'C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets_generated\\train_df.csv',
    index=False,
)
test_df.to_csv(
    'C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets_generated\\test_df.csv',
    index=False,
)

##### Opening Saved CSV files

In [22]:
# Load the processed frames from the generated CSV files, rebinding `train_df`
# and `test_df` to the freshly parsed data (dtypes are inferred by the parser).
train_df = pd.read_csv(
    'C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets_generated\\train_df.csv'
)
test_df = pd.read_csv(
    'C:\\Users\\harsh\\Desktop\\ML Projects\\GSSOC\\datasets_generated\\test_df.csv'
)