In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from eli5.sklearn import InvertableHashingVectorizer
from sklearn.decomposition import PCA

import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

%matplotlib inline



In [47]:
# Load the training reviews CSV into a DataFrame.
df_train = pd.read_csv(filepath_or_buffer='../datasets/df_train_cleaned_shortened.csv')
# Display a random draw of 10,000 rows; the result is not assigned, so df_train is unchanged.
df_train.sample(10000)

Unnamed: 0,uniqueid,drugName,condition,review,rating,date,usefulCount,text length,sentiment_rate,month,year
33780,228941,Tadalafil,Erectile Dysfunction,work well headache side effect,9.0,2016-06-20,4,35,1,6,2016
47801,77683,Cephalexin,Bladder Infection,good but take long duration treatment such day...,9.0,2009-07-29,63,73,1,7,2009
32026,156672,Dulaglutide,"Diabetes, Type 2",started trulicity one month ago daily blood le...,8.0,2016-10-18,6,384,1,10,2016
47455,146279,Suboxone,Opiate Dependence,have severe chronic pain and have had everythi...,2.0,2009-07-16,35,417,0,7,2009
27331,231662,Trazodone,ibromyalgia,have been trazodone for over year doctor precr...,10.0,2015-09-17,43,570,1,9,2015
...,...,...,...,...,...,...,...,...,...,...,...
44898,155491,Metronidazole,Bacterial Vaginitis,first time had wa giving metronidazole take tw...,8.0,2015-03-28,8,422,1,3,2015
27937,216222,Copper,Birth Control,got year copper iud and have never given birth...,9.0,2015-05-14,8,770,1,5,2015
3758,101324,Aubra,Birth Control,received aubra from planned parenthood and fin...,1.0,2016-12-11,1,430,0,12,2016
45589,84998,Ethinyl estradiol / norgestimate,Birth Control,there minus only writing review remind myself ...,1.0,2016-01-21,6,529,0,1,2016


In [48]:
# Pull the columns used downstream out of the DataFrame as plain Python lists
# (note: the 'review' column is stored under the plural name `reviews`).
drugName, condition, rating, reviews = (
    df_train[column_name].tolist()
    for column_name in ('drugName', 'condition', 'rating', 'review')
)

In [49]:
from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer; tokenize_and_stem() reads this module-level object.
stemmer = SnowballStemmer(language="english")

In [50]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its word-like tokens.

    The text is split into sentences and then into words (so punctuation ends
    up as its own token), tokens without any ASCII letter are discarded
    (e.g. numeric tokens, raw punctuation), and each remaining token is
    stemmed with the module-level ``stemmer``.

    Returns:
        list: the stems, in the order the tokens appear in ``text``.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word-like tokens without stemming.

    The text is split into sentences and then into words (so punctuation ends
    up as its own token); every token is lower-cased and tokens without any
    ASCII letter are discarded (e.g. numeric tokens, raw punctuation).

    Returns:
        list: the lower-cased tokens, in the order they appear in ``text``.
    """
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            if re.search('[a-zA-Z]', lowered):
                kept_tokens.append(lowered)
    return kept_tokens

In [None]:
# Build two parallel vocabularies over all reviews: the stemmed tokens and the
# plain lower-cased tokens. Both tokenizers apply the same letter filter, so
# position k in one list corresponds to position k in the other.
totalvocab_stemmed = []
totalvocab_tokenized = []
for review_text in reviews:
    totalvocab_stemmed += tokenize_and_stem(review_text)   # stems of this review
    totalvocab_tokenized += tokenize_only(review_text)     # matching unstemmed tokens

In [None]:
# Lookup table from stem (index) to an original token (column 'words')
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print('there are {} items in vocab_frame'.format(len(vocab_frame)))

In [None]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, max_features=4000,
                                 min_df=4, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,2))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(reviews) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

In [None]:
# Feature names (stemmed unigrams/bigrams) in TF-IDF column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between the rows of the TF-IDF matrix, computed as
# 1 - cosine similarity. The result has one row and one column per review.
# NOTE(review): dist is not used in the cells shown here — presumably it feeds
# the hierarchical clustering section; confirm before removing.
dist = 1 - cosine_similarity(tfidf_matrix)

## K-Means

In [None]:
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(features)

clusters = km.labels_.tolist()

In [None]:
import joblib

# Save the fitted K-Means model to 'doc_cluster.pkl', then load it straight back.
# To reuse a previously saved model instead of overwriting it, comment out the
# joblib.dump line and run only the load.
# NOTE(review): joblib.load unpickles the file — only load files you trust.

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [None]:
# Column data for the per-review results table; every list is aligned by review.
drugs = {
    'drugName': drugName,
    'condition': condition,
    'cluster': clusters,   # K-Means label of each review
    'rating': rating,
    'reviews': reviews,
}

In [None]:
# One row per review; the cluster labels are used as the row index as well as a column.
frame = pd.DataFrame(
    data=drugs,
    index=[clusters],
    columns=['drugName', 'condition', 'reviews', 'cluster', 'rating'],
)

In [None]:
# Number of reviews assigned to each cluster, largest cluster first
frame['cluster'].value_counts()

In [None]:

grouped = frame['rating'].groupby(frame['cluster']) # group the review ratings by their assigned cluster

grouped.mean() # mean rating per cluster

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# For each cluster, the TF-IDF term indices ordered from highest to lowest centroid weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        # A term is one or two space-separated stems. vocab_frame is indexed by stem,
        # so this is a label lookup (.loc, not .iloc); show the first raw token found.
        # The value is printed as text: encoding it first would print b'...' on Python 3.
        term_stems = terms[ind].split(' ')
        print(' %s' % vocab_frame.loc[term_stems].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d condition:" % i, end='')
    # Select the reviews assigned to cluster i by their 'cluster' value (frame has no
    # 'title' column, and .iloc[i] would pick the i-th row, not the i-th cluster).
    # Only the 10 most frequent conditions are listed, since printing one entry per
    # review would repeat the same conditions many times over.
    in_cluster = frame['cluster'].values == i
    for condition_name in frame['condition'][in_cluster].value_counts().index[:10]:
        print(' %s,' % condition_name, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

## Hierarchical Clustering