In [1]:
import os
os.environ["JAVA_HOME"] = "C:\Program Files\Java\jdk1.8.0_271"
os.environ["HADOOP_HOME"] = "C:\Installations\Hadoop"
os.environ["SPARK_HOME"] = "D:\spark-2.4.5-bin-hadoop2.7\spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "10g") \
    .appName("CloudETLProject") \
    .getOrCreate()

In [7]:
import pandas as pd
pd_test = pd.read_csv('../cleaned_nlp_data/testfinal.csv', sep=',')
pd_train = pd.read_csv('../cleaned_nlp_data/trainfinal.csv', sep=',')

In [8]:
mySchema = StructType([ StructField("uniqueID", StringType(), True)\
                       ,StructField("drugName", StringType(), True)\
                       ,StructField("condition", StringType(), True)\
                       ,StructField("review", StringType(), True)\
                       ,StructField("rating", IntegerType(), True)\
                       ,StructField("date", StringType(), True)\
                       ,StructField("usefulCount", StringType(), True)\
                     ])

In [9]:
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

df_test = sqlContext.createDataFrame(pd_test, schema=mySchema)
df_train = sqlContext.createDataFrame(pd_train, schema=mySchema)

In [18]:
drop_df = df_test.drop('uniqueID','drugName','condition', 'date', 'usefulCount' ).collect()
drop_df_two = df_train.drop('uniqueID','drugName','condition', 'date', 'usefulCount' ).collect()

In [19]:
test_df = spark.createDataFrame(drop_df)
test_df.show()

+--------------------+------+
|              review|rating|
+--------------------+------+
|gave me rapid hea...|     0|
|    it cured my mrsa|     1|
|i have been on zy...|     1|
|it didnt work as ...|     1|
|i have had  major...|     1|
|i had mrsa inf la...|     1|
|i got a mrsa stap...|     1|
|very satisfied wi...|     1|
|effectiveness las...|     0|
|my psa was going ...|     1|
|on zytiga for  mo...|     1|
|began zytiga with...|     1|
|had tried clariti...|     1|
|this medicine wor...|     1|
|i have had cholin...|     1|
|after travelling ...|     1|
|i suffered from m...|     1|
|i recently had te...|     1|
|it works great fo...|     1|
|had hives nearly ...|     1|
+--------------------+------+
only showing top 20 rows



In [20]:
train_df = spark.createDataFrame(drop_df_two)
train_df.show()

+--------------------+------+
|              review|rating|
+--------------------+------+
|it has no side ef...|     1|
|my son is halfway...|     1|
|i used to take an...|     0|
|this is my first ...|     1|
|suboxone has comp...|     1|
|nd day on mg star...|     0|
|he pulled out but...|     0|
|abilify changed m...|     1|
| i ve had  nothin...|     0|
|i had been on the...|     1|
|i have been on th...|     1|
|i have taken anti...|     1|
|i had crohns with...|     0|
|have a little bit...|     0|
|started nexplanon...|     0|
|i have been takin...|     1|
|this drug worked ...|     1|
|ive been taking a...|     1|
|ive been on every...|     1|
|i have been on ta...|     1|
+--------------------+------+
only showing top 20 rows



In [21]:
def build_trigrams(inputCol=["review","rating"], n=3):
    tokenizer = [Tokenizer(inputCol="review", outputCol="words")]

# Get rid of stop words
    stopremove = [StopWordsRemover(inputCol='words',outputCol='stop_tokens')]

# Stem the words

# Creates a column for every word, two and three words. n=3
    ngrams = [
        NGram(n=i, inputCol="stop_tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
# Min term frequency = how many times does it occur in review
# df - times drug occurs in document 
    cv = [
        CountVectorizer(vocabSize=2**15,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]
# cv and idf act as a 
    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
# stringindexer 
    label_stringIdx = [StringIndexer(inputCol = "rating", outputCol = "label")]
    # selector = [ChiSqSelector(numTopFeatures=50,featuresCol='rawFeatures', outputCol="features")]
    nb = [NaiveBayes(smoothing=1)]
    return Pipeline(stages=tokenizer + stopremove + ngrams + cv + idf + assembler + label_stringIdx + nb)

In [22]:
# run the model 
trigram_pipelineFit = build_trigrams().fit(train_df)
test_results = trigram_pipelineFit.transform(test_df)

In [23]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()

accuracy = test_results.filter(test_results.label == test_results.prediction).count() / float(test_results.count())
# Apply a confusion matrix for TP, TN and FN. 
roc_auc = evaluator.evaluate(test_results)
print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8300
ROC-AUC: 0.4123


In [24]:
trigram_pipelineFit.save("binary_naive_bayes")