In [1]:
import os

# Point the Spark tooling at the local Java / Hadoop / Spark installations.
# Raw strings are used so the Windows backslashes are taken literally; in a
# normal literal, sequences such as "\P" or "\s" are invalid escapes that
# Python warns about (and will eventually reject).
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk1.8.0_271"
os.environ["HADOOP_HOME"] = r"C:\Installations\Hadoop"
os.environ["SPARK_HOME"] = r"D:\spark-2.4.5-bin-hadoop2.7\spark-2.4.5-bin-hadoop2.7"

# Make the Spark installation importable: findspark locates Spark (using the
# SPARK_HOME set above) and adds pyspark to sys.path. This does NOT start a
# SparkSession; the session is created in a later cell.
import findspark
findspark.init()

In [2]:
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's SparkSession: local mode with master
# "local[*]" and the driver memory setting requested below.
spark = (
    SparkSession.builder
    .appName("CloudETLProject")
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

In [4]:
import pandas as pd

# Read the test and train review CSVs into pandas frames (test first, to
# match the unpacking order on the left-hand side).
pd_test, pd_train = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '../cleaned_nlp_data/testfinalmultirating.csv',
        '../cleaned_nlp_data/trainfinalmultirating.csv',
    )
]

In [5]:
# Explicit Spark schema for the review frames. Every column is nullable and
# kept as a string except `rating`, which is an integer.
mySchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("uniqueID", StringType()),
        ("drugName", StringType()),
        ("condition", StringType()),
        ("review", StringType()),
        ("rating", IntegerType()),
        ("date", StringType()),
        ("usefulCount", StringType()),
    ]
])

In [6]:
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext

# Reuse the SparkSession created earlier instead of wrapping the context in
# the legacy SQLContext entry point (superseded by SparkSession since Spark
# 2.0); the later cells already call `spark.createDataFrame`.
sc = spark.sparkContext

# Convert the pandas frames to Spark DataFrames, applying the explicit schema.
df_test = spark.createDataFrame(pd_test, schema=mySchema)
df_train = spark.createDataFrame(pd_train, schema=mySchema)

In [7]:
# Columns that are not used for modelling; after dropping them only `review`
# and `rating` remain. collect() then returns the rows as a Python list on
# the driver.
unused_columns = ['uniqueID', 'drugName', 'condition', 'date', 'usefulCount']
drop_df = df_test.drop(*unused_columns).collect()
drop_df_two = df_train.drop(*unused_columns).collect()

In [8]:
# Keep only the model inputs, projecting directly from the Spark DataFrame.
# This avoids collecting every row to the driver and re-creating a DataFrame
# from the resulting Python list.
test_df = df_test.select("review", "rating")
test_df.show()

+--------------------+------+
|              review|rating|
+--------------------+------+
|gave me rapid hea...|     1|
|    it cured my mrsa|     9|
|i have been on zy...|     7|
|it didnt work as ...|     6|
|i have had  major...|     9|
|i had mrsa inf la...|    10|
|i got a mrsa stap...|     8|
|very satisfied wi...|     9|
|effectiveness las...|     4|
|my psa was going ...|    10|
|on zytiga for  mo...|    10|
|began zytiga with...|    10|
|had tried clariti...|     8|
|this medicine wor...|    10|
|i have had cholin...|     6|
|after travelling ...|     9|
|i suffered from m...|    10|
|i recently had te...|     9|
|it works great fo...|     9|
|had hives nearly ...|    10|
+--------------------+------+
only showing top 20 rows



In [9]:
# Keep only the model inputs, projecting directly from the Spark DataFrame.
# This avoids collecting every training row to the driver and re-creating a
# DataFrame from the resulting Python list.
train_df = df_train.select("review", "rating")
train_df.show()

+--------------------+------+
|              review|rating|
+--------------------+------+
|it has no side ef...|     9|
|my son is halfway...|     8|
|i used to take an...|     5|
|this is my first ...|     8|
|suboxone has comp...|     9|
|nd day on mg star...|     2|
|he pulled out but...|     1|
|abilify changed m...|    10|
| i ve had  nothin...|     1|
|i had been on the...|     8|
|i have been on th...|     9|
|i have taken anti...|    10|
|i had crohns with...|     4|
|have a little bit...|     4|
|started nexplanon...|     3|
|i have been takin...|     9|
|this drug worked ...|     9|
|ive been taking a...|     9|
|ive been on every...|    10|
|i have been on ta...|    10|
+--------------------+------+
only showing top 20 rows



In [10]:
def build_trigrams(inputCol=("review", "rating"), n=3, vocabSize=2**15,
                   minDocFreq=5, maxIter=100):
    """Build an unfitted n-gram TF-IDF + logistic-regression pipeline.

    Stages, in order: Tokenizer -> StopWordsRemover -> one NGram per size
    1..n -> one CountVectorizer per size -> one IDF per size ->
    VectorAssembler (joins the per-size TF-IDF vectors into ``features``) ->
    StringIndexer (label column -> ``label``) -> LogisticRegression.

    Args:
        inputCol: ``(text_column, label_column)`` pair. A bare string is
            taken as the text column, with the label column left as
            "rating". Defaults to ("review", "rating").
        n: Largest n-gram size; every size from 1 to n is used. Must be >= 1.
        vocabSize: ``vocabSize`` given to each CountVectorizer.
        minDocFreq: ``minDocFreq`` given to each IDF stage.
        maxIter: ``maxIter`` given to LogisticRegression.

    Returns:
        A ``pyspark.ml.Pipeline`` ready for ``.fit(df)``.

    Raises:
        ValueError: If ``n`` is below 1 or ``inputCol`` is neither a string
            nor a two-item sequence.
    """
    # Resolve the column names; previously this argument was accepted but
    # silently ignored in favour of hard-coded "review" / "rating".
    if isinstance(inputCol, str):
        text_col, label_col = inputCol, "rating"
    else:
        try:
            text_col, label_col = inputCol
        except (TypeError, ValueError):
            raise ValueError(
                "inputCol must be a column name or a (text, label) pair, "
                "got {0!r}".format(inputCol)
            )
    if n < 1:
        raise ValueError("n must be >= 1, got {0!r}".format(n))

    sizes = range(1, n + 1)

    # Split the raw text into tokens.
    tokenizer = [Tokenizer(inputCol=text_col, outputCol="words")]

    # Get rid of stop words.
    stopremove = [StopWordsRemover(inputCol="words", outputCol="stop_tokens")]

    # One n-gram column per size: "1_grams", "2_grams", ... "<n>_grams".
    ngrams = [
        NGram(n=i, inputCol="stop_tokens", outputCol="{0}_grams".format(i))
        for i in sizes
    ]

    # Term counts per n-gram size, then IDF re-weighting of those counts.
    cv = [
        CountVectorizer(vocabSize=vocabSize,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in sizes
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=minDocFreq)
        for i in sizes
    ]

    # Concatenate the per-size TF-IDF vectors into one feature vector.
    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in sizes],
        outputCol="features"
    )]

    # Encode the label column as numeric class indices in "label". These are
    # indices assigned by StringIndexer, not the raw label values.
    label_stringIdx = [StringIndexer(inputCol=label_col, outputCol="label")]

    lr = [LogisticRegression(maxIter=maxIter)]

    return Pipeline(
        stages=tokenizer + stopremove + ngrams + cv + idf
        + assembler + label_stringIdx + lr
    )

In [11]:
# Fit all pipeline stages on the training reviews, then apply the fitted
# pipeline to the test reviews.
trigram_pipeline = build_trigrams()
trigram_pipelineFit = trigram_pipeline.fit(train_df)
test_results = trigram_pipelineFit.transform(test_df)

In [12]:
# Overall accuracy of the predictions on the test set.
# TODO: add a classification report (false positives / false negatives) and
# try a different evaluator metric.
predictions = test_results.select(
    test_results["label"].cast("Float"),
    "prediction",
)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy = %g" % accuracy)

Model Accuracy = 0.7257


In [13]:
# Persist the fitted pipeline. overwrite() keeps this cell re-runnable: a
# plain save() raises an error when "lr_multiclass" already exists.
trigram_pipelineFit.write().overwrite().save("lr_multiclass")

In [15]:
# Class balance of the training set: number of reviews per rating, most
# frequent rating first.
rating_counts = train_df.groupBy("rating").count()
rating_counts.orderBy("count", ascending=False).show()

+------+-----+
|rating|count|
+------+-----+
|    10|50989|
|     9|27531|
|     1|21619|
|     8|18890|
|     7| 9456|
|     5| 8013|
|     2| 6931|
|     3| 6513|
|     6| 6343|
|     4| 5012|
+------+-----+

