In [1]:
# connecting to spark
from pyspark import SparkConf, SparkContext
## set up spark context
conf = SparkConf().setAppName("myApp")
sc = SparkContext(conf=conf)

# create sparksession object
from pyspark.sql import SparkSession
sparksession = SparkSession(sc)

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType
import preproc as pp

In [3]:
# Register all the functions in Preproc with Spark Context
check_lang_udf = udf(pp.check_lang, StringType())
remove_stops_udf = udf(pp.remove_stops, StringType())
remove_features_udf = udf(pp.remove_features, StringType())
tag_and_remove_udf = udf(pp.tag_and_remove, StringType())
lemmatize_udf = udf(pp.lemmatize, StringType())
check_blanks_udf = udf(pp.check_blanks, StringType())
string_to_float_udf = udf(pp.string_to_float, DoubleType())

In [4]:
df = sparksession.read.csv("data/comcastcomplaints/comcast2000.csv",
                           header=True,
                          inferSchema=True)

In [5]:
df.show(5)
df.count()


+--------------------+-------------+------+--------------------+
|              author|    posted_on|rating|                text|
+--------------------+-------------+------+--------------------+
|Alantae of Cheste...|Nov. 22, 2016|     1|I used to love Co...|
|Vera of Philadelp...|Nov. 19, 2016|     1|I'm so over Comca...|
|Sarah of Rancho C...|Nov. 17, 2016|     1|If I could give t...|
|Dennis of Manches...|Nov. 16, 2016|     1|I've had the wors...|
|Ryan of Bellevue, WA|Nov. 14, 2016|     1|Check your contra...|
+--------------------+-------------+------+--------------------+
only showing top 5 rows



2000

In [6]:
# predict language and filter out those with less than 90% chance of being English
#lang_df = df.withColumn("lang", check_lang_udf(df['text']))
#lang_df.show(5)
#en_df = lang_df.filter(lang_df["lang"] == "en")

en_df = df.filter(df.text != '')


In [7]:
# remove stop words to reduce dimensionality
rm_stops_df = en_df.withColumn("stop_text", remove_stops_udf("text"))
rm_stops_df.show(5)

+--------------------+-------------+------+--------------------+--------------------+
|              author|    posted_on|rating|                text|           stop_text|
+--------------------+-------------+------+--------------------+--------------------+
|Alantae of Cheste...|Nov. 22, 2016|     1|I used to love Co...|I used love Comca...|
|Vera of Philadelp...|Nov. 19, 2016|     1|I'm so over Comca...|I'm Comcast! The ...|
|Sarah of Rancho C...|Nov. 17, 2016|     1|If I could give t...|If I could give n...|
|Dennis of Manches...|Nov. 16, 2016|     1|I've had the wors...|I've worst experi...|
|Ryan of Bellevue, WA|Nov. 14, 2016|     1|Check your contra...|Check contract si...|
+--------------------+-------------+------+--------------------+--------------------+
only showing top 5 rows



In [8]:
#remove stop words to reduce dimensionality

rm_stops_df = en_df.withColumn("stop_text", remove_stops_udf("text"))
rm_stops_df.show(5)


+--------------------+-------------+------+--------------------+--------------------+
|              author|    posted_on|rating|                text|           stop_text|
+--------------------+-------------+------+--------------------+--------------------+
|Alantae of Cheste...|Nov. 22, 2016|     1|I used to love Co...|I used love Comca...|
|Vera of Philadelp...|Nov. 19, 2016|     1|I'm so over Comca...|I'm Comcast! The ...|
|Sarah of Rancho C...|Nov. 17, 2016|     1|If I could give t...|If I could give n...|
|Dennis of Manches...|Nov. 16, 2016|     1|I've had the wors...|I've worst experi...|
|Ryan of Bellevue, WA|Nov. 14, 2016|     1|Check your contra...|Check contract si...|
+--------------------+-------------+------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# remove other non essential words, think of it as my personal stop word list
rm_features_df = rm_stops_df.withColumn("feat_text", remove_features_udf(rm_stops_df["stop_text"]))
rm_features_df.show(5)

+--------------------+-------------+------+--------------------+--------------------+--------------------+
|              author|    posted_on|rating|                text|           stop_text|           feat_text|
+--------------------+-------------+------+--------------------+--------------------+--------------------+
|Alantae of Cheste...|Nov. 22, 2016|     1|I used to love Co...|I used love Comca...|  used love comca...|
|Vera of Philadelp...|Nov. 19, 2016|     1|I'm so over Comca...|I'm Comcast! The ...|   comcast the wo...|
|Sarah of Rancho C...|Nov. 17, 2016|     1|If I could give t...|If I could give n...|   could give neg...|
|Dennis of Manches...|Nov. 16, 2016|     1|I've had the wors...|I've worst experi...|   worst experien...|
|Ryan of Bellevue, WA|Nov. 14, 2016|     1|Check your contra...|Check contract si...|check contract si...|
+--------------------+-------------+------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# tag the words remaining and keep only Nouns, Verbs and Adjectives
tagged_df = rm_features_df.withColumn("tagged_text", tag_and_remove_udf(rm_features_df["feat_text"]))
tagged_df.select(['tagged_text']).show(5)

+--------------------+
|         tagged_text|
+--------------------+
| used love comcas...|
| comcast worst in...|
| give negative st...|
| worst experience...|
| check contract s...|
+--------------------+
only showing top 5 rows



In [11]:
# lemmatization of remaining words to reduce dimensionality & boost measures
lemm_df = tagged_df.withColumn("lemm_text", lemmatize_udf(tagged_df["tagged_text"]))
lemm_df.select(['lemm_text']).show(5)

+--------------------+
|           lemm_text|
+--------------------+
|use love comcast ...|
|comcast worst int...|
|give negative sta...|
|worst experience ...|
|check contract si...|
+--------------------+
only showing top 5 rows



In [12]:
# ========remove all rows containing only blank spaces============
check_blanks_df_tmp = lemm_df.withColumn("is_blank", check_blanks_udf(lemm_df["lemm_text"]))
check_blanks_df_tmp.show(5)

+--------------------+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+
|              author|    posted_on|rating|                text|           stop_text|           feat_text|         tagged_text|           lemm_text|is_blank|
+--------------------+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+
|Alantae of Cheste...|Nov. 22, 2016|     1|I used to love Co...|I used love Comca...|  used love comca...| used love comcas...|use love comcast ...|   False|
|Vera of Philadelp...|Nov. 19, 2016|     1|I'm so over Comca...|I'm Comcast! The ...|   comcast the wo...| comcast worst in...|comcast worst int...|   False|
|Sarah of Rancho C...|Nov. 17, 2016|     1|If I could give t...|If I could give n...|   could give neg...| give negative st...|give negative sta...|   False|
|Dennis of Manches...|Nov. 16, 2016|     1|I've had 

In [13]:
no_blanks_df = check_blanks_df_tmp

# rename columns df.withColumnRenamed("colName", "newColName")
rename_df= no_blanks_df.withColumn("lemm_text", no_blanks_df["text"]).\
             withColumn("label", string_to_float_udf("rating"))


In [14]:
# dedupe important since alot of the tweets only differed by url's and RT mentions
dedup_df = rename_df.dropDuplicates(['author', 'text'])

dedup_df.show(5)
type(dedup_df)

+--------------------+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+
|              author|    posted_on|rating|                text|           stop_text|           feat_text|         tagged_text|           lemm_text|is_blank|label|
+--------------------+-------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+
|Amie of Richmond, VA|        38111|     0|Comcast has been ...|Comcast constant ...|comcast constant ...| comcast constant...|Comcast has been ...|   False|  0.0|
|Ana-Maria of Cora...|Dec. 11, 2015|     1|"I pay $140 month...|"I pay $140 month...|  pay monthly int...| pay monthly inte...|"I pay $140 month...|   False|  1.0|
|Charles of Mt Ple...|Oct. 26, 2015|     1|"In July of 2015 ...|"In July 2015 I m...|  july  moved bel...| moved bellevue p...|"In July of 2015 ...|   False|  1.0|
|Cynthia of Shre

pyspark.sql.dataframe.DataFrame

In [15]:
# select only the columns we care about
data_set = dedup_df.select(["author","label","text"])

data_set.show(5)

+--------------------+-----+--------------------+
|              author|label|                text|
+--------------------+-----+--------------------+
|Stephen of Miami, FL|  1.0|I am writing on b...|
|Ana-Maria of Cora...|  1.0|"I pay $140 month...|
|Charles of Mt Ple...|  1.0|"In July of 2015 ...|
|Jason of Rancoh M...|  1.0|I attempted to in...|
|Federico of Chica...|  1.0|Comcast charged m...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [16]:
# split training & validation sets with 60% to training and use a seed value of 1987
(training_df,test_df) = data_set.randomSplit([0.6, 0.4])
training_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- label: double (nullable = true)
 |-- text: string (nullable = true)



In [17]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import RandomForestClassifier

In [18]:
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")

In [23]:
#nb = NaiveBayes()
#pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, rf])
#rf = RandomForestClassifier(numTrees=100,maxDepth=20, seed=42)
rf = RandomForestClassifier()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, rf])

In [24]:
#paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,[4,8,10]).\
                    addGrid(rf.impurity, ['entropy','gini']).build()


cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)
                    

#training_df.show(5)  
cvModel = cv.fit(training_df)

In [36]:
temp_path = '/Users/wenqiangfeng/Dropbox/Spark/Code/model'
modelPath = temp_path + "/Comcast_model"
bestModel = cvModel.bestModel
bestModel.save(modelPath)

In [37]:
cvModel = bestModel.load(modelPath)

In [32]:
prediction = cvModel.transform(test_df)

In [33]:
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(cvModel.transform(training_df))

0.6688963210702342

In [34]:
evaluator.evaluate(cvModel.transform(test_df))

0.6876447534728403