In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [3]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://trumphillary.s3.us-east-2.amazonaws.com/file_name.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("file_name.csv"), sep=",", header=True)

# Show DataFrame
df.show()
df=df.select(['handle', 'text'])
df.show()
df=df.na.drop()
df.show()

+-------------------+------------------+---------------+--------------------+----------+---------------+-----------------------+---------------------+-------------------+---------------+----+-------------+--------------+---------+--------+--------+---------------+----------+----------+------------------+-------------+----------------------+----------------+------------------+--------------------+---------+--------------------+--------------------+
|               time|                id|         handle|                text|is_retweet|original_author|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_user_id|is_quote_status|lang|retweet_count|favorite_count|longitude|latitude|place_id|place_full_name|place_name|place_type|place_country_code|place_country|place_contained_within|place_attributes|place_bounding_box|          source_url|truncated|            entities|   extended_entities|
+-------------------+------------------+---------------+--------------------+----------+--------

In [4]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['text']))
data_df.show()

+---------------+--------------------+------+
|         handle|                text|length|
+---------------+--------------------+------+
| HillaryClinton|The question in t...|    95|
| HillaryClinton|If we stand toget...|    76|
| HillaryClinton|Both candidates w...|    95|
|realDonaldTrump|Join me for a 3pm...|    89|
| HillaryClinton|This election is ...|   109|
| HillaryClinton|When Donald Trump...|    42|
|realDonaldTrump|Once again we wil...|    86|
|realDonaldTrump|Hillary Clintons ...|    84|
|realDonaldTrump|CNBC Time magazin...|   101|
| HillaryClinton|Donald Trump lied...|   106|
|realDonaldTrump|Great afternoon i...|   102|
|realDonaldTrump|In the last 24 hr...|   132|
| HillaryClinton|She gained about ...|   106|
| HillaryClinton|Its NationalVoter...|    65|
| HillaryClinton|I love this count...|   116|
| HillaryClinton|We dont want to t...|   130|
| HillaryClinton|What we hear from...|   108|
| HillaryClinton|One candidate mad...|   111|
| HillaryClinton|What kind of a pe

### Feature Transformations


In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='handle',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [6]:
data_df.show()

+---------------+--------------------+------+
|         handle|                text|length|
+---------------+--------------------+------+
| HillaryClinton|The question in t...|    95|
| HillaryClinton|If we stand toget...|    76|
| HillaryClinton|Both candidates w...|    95|
|realDonaldTrump|Join me for a 3pm...|    89|
| HillaryClinton|This election is ...|   109|
| HillaryClinton|When Donald Trump...|    42|
|realDonaldTrump|Once again we wil...|    86|
|realDonaldTrump|Hillary Clintons ...|    84|
|realDonaldTrump|CNBC Time magazin...|   101|
| HillaryClinton|Donald Trump lied...|   106|
|realDonaldTrump|Great afternoon i...|   102|
|realDonaldTrump|In the last 24 hr...|   132|
| HillaryClinton|She gained about ...|   106|
| HillaryClinton|Its NationalVoter...|    65|
| HillaryClinton|I love this count...|   116|
| HillaryClinton|We dont want to t...|   130|
| HillaryClinton|What we hear from...|   108|
| HillaryClinton|One candidate mad...|   111|
| HillaryClinton|What kind of a pe

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [8]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [9]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [10]:
# Show label and resulting features
cleaned.show()

+---------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|         handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+---------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
| HillaryClinton|The question in t...|    95|  1.0|[the, question, i...|[question, electi...|(262144,[75042,89...|(262144,[75042,89...|(262145,[75042,89...|
| HillaryClinton|If we stand toget...|    76|  1.0|[if, we, stand, t...|[stand, together,...|(262144,[37598,46...|(262144,[37598,46...|(262145,[37598,46...|
| HillaryClinton|Both candidates w...|    95|  1.0|[both, candidates...|[candidates, aske...|(262144,[53231,84...|(262144,[53231,84...|(262145,[53231,84...|
|realDonaldTrump|Join me for a 3pm...|    89|  0.0|[join, 

In [11]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])
#split csv into training and testing csvs, clean both, and use testing file in thing below
#testing.show()
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)



In [12]:
predictor

NaiveBayes_328a856c1eee

In [13]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|[-984.52661116209...|[2.25317417619603...|       1.0|
|HillaryClinton|16 Youve mentione...|   131|  1.0|[16, youve, menti...|[16, youve, menti...|(262144,[5083,147...|(262144,[5083,147...|(262145,[5083,147...|[-873.616

In [14]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.904699


In [None]:
testing.limit(2).show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|
|HillaryClinton|16 Youve mentione...|   131|  1.0|[16, youve, menti...|[16, youve, menti...|(262144,[5083,147...|(262144,[5083,147...|(262145,[5083,147...|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+



In [56]:
example=predictor.transform(testing)
example.show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|[-984.52661116209...|[2.25317417619603...|       1.0|
|HillaryClinton|16 Youve mentione...|   131|  1.0|[16, youve, menti...|[16, youve, menti...|(262144,[5083,147...|(262144,[5083,147...|(262145,[5083,147...|[-873.616

In [54]:
testing.show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|
|HillaryClinton|16 Youve mentione...|   131|  1.0|[16, youve, menti...|[16, youve, menti...|(262144,[5083,147...|(262144,[5083,147...|(262145,[5083,147...|
|HillaryClinton|20 years ago when...|   130|  1.0|[20, years, ago, ...|[20, years, ago, ...|(262144,[5232,120...|(262144,[5232,120...|(262145,[5232,120...|
|HillaryClinton|2016 and counting...|    95|  1.0|[2016, and, co

In [None]:
acc = acc_eval.evaluate(example2)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.666667


In [31]:
predictor.transform(testing.limit(1))
x=predictor.transform(testing.limit(1))

In [35]:
x.show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|[-984.52661116209...|[2.25317417619603...|       1.0|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+---------

In [52]:
testing.limit(1).show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+



In [45]:
tweet = testing.limit(1)

In [46]:
tweet.show()

+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|        handle|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|HillaryClinton|14 We know you en...|   118|  1.0|[14, we, know, yo...|[14, know, engage...|(262144,[17893,18...|(262144,[17893,18...|(262145,[17893,18...|
+--------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+



In [47]:
tweet = tweet.select(['features'])

In [49]:
result = predictor.transform(tweet)

In [50]:
result.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(262145,[17893,18...|[-984.52661116209...|[2.25317417619603...|       1.0|
+--------------------+--------------------+--------------------+----------+



In [None]:
Lyin Ted Cruz will never be able to beat Hillary Despite a rigged delegate system I am hundreds of delegates ahead of him
Ill be in one of my favorite places this morning Staten Island Big crowd will be fun!
MakeAmericaGreatAgain NYPrimary
Thank you California Connecticut Maryland and Pennsylvania! MakeAmericaGreatAgain Trump2016
Just arrived in Syracuse NY Big crowd great place! We will bring back the desperately needed jobs NYPrimary

In [126]:
input = spark.createDataFrame([(0,"These comics are made for kids")], ["id","text"])

In [127]:

tweet = input.withColumn('length', length(input['text']))
prepared_tweet = cleaner.transform(tweet)
output = predictor.transform(prepared_tweet)
output.select(["text","prediction",'probability']).show(truncate=False)

+-------------------------------+----------+----------------------------------------+
|text                           |prediction|probability                             |
+-------------------------------+----------+----------------------------------------+
|These comics are made for kids!|0.0       |[0.8885188874518979,0.11148111254810224]|
+-------------------------------+----------+----------------------------------------+



+------------------------------------------------------------------------------------------------+----------+-------------------------------------------+
|text                                                                                            |prediction|probability                                |
+------------------------------------------------------------------------------------------------+----------+-------------------------------------------+
|Our job between now and election day is to make sure that we keep up the fight for voting rights|1.0       |[2.8336118893364374E-13,0.9999999999997167]|
+------------------------------------------------------------------------------------------------+----------+-------------------------------------------+

