In [1]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext


from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline



In [2]:

# Reuse the active SparkContext if one already exists, otherwise start one,
# and wrap it in a SQLContext for the DataFrame readers used below.
sc = ps.SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext=sc)


In [3]:
# Load the training split from a headerless, semicolon-separated CSV and
# rename the auto-generated columns (_c0, _c1) to description / feeling.
df_train = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header=False, inferschema=True, sep=";")
    .load("train.csv")
    .withColumnRenamed('_c0', 'description')
    .withColumnRenamed('_c1', 'feeling')
)
# Preview the first 10 rows.
df_train.show(10)

+--------------------+--------+
|         description| feeling|
+--------------------+--------+
|i didnt feel humi...| sadness|
|i can go from fee...| sadness|
|im grabbing a min...|   anger|
|i am ever feeling...|    love|
|i am feeling grouchy|   anger|
|ive been feeling ...| sadness|
|ive been taking o...|surprise|
|i feel as confuse...|    fear|
|i have been with ...|     joy|
| i feel romantic too|    love|
+--------------------+--------+
only showing top 10 rows



In [4]:
# Load the held-out split (val.csv) with the same reader options as the
# training split and rename the auto-generated columns (_c0, _c1).
df_test = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header=False, inferschema=True, sep=";")
    .load("val.csv")
    .withColumnRenamed('_c0', 'description')
    .withColumnRenamed('_c1', 'feeling')
)
# Preview the first 10 rows.
df_test.show(10)

+--------------------+-------+
|         description|feeling|
+--------------------+-------+
|im feeling quite ...|sadness|
|i feel like i am ...|sadness|
|i feel like a fai...|   love|
|i am just feeling...|  anger|
|i can have for a ...|    joy|
|i start to feel m...|    joy|
|i am feeling more...|    joy|
|i feel incredibly...|    joy|
|i feel less keen ...|    joy|
|i feel dirty and ...|sadness|
+--------------------+-------+
only showing top 10 rows



In [5]:
# Row count before removing incomplete rows.
print(df_train.count())
# DataFrame.dropna() returns a NEW DataFrame and leaves the receiver
# untouched, so the result has to be rebound for the rows to actually be
# dropped. The operation is idempotent, so re-running this cell is safe.
df_train = df_train.dropna()
# Row count after removing rows that contain null values.
df_train.count()

16000


16000

In [6]:
# Row count before removing incomplete rows.
print(df_test.count())
# DataFrame.dropna() returns a NEW DataFrame and leaves the receiver
# untouched, so the result has to be rebound for the rows to actually be
# dropped. The operation is idempotent, so re-running this cell is safe.
df_test = df_test.dropna()
# Row count after removing rows that contain null values.
df_test.count()

2000


2000

In [7]:
# Feature-engineering stages, declared separately so they can be assembled
# into a Pipeline. Column flow:
#   feeling     -> label     (StringIndexer)
#   description -> tokens    (Tokenizer)
#   tokens      -> tf        (HashingTF, 2**16 = 65536 hash buckets)
#   tf          -> features  (IDF, minDocFreq=3)
label_stringIdx = StringIndexer(
    inputCol='feeling',
    outputCol='label',
)
tokenizer = Tokenizer(
    inputCol='description',
    outputCol='tokens',
)
hashingtf = HashingTF(
    inputCol="tokens",
    outputCol='tf',
    numFeatures=2 ** 16,
)
idf = IDF(
    inputCol="tf",
    outputCol='features',
    minDocFreq=3,
)

In [8]:
# Chain the stages in execution order: label indexing, tokenising,
# term-frequency hashing, then IDF weighting.
pipeline = Pipeline(
    stages=[
        label_stringIdx,
        tokenizer,
        hashingtf,
        idf,
    ]
)

In [9]:
# Fit the pipeline on the training split only, then apply the fitted result
# to both splits.
# NOTE(review): `pipeline` is rebound from the unfitted Pipeline to the value
# returned by fit(), and df_train / df_test are overwritten with their
# transformed versions. This cell therefore looks unsafe to run twice without
# first re-running the cells that define those names — confirm, and consider
# distinct names (e.g. a separate fitted-model variable) if re-runs matter.
pipeline = pipeline.fit(df_train)
df_train = pipeline.transform(df_train)
df_test = pipeline.transform(df_test)

In [10]:
# Inspect the first 10 transformed training rows (label, tokens, tf, features).
df_train.show(n=10)

+--------------------+--------+-----+--------------------+--------------------+--------------------+
|         description| feeling|label|              tokens|                  tf|            features|
+--------------------+--------+-----+--------------------+--------------------+--------------------+
|i didnt feel humi...| sadness|  1.0|[i, didnt, feel, ...|(65536,[8800,1903...|(65536,[8800,1903...|
|i can go from fee...| sadness|  1.0|[i, can, go, from...|(65536,[6661,8286...|(65536,[6661,8286...|
|im grabbing a min...|   anger|  2.0|[im, grabbing, a,...|(65536,[4501,4832...|(65536,[4501,4832...|
|i am ever feeling...|    love|  4.0|[i, am, ever, fee...|(65536,[1880,6661...|(65536,[1880,6661...|
|i am feeling grouchy|   anger|  2.0|[i, am, feeling, ...|(65536,[6661,1903...|(65536,[6661,1903...|
|ive been feeling ...| sadness|  1.0|[ive, been, feeli...|(65536,[3053,6661...|(65536,[3053,6661...|
|ive been taking o...|surprise|  5.0|[ive, been, takin...|(65536,[4214,1032...|(65536,[4214

In [11]:
# Inspect the first 10 transformed test rows (label, tokens, tf, features).
df_test.show(n=10)

+--------------------+-------+-----+--------------------+--------------------+--------------------+
|         description|feeling|label|              tokens|                  tf|            features|
+--------------------+-------+-----+--------------------+--------------------+--------------------+
|im feeling quite ...|sadness|  1.0|[im, feeling, qui...|(65536,[4616,6661...|(65536,[4616,6661...|
|i feel like i am ...|sadness|  1.0|[i, feel, like, i...|(65536,[11650,121...|(65536,[11650,121...|
|i feel like a fai...|   love|  4.0|[i, feel, like, a...|(65536,[5918,1017...|(65536,[5918,1017...|
|i am just feeling...|  anger|  2.0|[i, am, just, fee...|(65536,[6661,1903...|(65536,[6661,1903...|
|i can have for a ...|    joy|  0.0|[i, can, have, fo...|(65536,[6661,1903...|(65536,[6661,1903...|
|i start to feel m...|    joy|  0.0|[i, start, to, fe...|(65536,[1714,3024...|(65536,[1714,3024...|
|i am feeling more...|    joy|  0.0|[i, am, feeling, ...|(65536,[6661,1413...|(65536,[6661,1413...|


In [12]:
# Naive Bayes estimator: multinomial model type, smoothing parameter 1.0.
nb = NaiveBayes(
    modelType="multinomial",
    smoothing=1.0,
)

In [13]:
# Train the classifier on the transformed training split.
# NOTE(review): `nb` is rebound from the estimator to the value returned by
# fit(), so this cell looks unsafe to re-run without first re-running the
# cell that constructs the estimator — confirm.
nb = nb.fit(df_train)

In [14]:
# Apply the fitted model to the transformed test split; the result is kept in
# `predictions` and scored by the evaluator.
predictions = nb.transform(df_test)

In [15]:
# Score the test-set predictions. The evaluator's defaults are spelled out so
# it is clear which columns are compared and that the number reported is an
# F1 score (the evaluator's default metric), not accuracy.
criterion = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
criterion.evaluate(predictions)

0.7388329322593327