In [5]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option -- confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark Sentiment Analysis")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [6]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, BisectingKMeans
from pyspark.sql.functions import monotonically_increasing_id
import re
from textblob import TextBlob

In [18]:
# Load the tweet CSV with Spark's built-in CSV reader.
# header=True      -> the first line supplies the column names
# inferSchema=True -> column types are inferred from the data
df = spark.read.csv("../data/newtwitter.csv", header=True, inferSchema=True)

In [19]:
# Preview the first five rows of the loaded tweets.
df.show(n=5)

+--------------------+----------+-------+
|                text|        id|pubdate|
+--------------------+----------+-------+
|10 Things Missing...|2602860537|  18536|
|RT @_NATURALBWINN...|2602850443|  18536|
|RT @HBO24 yo the ...|2602761852|  18535|
|Aaaaaaaand I have...|2602738438|  18535|
|can I please have...|2602684185|  18535|
+--------------------+----------+-------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def strip_non_ascii(string):
    """Return ``string`` keeping only characters with code points 1-126.

    All non-ASCII characters are removed. NUL (0) and DEL (127) are removed
    as well, because only characters with ``0 < ord(c) < 127`` are kept.

    Args:
        string: Text to filter, or ``None``.

    Returns:
        The filtered text, or ``None`` when ``string`` is ``None``.
    """
    # This function is wrapped as a Spark UDF and applied to a DataFrame
    # column; a NULL cell reaches the Python function as None. Iterating
    # None would raise TypeError, so propagate the null instead.
    if string is None:
        return None
    return ''.join(c for c in string if 0 < ord(c) < 127)

# Expose strip_non_ascii as a Spark UDF that yields a string column.
strip_non_ascii_udf = udf(strip_non_ascii, returnType=StringType())

def fix_abbreviation(string):
    """Fix abbreviations in a tweet (not implemented yet).

    TODO: the body is still a stub; ``string`` is ignored and every call
    returns ``None``.
    """
    return None

In [21]:
# Add a filtered copy of the tweet text next to the original column.
# The column name keeps its existing spelling ('text_non_asci').
df = df.withColumn(
    'text_non_asci',
    strip_non_ascii_udf(df['text']),
)

In [22]:
# Preview the first five rows, now including the filtered text column.
df.show(n=5)

+--------------------+----------+-------+--------------------+
|                text|        id|pubdate|       text_non_asci|
+--------------------+----------+-------+--------------------+
|10 Things Missing...|2602860537|  18536|10 Things Missing...|
|RT @_NATURALBWINN...|2602850443|  18536|RT @_NATURALBWINN...|
|RT @HBO24 yo the ...|2602761852|  18535|RT @HBO24 yo the ...|
|Aaaaaaaand I have...|2602738438|  18535|Aaaaaaaand I have...|
|can I please have...|2602684185|  18535|can I please have...|
+--------------------+----------+-------+--------------------+
only showing top 5 rows



In [7]:
# Small hand-written sample of product reviews, one (id, label, text) tuple
# per row. Presumably label 1 = positive, 0 = neutral, -1 = negative --
# TODO confirm.
rawdata_list = [
    (100, 1, "this is the best product, i love it."),
    (101, 0, "this has been a great experience but the product was not satisfactory."),
    (102, 1, "the product is awesome."),
    (103, -1, "i hate the product and the price is terrible"),
    (104, 1, "the product works great and it is awesome to use"),
    (105, -1, "customer service was helpful but the product is expensive and it is bad"),
]

# Turn the sample rows into a Spark DataFrame. The column names are runtime
# identifiers, so the existing spelling 'lable' is kept as-is.
rawdata = spark.createDataFrame(
    rawdata_list,
    schema=['id', 'lable', 'text'],
)

In [8]:
def sentiment_analysis(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: String to score, or ``None``.

    Returns:
        ``TextBlob(text).sentiment.polarity``, or ``None`` when ``text`` is
        ``None``.
    """
    # This function is wrapped as a Spark UDF; a NULL cell arrives as None.
    # TextBlob only accepts strings, so return a null score for a null input
    # instead of raising inside the UDF.
    if text is None:
        return None
    return TextBlob(text).sentiment.polarity

In [10]:
# Wrap the polarity scorer as a Spark UDF returning a float, then add its
# result for each row as a new "sentiment_score" column and display it.
udf_cleantext = udf(sentiment_analysis, returnType=FloatType())
text_variable = 'text'
clean_text = rawdata.withColumn(
    "sentiment_score",
    udf_cleantext(rawdata[text_variable]),
)
clean_text.show(n=10, truncate=True)

+---+-----+--------------------+---------------+
| id|lable|                text|sentiment_score|
+---+-----+--------------------+---------------+
|100|    1|this is the best ...|           0.75|
|101|    0|this has been a g...|            0.8|
|102|    1|the product is aw...|            1.0|
|103|   -1|i hate the produc...|           -0.9|
|104|    1|the product works...|            0.9|
|105|   -1|customer service ...|           -0.6|
+---+-----+--------------------+---------------+



In [11]:
import pandas as pd

# Three example sentences held in a pandas DataFrame.
# To load sentences from a CSV file instead:
# df = pd.read_csv('sentences.csv')
df = pd.DataFrame(
    data={
        'sentence': [
            'I am very happy',
            'I am very sad',
            'I am sad but I am happy too',
        ],
    },
)

from textblob import TextBlob

# Score each sentence with TextBlob. Every sentence is parsed once and the
# resulting sentiment is reused for both columns, rather than building a
# TextBlob twice per row through a row-wise apply over the whole frame.
sentence_sentiments = [TextBlob(sentence).sentiment for sentence in df['sentence']]
df['polarity'] = [sentiment.polarity for sentiment in sentence_sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in sentence_sentiments]


In [4]:
# Show the scored sentences; as the cell's last expression the DataFrame is
# rendered with the notebook's rich display instead of plain printed text.
df

                      sentence  polarity  subjectivity
0              I am very happy      1.00           1.0
1                I am very sad     -0.65           1.0
2  I am sad but I am happy too      0.15           1.0
