In [115]:
# https://towardsdatascience.com/sentiment-analysis-with-pyspark-bc8e83f80c35

In [1]:
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
import sparknlp

from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SQLContext, Row
from pyspark.ml.linalg import Vectors

import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/trom/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# csv = './train/training.1600000.processed.noemoticon.csv' #file at https://1drv.ms/u/s!AqlC23XtB27BisoM5u56CMPeNOBQKw
# df = pd.read_csv(csv, header=None, encoding="ISO-8859-1", usecols=[0,5], names=['target', 'text'])
# df = df[['text', 'target']]

# df.dropna(inplace=True)
# df.reset_index(drop=True,inplace=True)

# df.loc[df.target == 0, 'target'] = -1
# df.loc[df.target == 2, 'target'] = 0
# df.loc[df.target == 4, 'target'] = 1

# # df.to_csv('./train/clean_tweet.csv', index=False)

# train_df, val_df = train_test_split(df, test_size=0.21)

# train_df.to_csv('./train/clean_tweet_train.csv', index=False)
# val_df.to_csv('./train/clean_tweet_val.csv', index=False)

# train_df.head()

In [54]:
# https://blog.sicara.com/get-started-pyspark-jupyter-guide-tutorial-ae2fe84f594f
# Session setup. `spark` is used by score_text() to build DataFrames and is
# stopped in the last cell; `sqlCtx` is used by the CSV-loading cell.
# NOTE(review): on a fresh kernel this cell has to run before the loading
# cell that reads through `sqlCtx`.
spark = sparknlp.start()
sc = spark.sparkContext
# Legacy SQLContext wrapper around the same SparkContext.
sqlCtx = SQLContext(sc)

In [3]:
def _load_clean_csv(path):
    """Read a headered CSV into a Spark DataFrame and drop rows containing nulls.

    The column types are inferred from the file contents.
    """
    reader = sqlCtx.read.format('com.databricks.spark.csv')
    reader = reader.options(header='true', inferschema='true')
    return reader.load(path).dropna()


# Training and validation splits written by the (commented-out) preprocessing cell.
train_set = _load_clean_csv('./train/clean_tweet_train.csv')
val_set = _load_clean_csv('./train/clean_tweet_val.csv')

In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [40]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline, PipelineModel

# Feature pipeline: text -> tokens -> hashed term frequencies (2**16 buckets)
# -> IDF-weighted `features`, plus a numeric `label` derived from `target`.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
# StringIndexer orders labels by descending frequency by default. On a random,
# near-balanced split that makes the target -> label mapping depend on which
# class happens to be more frequent, while score_text() assumes label 1 is the
# positive class. Alphabetical ordering pins "-1" -> 0.0 and "1" -> 1.0
# regardless of the class counts.
label_stringIdx = StringIndexer(inputCol="target", outputCol="label",
                                stringOrderType="alphabetAsc")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

# Fit on the training split only, then apply the same fitted stages to both
# splits so the validation set is transformed with training-set statistics.
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|                text|target|               words|                  tf|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|Tired, and I feel...|    -1|[tired,, and, i, ...|(65536,[14,2647,8...|(65536,[14,2647,8...|  0.0|
|@PunkyStyle I wil...|     1|[@punkystyle, i, ...|(65536,[19387,238...|(65536,[19387,238...|  1.0|
|@marzwah @groundv...|    -1|[@marzwah, @groun...|(65536,[9265,1222...|(65536,[9265,1222...|  0.0|
|kind of sort of a...|    -1|[kind, of, sort, ...|(65536,[1431,6052...|(65536,[1431,6052...|  0.0|
|Updated the banne...|    -1|[updated, the, ba...|(65536,[8436,1056...|(65536,[8436,1056...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [17]:
from pyspark.ml.classification import LogisticRegression
# Fit a logistic-regression classifier on the transformed training split and
# score the transformed validation split. No column names are passed, so the
# estimator relies on its default input columns.
# NOTE(review): presumably these defaults are the `features` / `label` columns
# produced by the pipeline above — confirm against the PySpark docs.
lr = LogisticRegression(maxIter=100)  # at most 100 optimiser iterations
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# No metricName is passed, so the evaluator reports its default metric
# (areaUnderROC per the PySpark docs): the number shown below is the
# validation-set AUC, not an accuracy.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8494658040471438

In [48]:
# Persist the fitted classifier and the fitted feature pipeline.
# Plain `.save(path)` raises when the target directory already exists, which
# makes this cell fail on every re-run; the writer's overwrite() replaces the
# previously saved artefacts instead.
lrModel.write().overwrite().save('./train/lr.model')
pipelineFit.write().overwrite().save('./train/pipeline.model')

In [49]:
# Reload both artefacts from disk. `lrModel` is rebound to the loaded copy and
# `pipelineModel` is the loaded counterpart of `pipelineFit`; score_text()
# uses these two names.
lrModel = LogisticRegressionModel.load("./train/lr.model")
pipelineModel = PipelineModel.load('./train/pipeline.model')

In [50]:
def score_text(text):
    """Score the sentiment of a single piece of text on a -1..1 scale.

    The text is run through the loaded feature pipeline (`pipelineModel`) and
    the loaded logistic-regression model (`lrModel`); the probability of
    label index 1 is then rescaled from [0, 1] to [-1, 1].

    Args:
        text: The raw string to score.

    Returns:
        ``2 * p - 1`` where ``p`` is the predicted probability of label
        index 1. Assumes label index 1 is the positive class, as in the
        pipeline preview where target 1 maps to label 1.0.
    """
    # Only `text` is supplied. The previous version added a dummy `target` of
    # 2, a value the fitted StringIndexerModel never saw; that worked only
    # because the resulting `label` column was never evaluated. When its input
    # column is absent, StringIndexerModel leaves the dataset unchanged, so no
    # placeholder label is needed for prediction.
    df = spark.createDataFrame([(text,)], ['text'])
    df_transformed = pipelineModel.transform(df)
    predictions = lrModel.transform(df_transformed)

    # Single row; only the class-probability vector is needed.
    pd_predictions = predictions.select('probability').toPandas()
    positive_probability = pd_predictions.iloc[0]['probability'][1]

    return 2 * positive_probability - 1

In [51]:
# Sanity check: a single positive word; the recorded score is above 0.
score_text("good")

0.5232641500469681

In [52]:
# Sanity check: a single negative word; the recorded score is below 0.
score_text("bad")

-0.4497107663079577

In [53]:
# One positive and one negative word: the recorded score is close to 0
# (slightly negative).
score_text("good bad")

-0.04811019089291102

In [34]:
# Two positive words against one negative: the recorded score is positive again.
score_text("good bad good")

0.369671838265633

In [130]:
# Stop the Spark session created by sparknlp.start() at the top of the notebook.
spark.stop()