In [1]:
import pandas as pd
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import IntegerType, ArrayType, BooleanType, StringType
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer,NGram,HashingTF,IDF
from pyspark.sql.functions import concat,col
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit,CrossValidator

Connexion Spark

In [2]:
spark = SparkSession \
    .builder \
    .appName("test") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
#Import du fichier CSV:

df = spark.read.option("delimiter", ";").option("header", True).csv("Twitter.csv")
df.count()

43374

In [4]:
dfn=df.dropna()
dfn.count()

43013

In [5]:
df1 = dfn.withColumn("Note", when(dfn.sentiment =="Positive" ,1)
                                                    .when(dfn.sentiment=="Negative" ,0))
df1.show()

+--------+-----------+---------+--------------------+----+
|Tweet ID|     entity|sentiment|       Tweet content|Note|
+--------+-----------+---------+--------------------+----+
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|I am coming to th...|   1|
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|im coming on bord...|   1|
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|im getting into b...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|So I spent a coup...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|2010 So I spent a...|   1|
|    2402|Borderlands| Positive|                 was|   1|
|    2404|Borderlands| Positive|that was the firs...|   1|
|    2404|Borderlands| Positive|this was the firs...|   

In [6]:
#Tokenzed
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
tokenized = tokenizer.transform(df1)
tokenized.show()

+--------+-----------+---------+--------------------+----+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|
+--------+-----------+---------+--------------------+----+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a coup...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spe

In [7]:
#Stop Words
remover = StopWordsRemover()
remover.setInputCol("words")
remover.setOutputCol("Resultat")
df= remover.transform(tokenized)
df.show()

+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|            Resultat|
+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|[coming, borders,...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|[im, coming, bord...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|[im, getting, bor...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|[spent, hour

In [8]:
#Ngram
ngram = NGram(n=2)
ngram.setInputCol("Resultat")
ngram.setOutputCol("Ngram")
ngramDataFrame = ngram.transform(df)
ngramDataFrame.select("Ngram").show()

+--------------------+
|               Ngram|
+--------------------+
|[im getting, gett...|
|[coming borders, ...|
|[im getting, gett...|
|[im coming, comin...|
|[im getting, gett...|
|[im getting, gett...|
|[spent hours, hou...|
|[spent couple, co...|
|[spent hours, hou...|
|[spent hours, hou...|
|[2010 spent, spen...|
|                  []|
|[first borderland...|
|[first borderland...|
|[first borderland...|
|[first borderland...|
|[first real, real...|
|[first borderland...|
|[biggest dissappo...|
|[biggest disappoi...|
+--------------------+
only showing top 20 rows



In [9]:
#Hashing
hashingTF = HashingTF(inputCol="Ngram", outputCol="features")
hashingTF.setNumFeatures(2)

ls=hashingTF.transform(ngramDataFrame)
ls.select("features","Ngram").show(3,truncate=False)

+-------------------+---------------------------------------------------------------+
|features           |Ngram                                                          |
+-------------------+---------------------------------------------------------------+
|(2,[0,1],[1.0,3.0])|[im getting, getting borderlands, borderlands murder, murder ,]|
|(2,[1],[3.0])      |[coming borders, borders kill, kill all,]                      |
|(2,[0,1],[1.0,3.0])|[im getting, getting borderlands, borderlands kill, kill all,] |
+-------------------+---------------------------------------------------------------+
only showing top 3 rows



In [10]:
#IDF
idf = IDF(inputCol="features", outputCol="idf")
idfModel = idf.fit(ls)
rescaledData = idfModel.transform(ls)

rescaledData.select("idf").show()

+--------------------+
|                 idf|
+--------------------+
|(2,[0,1],[0.12567...|
|(2,[1],[0.3713448...|
|(2,[0,1],[0.12567...|
|(2,[0,1],[0.37703...|
|(2,[0,1],[0.25135...|
|(2,[0,1],[0.25135...|
|(2,[0,1],[1.13110...|
|(2,[0,1],[1.50813...|
|(2,[0,1],[0.50271...|
|(2,[0,1],[1.00542...|
|(2,[0,1],[1.13110...|
|           (2,[],[])|
|(2,[0,1],[1.25677...|
|(2,[0,1],[1.00542...|
|(2,[0,1],[1.25677...|
|(2,[0,1],[1.38245...|
|(2,[0,1],[1.50813...|
|(2,[0,1],[0.62838...|
|(2,[0,1],[0.75406...|
|(2,[0,1],[0.50271...|
+--------------------+
only showing top 20 rows



In [24]:
#Split
splits = rescaledData.randomSplit([0.7, 0.3], 24)
pdf1=splits[1]
pdf0=splits[0]
pdf1.count()

12879

In [25]:
#LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,featuresCol='idf',
    labelCol='Note')

# Fit the model
lrModel = lr.fit(pdf1)
lrModel=lrModel.transform(pdf1)

In [26]:
#BinaryClassificationEvaluator
evaluator2 = BinaryClassificationEvaluator(labelCol="Note", rawPredictionCol="idf", metricName='areaUnderROC')
evaluator2.evaluate(lrModel)

0.4482610883029995

In [27]:
#Pipline
pipeline = Pipeline(stages=[tokenizer,remover,ngram, hashingTF, idf , lr])

In [19]:
model = pipeline.fit(df1)

In [29]:
prediction = model.transform(df1)

+--------+-----------+---------+--------------------+----+
|Tweet ID|     entity|sentiment|       Tweet content|Note|
+--------+-----------+---------+--------------------+----+
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|I am coming to th...|   1|
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|im coming on bord...|   1|
|    2401|Borderlands| Positive|im getting on bor...|   1|
|    2401|Borderlands| Positive|im getting into b...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|So I spent a coup...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|So I spent a few ...|   1|
|    2402|Borderlands| Positive|2010 So I spent a...|   1|
|    2402|Borderlands| Positive|                 was|   1|
|    2404|Borderlands| Positive|that was the firs...|   1|
|    2404|Borderlands| Positive|this was the firs...|   