# Instalamos PySpark

In [None]:
!pip install pyspark -q

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for pyspark (setup.py) ... done


Creamos la sesión de Spark y descargamos los datos:

In [None]:
# Creamos la sesion de Spark

from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

sc = spark.sparkContext

!wget https://storage.googleapis.com/humai-datasets/datasets/Tweets.csv

In [None]:
import pandas as pd

# Load the full CSV with pandas, then keep only the sentiment label and the
# tweet body before handing the data over to Spark.
all_columns = pd.read_csv('Tweets.csv')
df = all_columns[['airline_sentiment', 'text']]

raw_df = spark.createDataFrame(df)

# Print the first rows as a quick sanity check of the loaded data.
raw_df.show()

+-----------------+--------------------+
|airline_sentiment|                text|
+-----------------+--------------------+
|          neutral|@VirginAmerica Wh...|
|         positive|@VirginAmerica pl...|
|          neutral|@VirginAmerica I ...|
|         negative|@VirginAmerica it...|
|         negative|@VirginAmerica an...|
|         negative|@VirginAmerica se...|
|         positive|@VirginAmerica ye...|
|          neutral|@VirginAmerica Re...|
|         positive|@virginamerica We...|
|         positive|@VirginAmerica it...|
|          neutral|@VirginAmerica di...|
|         positive|@VirginAmerica I ...|
|         positive|@VirginAmerica Th...|
|         positive|@VirginAmerica @v...|
|         positive|@VirginAmerica Th...|
|         negative|@VirginAmerica SF...|
|         positive|@VirginAmerica So...|
|         negative|@VirginAmerica  I...|
|         positive|I ❤️ flying @Virg...|
|         positive|@VirginAmerica yo...|
+-----------------+--------------------+
only showing top 20 rows

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

# Text featurization: split each tweet into tokens, drop stop words, then
# hash the remaining tokens into a fixed-size term-frequency vector.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="words_filtered")
# NOTE(review): 20 hash buckets is a very small feature space, so many
# different words end up sharing a bucket -- confirm this value is intended.
hashingTF = HashingTF(inputCol="words_filtered", outputCol="rawFeatures", numFeatures=20)

# Rescale the raw term frequencies with inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Encode the string sentiment label as a numeric index for the classifier.
label_indexer = StringIndexer(inputCol='airline_sentiment', outputCol='airline_sentiment_label')

# Decision tree trained on the TF-IDF features against the indexed label.
dt = DecisionTreeClassifier(labelCol="airline_sentiment_label", featuresCol="features")

# Chain the feature transformations, the label indexer and the classifier
# into a single Pipeline. The label indexer sits at position 4 of the stages.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, label_indexer, dt])


# 70/30 train/test split. The explicit seed keeps the split (and therefore
# the reported metrics) stable across re-runs of the notebook.
(train_data, test_data) = raw_df.randomSplit([0.7, 0.3], seed=42)

# Fit every stage of the pipeline on the training split.
model = pipeline.fit(train_data)


In [None]:
# Map each numeric label index back to its sentiment name, using the labels
# exposed by the fitted pipeline stage at position 4.
fitted_label_stage = model.stages[4]
label_map = {index: name for index, name in enumerate(fitted_label_stage.labels)}
print(f"Labels de cada sentiment: {label_map}")

Labels de cada sentiment: {0: 'negative', 1: 'neutral', 2: 'positive'}


In [None]:
# Run the fitted pipeline over the held-out split. The result is cached
# because it is consumed by several separate Spark actions; without the cache
# each action would re-run every pipeline stage on the test data.
predictions = model.transform(test_data).cache()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing the indexed ground-truth label with the model's
# prediction, reporting precision for one class at a time.
evaluator = MulticlassClassificationEvaluator(
    metricName="precisionByLabel",
    predictionCol="prediction",
    labelCol="airline_sentiment_label",
)

# Report the precision of every class, selecting the class through a
# per-call parameter override.
for label_index in label_map:
    label_name = label_map[label_index]
    per_label_params = {
        evaluator.metricLabel: float(label_index),
        evaluator.metricName: "precisionByLabel",
    }
    score = evaluator.evaluate(predictions, per_label_params)
    print(f"Score para la clase {label_name} = {round(score, 3)}")

Score para la clase negative = 0.629
Score para la clase neutral = 0.0
Score para la clase positive = 0.0


In [None]:
# Collect only the two columns needed for the comparison, and do it once:
# converting the whole predictions frame to pandas twice would run the Spark
# job twice and also pull the feature-vector columns to the driver.
label_columns = predictions.select('airline_sentiment_label', 'prediction').toPandas()

# Class distribution of the true labels.
print("Labels reales (ground truth)")
print(label_columns['airline_sentiment_label'].value_counts())
print("")

# Class distribution of the model's predictions.
print("Labels predichas")
print(label_columns['prediction'].value_counts())
print("")

Labels reales (ground truth)
0.0    2750
1.0     943
2.0     682
Name: airline_sentiment_label, dtype: int64

Labels predichas
0.0    4375
Name: prediction, dtype: int64

