In [None]:

# Mount Google Drive so the folders under /content/drive used below are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 42 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 49.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=02dfb333b36c504caf977780650dc8e98eedd30a0ec86ccc9df08a061a59eb14
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.sql import functions as f

# Create (or reuse) the Spark session shared by every cell of this notebook.
# NOTE(review): only executor memory is configured here; if this runs on a single
# machine, confirm whether spark.driver.memory was the intended setting.
session_builder = SparkSession.builder.appName("ModelTraining")
session_builder = session_builder.config("spark.executor.memory", "12g")
spark = session_builder.getOrCreate()

# pandas display limits for this notebook: show every column, cap the number of
# rows printed and the width of a single cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_colwidth", 150)

# Column layout of the tweet records, expressed as a DDL string.
schema = ", ".join(
    [
        "polarity FLOAT",
        "id LONG",
        "date_time TIMESTAMP",
        "query STRING",
        "user STRING",
        "text STRING",
    ]
)
# Date/time pattern associated with the date_time column.
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"

# Google Drive folders: cleaned input data (Parquet) and the location for trained models.
IN_PATH = "/content/drive/MyDrive/sentiment-140-training-data/CLEAN"
MODEL_PATH = "/content/drive/MyDrive/sentiment-140-training-data/MODEL"

# Reader pre-configured with the explicit schema. The Parquet load below goes through
# spark.read directly, so this reader is not used by the cells shown in this notebook;
# it is kept so that any code relying on the name keeps working.
spark_reader = spark.read.schema(schema)

SAMPLE_FRACTION = 0.1  # share of the cleaned rows kept for training/evaluation
SAMPLE_SEED = 2020     # fixed seed so Restart & Run All selects the same rows

# Load the pre-cleaned tweets and normalise the text column.
df_clean = (
    spark.read.parquet(IN_PATH)
    # Replace every character that is not a letter or an apostrophe
    # (digits, punctuation, ...) with a space
    .withColumn("text", f.regexp_replace(f.col("text"), "[^a-zA-Z']", " "))
    # Collapse runs of spaces into a single space
    .withColumn("text", f.regexp_replace(f.col("text"), " +", " "))
    # Remove leading and trailing whitespace
    .withColumn("text", f.trim(f.col("text")))
    # Squeeze a letter repeated three or more times down to two ("soooo" -> "soo").
    # The pattern must be a raw string: in a normal string Python turns "\1" into the
    # control character U+0001, so the backreference never reaches the regex engine
    # and the step silently does nothing. "$1" is the group reference in the replacement.
    .withColumn("text", f.regexp_replace(f.col("text"), r"([a-zA-Z])\1{2,}", "$1$1"))
    # Ensure we don't end up with empty rows
    .filter("text != ''")
)

# Keep only the model inputs, drop rows with missing values, and take a seeded sample.
# Only this final frame is cached, because it is the one every later cell reads.
data = (
    df_clean.select("text", "polarity")
    .na.drop()
    .sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .coalesce(3)
    .cache()
)

In [None]:
# Number of rows in the sampled dataset (this runs a Spark job).
n_rows = data.count()
print(n_rows)

159283


In [None]:
# Class balance check: how many rows carry each polarity value.
polarity_counts = data.groupBy("polarity").count()
polarity_counts.show()

+--------+-----+
|polarity|count|
+--------+-----+
|     0.0|79525|
|     4.0|79758|
+--------+-----+



In [None]:
#data.select("text").collect()

In [None]:
# 70% training / 20% validation / 10% test, with a fixed seed for the split.
split_weights = [0.7, 0.2, 0.1]
training_data, validation_data, test_data = data.randomSplit(split_weights, seed=2020)

In [None]:

from pyspark.ml.feature import (
    StopWordsRemover,
    Tokenizer,
    HashingTF,
    IDF,
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Text-classification pipeline:
# tokenize -> drop stop words -> term frequency -> IDF -> logistic regression.

# Stage 1: lowercase the text and split it on whitespace.
tokenizer = Tokenizer(inputCol="text", outputCol="words1")

# Stage 2: filter the default English stop words out of the token list.
english_stop_words = StopWordsRemover.loadDefaultStopWords("english")
stopwords_remover = StopWordsRemover(
    inputCol="words1", outputCol="words2", stopWords=english_stop_words
)

# Stage 3: map the remaining terms to term frequencies using the hashing trick.
hashing_tf = HashingTF(inputCol="words2", outputCol="term_frequency")

# Stage 4: compute the inverse document frequency over the collection;
# minDocFreq=5 removes sparse terms.
idf = IDF(inputCol="term_frequency", outputCol="features", minDocFreq=5)

# Stage 5: logistic regression on the TF-IDF features, predicting polarity.
# NOTE(review): polarity takes the values 0.0 and 4.0 in the class-balance output of
# this notebook -- confirm the classifier handles these labels as intended.
lr = LogisticRegression(labelCol="polarity", featuresCol="features")

pipeline_stages = [tokenizer, stopwords_remover, hashing_tf, idf, lr]
semantic_analysis_pipeline = Pipeline(stages=pipeline_stages)

# Fit all stages on the training split only.
semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

In [None]:
# Show the fitted pipeline model's repr as the cell output.
semantic_analysis_model

PipelineModel_a42a37816142

In [None]:
%%time
trained_df = semantic_analysis_model.transform(training_data)
val_df = semantic_analysis_model.transform(validation_data)
test_df = semantic_analysis_model.transform(test_data)

#trained_df.show()
#val_df.show()
#test_df.show()

CPU times: user 196 ms, sys: 38.9 ms, total: 235 ms
Wall time: 1.23 s


In [None]:
# StopWordsRemover.loadDefaultStopWords("english")

In [None]:
%%time
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the fitted pipeline on the held-out validation and test splits.
evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="accuracy")
accuracy_val, accuracy_test = (evaluator.evaluate(scored) for scored in (val_df, test_df))

for heading, accuracy in (("Validation Data:", accuracy_val), ("Testing Data:", accuracy_test)):
    print(heading)
    print(f"Accuracy: {accuracy*100:.5f}%")

Validation Data:
Accuracy: 75.07027%
Testing Data:
Accuracy: 75.34869%
CPU times: user 95.1 ms, sys: 4.72 ms, total: 99.9 ms
Wall time: 12.4 s


In [None]:
#final_model = semantic_analysis_pipeline.fit(data)
# Persist the fitted pipeline inside the MODEL folder.
# Plain concatenation (MODEL_PATH + "NEW_MODEL") dropped the path separator and wrote to
# ".../MODELNEW_MODEL"; anything that loads the model must use the corrected path below.
# overwrite() lets the notebook be re-run: save() alone raises if the path already exists.
model_dir = f"{MODEL_PATH}/NEW_MODEL"
semantic_analysis_model.write().overwrite().save(model_dir)

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that use
# `spark` or its DataFrames need the session re-created before they can run again.
spark.stop()