<a href="https://colab.research.google.com/github/RajaSuhashKesari/MyDataEngineeringPractices/blob/main/Pyspark%20Programs/Top_N_words_from_the_Text_Story.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.master('local[*]').appName("Top N words").getOrCreate()

In [8]:
text_data = spark.read.text("/content/words")

In [9]:
text_data.show()

+--------------------+
|               value|
+--------------------+
|The Echoes of the...|
|Once, in a forgot...|
|                    |
|It was said the f...|
|                    |
|But among the few...|
|                    |
|One evening, as t...|
|                    |
|Her grandfather h...|
|                    |
|With those words ...|
|                    |
|The forest greete...|
|                    |
|As she walked, sh...|
|                    |
|Hours passed, tho...|
|                    |
|Isla shook her he...|
+--------------------+
only showing top 20 rows



Remove nulls

In [17]:
dropped_empty_df = text_data.filter(col('value') != '')
dropped_empty_df.show()

+--------------------+
|               value|
+--------------------+
|The Echoes of the...|
|Once, in a forgot...|
|It was said the f...|
|But among the few...|
|One evening, as t...|
|Her grandfather h...|
|With those words ...|
|The forest greete...|
|As she walked, sh...|
|Hours passed, tho...|
|Isla shook her he...|
|Just as she was a...|
|"Come closer, chi...|
|Her heart skipped...|
|"Come closer." Th...|
|She pushed throug...|
|At the base of th...|
|She knelt beside ...|
|"You are the chos...|
|Isla’s heart race...|
+--------------------+
only showing top 20 rows



In [18]:
words_df = dropped_empty_df.select(explode(split(col('value'),' ')).alias("words"))
words_df.show()

+---------+
|    words|
+---------+
|      The|
|   Echoes|
|       of|
|      the|
|   Forest|
|    Once,|
|       in|
|        a|
|forgotten|
|   corner|
|       of|
|      the|
|   world,|
|    there|
|      was|
|        a|
|   forest|
|    known|
|       as|
|      the|
+---------+
only showing top 20 rows



## **Function to remove fullstop and commas**

In [45]:
def fullstop_and_comma_remover_convert_to_lower(word):
  if word is None:
    return None
  if '.' in word : return word[:-1].lower()
  if ',' in word : return word[:-1].lower()
  if word.endswith('"') : return word[:-1].lower()
  if word.startswith('"') : return word[1:].lower()
  else : return word.lower()

In [46]:
from pyspark.sql.types import StringType
udf_fcrfsctl = udf(fullstop_and_comma_remover_convert_to_lower,StringType())

In [47]:
cleaned_df_words = words_df.withColumn("words",udf_fcrfsctl(col("words")))
cleaned_df_words.show()

+---------+
|    words|
+---------+
|      the|
|   echoes|
|       of|
|      the|
|   forest|
|     once|
|       in|
|        a|
|forgotten|
|   corner|
|       of|
|      the|
|    world|
|    there|
|      was|
|        a|
|   forest|
|    known|
|       as|
|      the|
+---------+
only showing top 20 rows



In [48]:
counts_df = cleaned_df_words.groupBy('words').agg(count(col('words')).alias("counts"))
counts_df.show()

+-------------+------+
|        words|counts|
+-------------+------+
|        those|     4|
|          few|     1|
|      embrace|     1|
|        still|     1|
|        inner|     1|
|    connected|     2|
|    recognize|     1|
|        often|     1|
|     slightly|     1|
|     vibrated|     1|
|   surrounded|     1|
|        thick|     3|
|         you.|     1|
|      verdant|     3|
|        among|     2|
|        sense|     2|
|    insistent|     1|
|         pine|     1|
|        ahead|     1|
|reflection—an|     1|
+-------------+------+
only showing top 20 rows



In [59]:
sorted_df = counts_df.sort(desc(col('counts')))
sorted_df.cache()
sorted_df.show()

+---------+------+
|    words|counts|
+---------+------+
|      the|   139|
|       of|    53|
|      and|    42|
|      she|    38|
|        a|    38|
|      her|    38|
|       to|    37|
|      was|    26|
|   forest|    23|
|       it|    21|
|       in|    19|
|      its|    18|
|      had|    15|
|     that|    14|
|     with|    13|
|     isla|    12|
|       as|    12|
|    trees|    10|
|something|    10|
|   seemed|     9|
+---------+------+
only showing top 20 rows



In [60]:
top_n = 13
sorted_df.limit(top_n).show()

+------+------+
| words|counts|
+------+------+
|   the|   139|
|    of|    53|
|   and|    42|
|   she|    38|
|     a|    38|
|   her|    38|
|    to|    37|
|   was|    26|
|forest|    23|
|    it|    21|
|    in|    19|
|   its|    18|
|   had|    15|
+------+------+

