In [1]:
# pip install pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import lower, split, explode, regexp_replace, length
from pyspark.sql.functions import count, desc
import time

In [2]:
# Set Spark Configuration.
# Memory sizes are Spark *properties*, so they must be registered with .set().
# .setExecutorEnv() only exports environment variables to executor processes,
# which would leave both memory settings at their defaults.
conf = SparkConf() \
    .setMaster("local[*]") \
    .setAppName("WordFrequency") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory", "4g")


# Create a SparkSession.
# NOTE: getOrCreate() hands back an already-running session when one exists,
# so restart the kernel for changes to the configuration above to take effect.
spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()

23/05/24 19:32:27 WARN Utils: Your hostname, Tanmays-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.31.18.59 instead (on interface en0)
23/05/24 19:32:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/24 19:32:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/24 19:32:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Start the wall-clock timer for this cell.
start_time = time.time()

# Lowercase English stopwords that are excluded from the word-frequency count.
stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
]

# Tunables for this run.
DATA_PATH = "/Users/tanmaysingla/Downloads/dataset_updated/data_16GB.txt"
MIN_WORD_LENGTH = 6  # keep only words strictly longer than this; use 0 to count all words

# Load the dataset into a DataFrame (spark.read.text yields one row per line in column "value")
data_df = spark.read.text(DATA_PATH)

# Split each line into individual words and convert to lowercase
words_df = data_df.select(explode(split(lower("value"), " ")).alias("word"))

# Remove non-alphanumeric characters from every token.
stripped_words_df = words_df.withColumn("word", regexp_replace("word", "[^a-zA-Z0-9]", ""))

# Filter on the *cleaned* column. Referencing words_df["word"] here would test
# the raw token instead, so punctuation-only tokens (which become "" after
# cleaning) and stopwords with attached punctuation (e.g. "the,") would slip
# through the empty-string and stopword checks.
clean_words_df = stripped_words_df.filter(
    (stripped_words_df["word"] != "")
    & (~stripped_words_df["word"].isin(stopwords))
    & (length(stripped_words_df["word"]) > MIN_WORD_LENGTH)
)

# Count the frequency of each word
word_counts_df = clean_words_df.groupBy("word").agg(count("*").alias("count"))

# Sort the word counts in descending order
sorted_word_counts_df = word_counts_df.orderBy(desc("count"))

# Take the top 100 words
top_100_words_df = sorted_word_counts_df.limit(100)

# Show the top 100 words and their frequencies (this action triggers the actual computation)
top_100_words_df.show(100, truncate=False)

# Elapsed wall-clock time in seconds since start_time was recorded at the top of the cell
algo_time = time.time() - start_time
print(algo_time)



+-------------+-------+
|word         |count  |
+-------------+-------+
|government   |2520220|
|million      |2254837|
|president    |2188344|
|percent      |2022484|
|company      |1665814|
|including    |1594883|
|another      |1536906|
|national     |1415822|
|country      |1295479|
|according    |1281615|
|business     |1275143|
|officials    |1195385|
|american     |1129214|
|international|1116045|
|financial    |1097723|
|support      |1080820|
|children     |1072798|
|however      |1072381|
|security     |1058058|
|billion      |1048928|
|without      |1039686|
|political    |1018911|
|european     |1018491|
|whether      |1014211|
|yearold      |981085 |
|tuesday      |969122 |
|already      |960625 |
|minister     |942494 |
|information  |927617 |
|wednesday    |920376 |
|reported     |917578 |
|expected     |916792 |
|economic     |910614 |
|thursday     |910326 |
|something    |896498 |
|military     |893358 |
|several      |877379 |
|companies    |869823 |
|washington   |8

                                                                                

In [None]:
# Elapsed time in seconds on the 2.5GB dataset, words with length > 6: 54.72753381729126
# Elapsed time in seconds on the 2.5GB dataset, all words: 51.162511110305786

In [None]:
# Elapsed time in seconds on the 16GB dataset, words with length > 6: 301.3295159339905
# Elapsed time in seconds on the 16GB dataset, all words: 296.8359487056732