# Installing pyspark

In [1]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.





Collecting pyspark
  Using cached pyspark-3.4.1-py2.py3-none-any.whl
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Importing the necessary libraries

In [2]:
import pandas as pd
import time
from collections import Counter
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower, trim, col
from pyspark.sql.types import StringType

# Implementing PySpark

## WordCount for the column of description - Top 20 

In [3]:

spark = SparkSession.builder.appName("WordCount").getOrCreate() # Starting a Spark session

netflix_df = spark.read.csv("netflix_titles.csv", header=True, inferSchema=True) # Loading the dataset to work with PySpark

words_df = netflix_df.select("description").withColumn("words", explode(split(col("description"), " "))) # Selecting the 'description' column
word_count_df = words_df.groupBy("words").count().orderBy("count", ascending=False) # Performing the word count

word_count_df.show() # Printing out the results

spark.stop() # End of spark session

+-----+-----+
|words|count|
+-----+-----+
|    a|10049|
|  the| 7629|
|   to| 6297|
|  and| 6205|
|   of| 5223|
|   in| 3638|
|  his| 3310|
| with| 2124|
|  her| 2050|
|  for| 1748|
|   an| 1698|
|   on| 1657|
|their| 1646|
|    A| 1471|
| from| 1180|
|   is| 1093|
|   as| 1068|
|   by|  986|
| this|  952|
|   he|  851|
+-----+-----+
only showing top 20 rows



## For "discription" - all word counts 

In [4]:
spark = SparkSession.builder.appName("NetflixWordCount").getOrCreate()
netflix_df = spark.read.csv("netflix_titles.csv", header=True, inferSchema=True)

words_df = netflix_df.select("description").withColumn("words", explode(split(lower(trim(col("description"))), " ")))

word_count_df = words_df.groupBy("words").count().orderBy("count", ascending=False) #Getting the word counts
word_count_df.show(word_count_df.count(), truncate=False) # printing all word count results

spark.stop()


+-------------------------------+-----+
|words                          |count|
+-------------------------------+-----+
|a                              |11520|
|the                            |7993 |
|to                             |6392 |
|and                            |6215 |
|of                             |5223 |
|in                             |4252 |
|his                            |3320 |
|with                           |2235 |
|her                            |2059 |
|an                             |1969 |
|for                            |1768 |
|on                             |1750 |
|their                          |1654 |
|when                           |1495 |
|this                           |1349 |
|from                           |1271 |
|as                             |1208 |
|is                             |1098 |
|by                             |992  |
|after                          |974  |
|he                             |862  |
|that                           |811  |


## Word count for the column of "Listed_in" - Top 20

In [5]:
spark = SparkSession.builder.appName("Listed_in_WordCount").getOrCreate()
netflix_df = spark.read.csv("netflix_titles.csv", header=True, inferSchema=True)

words_df = netflix_df.select("listed_in").withColumn("words", explode(split(col("listed_in"), " "))) # Splitting the 'description' column into words
word_count_df = words_df.groupBy("words").count().orderBy("count", ascending=False) # Perform word count

word_count_df.show() # Printing out the word count results

spark.stop() # Ending the Spark session

+--------------+-----+
|         words|count|
+--------------+-----+
|            TV| 5225|
| International| 4098|
|        Movies| 3413|
|             &| 2607|
|        Shows,| 2307|
|       Dramas,| 2274|
|       Movies,| 2259|
|     Comedies,| 1427|
|        Action| 1025|
|      Romantic|  986|
|        Dramas|  907|
|    Adventure,|  846|
|      Comedies|  823|
|   Independent|  751|
|        Family|  641|
|      Children|  641|
|     Thrillers|  630|
|         Shows|  601|
|Documentaries,|  485|
|         Crime|  469|
+--------------+-----+
only showing top 20 rows



In [6]:
spark = SparkSession.builder.appName("Listed_in_WordCount").getOrCreate()
netflix_df = spark.read.csv("netflix_titles.csv", header=True, inferSchema=True)

genres_df = netflix_df.select("listed_in").withColumn("genres", explode(split(lower(trim(col("listed_in"))), ",")))
genre_count_df = genres_df.groupBy("genres").count().orderBy("count", ascending=False)

genre_count_df.show(genre_count_df.count(), truncate=False) # Printing the results of all genre count

spark.stop()


+-----------------------------+-----+
|genres                       |count|
+-----------------------------+-----+
| international movies        |2622 |
|dramas                       |1595 |
|comedies                     |1206 |
|action & adventure           |857  |
|documentaries                |827  |
| dramas                      |824  |
|international tv shows       |774  |
| independent movies          |731  |
| tv dramas                   |695  |
| romantic movies             |613  |
|children & family movies     |605  |
| international tv shows      |576  |
| thrillers                   |508  |
| comedies                    |464  |
| tv comedies                 |461  |
|crime tv shows               |398  |
|kids' tv                     |388  |
| music & musicals            |355  |
| romantic tv shows           |338  |
|stand-up comedy              |334  |
|horror movies                |275  |
|british tv shows             |253  |
| sci-fi & fantasy            |230  |
|docuseries 

# Code for word count without using PySpark

In [7]:
netflix_df = pd.read_csv("netflix_titles.csv")

# Converting the combined text into lowercase and then split the data on the whitespaces
combined_text = ' '.join(netflix_df['description']).lower()
words = combined_text.split()

word_counts = Counter(words) # Counting the number of words
word_counts_ordered = dict(sorted(word_counts.items(), key=lambda x: x[1], reverse=True)) # Descending order

for word, count in word_counts_ordered.items(): # Print the word count results in Descending Order
    print(f"{word}: {count}")

a: 11609
the: 8106
to: 6439
and: 6320
of: 5273
in: 4334
his: 3352
with: 2261
her: 2077
an: 1993
for: 1782
on: 1763
their: 1669
when: 1512
this: 1395
from: 1291
as: 1224
is: 1111
by: 1004
after: 993
he: 871
that: 820
who: 807
but: 806
at: 739
young: 717
into: 713
new: 693
–: 606
life: 579
up: 574
they: 540
two: 495
she: 473
family: 454
man: 446
out: 418
woman: 415
must: 397
are: 382
while: 377
world: 372
love: 372
friends: 366
about: 353
him: 345
find: 336
one: 328
documentary: 313
finds: 312
where: 300
three: 292
series: 285
—: 281
be: 275
takes: 274
has: 272
help: 267
group: 252
take: 238
own: 232
them: 231
school: 225
through: 225
it: 223
save: 222
home: 210
lives: 210
all: 208
years: 207
girl: 206
against: 205
high: 205
get: 205
teen: 204
becomes: 201
back: 197
four: 195
between: 191
have: 190
true: 186
over: 184
team: 183
more: 179
mysterious: 177
other: 176
gets: 176
off: 173
can: 173
its: 166
father: 165
down: 164
best: 164
tries: 160
make: 159
falls: 158
become: 157
follows: 155

# Time comparison of word count with and without using PySpark

## Without PySpark

In [8]:
netflix_df_pandas = pd.read_csv("netflix_titles.csv")

start_time_without_pyspark = time.time()  # Starting the timer 

# Converting the combined text into lowercase and then split the data on the whitespaces
combined_text = ' '.join(netflix_df['description']).lower()
words = combined_text.split()

word_counts = Counter(words)

end_time_without_pyspark = time.time() # Ending the timer

## With PySpark

In [9]:
spark = SparkSession.builder.appName("NetflixWordCount").getOrCreate()

In [10]:
netflix_df_spark = spark.read.csv("netflix_titles.csv", header=True, inferSchema=True)# Load the Netflix dataset using PySpark

start_time_with_pyspark = time.time() # Starting the timer

words_df = netflix_df_spark.select("description").withColumn("words", explode(split(lower(trim("description")), " ")))
word_count_df = words_df.groupBy("words").count().orderBy("count", ascending=False) # Getting the word counts

end_time_with_pyspark = time.time() # Ending the timer

## Results

In [11]:
# Calculate and print the execution time for both cases
execution_time_without_pyspark = end_time_without_pyspark - start_time_without_pyspark
execution_time_with_pyspark = end_time_with_pyspark - start_time_with_pyspark

# Printing out the results of the execution time for both the implementation
print("Execution time without PySpark - ", execution_time_without_pyspark)
print("Execution time with PySpark - ", execution_time_with_pyspark)

Execution time without PySpark -  0.05500221252441406
Execution time with PySpark -  0.029001235961914062


In [12]:
spark.stop() # Stop the Spark session

# Word2Vec

## Importing libraries necessary for training the model

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

## Loading the Dataset - Limit 2000 rows

In [16]:
df = pd.read_csv("netflix_titles.csv")
text_data = df['description'].head(2000)  # Limit - 2000 rows

## Training the model

In [17]:
stop_words = set(stopwords.words('english')) # Preprocessing
preprocessed_data = []
for text in text_data:
    filtered_tokens = [token for token in word_tokenize(text.lower()) if token.isalnum() and token not in stop_words]  # Removing stop words and punctuation
    preprocessed_data.append(filtered_tokens)

model = Word2Vec(sentences=preprocessed_data, vector_size=100, window=5, min_count=1)


## Perform similarity search

### For the Word - "Love"

In [18]:
search_query = 'love'
if search_query in model.wv.key_to_index:
    similar_words = model.wv.most_similar(search_query, topn = 10)
    print(f"\nWords similar to '{search_query}' - ")
    for word, similarity in similar_words:
        print(f"{word}  -  {similarity}")
else:
    print(f"\n'{search_query}' is not in the vocabulary.")


Words similar to 'love' - 
new  -  0.7451076507568359
finds  -  0.7321029305458069
back  -  0.7265148758888245
family  -  0.7199164628982544
find  -  0.7049792408943176
man  -  0.7032656073570251
must  -  0.7010520696640015
one  -  0.6989811658859253
woman  -  0.6928442120552063
gets  -  0.6704856157302856


### For the Word - "Animated"

In [19]:
search_query = 'animated'
if search_query in model.wv.key_to_index:
    similar_words = model.wv.most_similar(search_query, topn = 10)
    print(f"\nWords similar to '{search_query}' - ")
    for word, similarity in similar_words:
        print(f"{word}  -  {similarity}")
else:
    print(f"\n'{search_query}' is not in the vocabulary.")


Words similar to 'animated' - 
kancharapalem  -  0.40468135476112366
beautiful  -  0.34732362627983093
posh  -  0.34727218747138977
batch  -  0.3457019627094269
account  -  0.3394588828086853
ku  -  0.3350369930267334
bridgerton  -  0.3281424939632416
known  -  0.3213864862918854
reawakens  -  0.3195810616016388
trafficker  -  0.31587454676628113


### For the Word - "Thriller"

In [20]:
search_query = 'thriller' 
if search_query in model.wv.key_to_index:
    similar_words = model.wv.most_similar(search_query, topn = 10)
    print(f"\nWords similar to '{search_query}' - ")
    for word, similarity in similar_words:
        print(f"{word}  -  {similarity}")
else:
    print(f"\n'{search_query}' is not in the vocabulary.")


Words similar to 'thriller' - 
conduct  -  0.45622217655181885
cares  -  0.3745500445365906
charts  -  0.34995177388191223
fields  -  0.32776308059692383
goh  -  0.32715311646461487
fin  -  0.31821611523628235
signs  -  0.31750214099884033
efsun  -  0.31745707988739014
comical  -  0.3148932158946991
honors  -  0.3126448094844818


### For the Word - "Drama"

In [21]:
search_query = 'drama'
if search_query in model.wv.key_to_index:
    similar_words = model.wv.most_similar(search_query, topn = 10)
    print(f"\nWords similar to '{search_query}' - ")
    for word, similarity in similar_words:
        print(f"{word}  -  {similarity}")
else:
    print(f"\n'{search_query}' is not in the vocabulary.")


Words similar to 'drama' - 
find  -  0.46528568863868713
two  -  0.4210563004016876
one  -  0.4091804027557373
man  -  0.406969279050827
assumes  -  0.39855629205703735
wife  -  0.39368996024131775
unexpected  -  0.3908958435058594
crime  -  0.38952353596687317
ajegunle  -  0.3881613612174988
lawyer  -  0.38752275705337524
