## Part 1
#### 1. Read the data
#### 2. Create a sample of 10k observations.
#### 3. Save the sample as a Parquet file and use it to explore filtering opportunities.

In [1]:
# Ensure we are using the right kernel: print the Python and Spark versions.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably it is
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
import sys
print(sys.version)
print(spark.version)

3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:46:39) 
[GCC 10.4.0]
3.1.3


In [2]:
import time
import pyspark

### Tuning Spark: raise the driver result-size limit (`spark.driver.maxResultSize`)

In [3]:
# Restart Spark with a larger spark.driver.maxResultSize: stop the running
# context, build a new one from an updated SparkConf, then re-create the session.
from pyspark.sql import SparkSession  # imported here so the cell does not depend on the kernel pre-defining it

RESULT_SIZE_KEY = 'spark.driver.maxResultSize'

sc = spark.sparkContext
# getConf() is the public accessor (instead of the private sc._conf); the default
# value avoids a TypeError from str + None when the key has never been set.
print(f"Original {RESULT_SIZE_KEY}: {sc.getConf().get(RESULT_SIZE_KEY, 'not set')}")

# Stop existing Spark environment
sc.stop()

# Waiting for the environment to stop
sleep_time = 10
print(f'Waiting for {sleep_time} seconds for the enviroment to stop...')
time.sleep(sleep_time)

# Applying new configuration and restarting Spark
conf = pyspark.SparkConf().setAll([(RESULT_SIZE_KEY, '8g')])
sc = pyspark.SparkContext(conf=conf)

print(f"New {RESULT_SIZE_KEY}: {sc.getConf().get(RESULT_SIZE_KEY, 'not set')}")

# Starting Spark session on top of the context created above.
# `builder` is reached through the class; no throwaway SparkSession(sc) is constructed first.
spark = SparkSession.builder.getOrCreate()

Original spark.driver.maxResultSize: 1920m
Waiting for 10 seconds for the enviroment to stop...


23/02/28 18:27:18 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/02/28 18:27:18 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/02/28 18:27:18 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/02/28 18:27:18 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


New spark.driver.maxResultSize: 8g


In [4]:
# Notebook-wide imports and display settings.
import warnings  # imported explicitly: warnings.filterwarnings() is called at the end of this cell
import pandas as pd
import numpy as np
# Show full cell text and restore the default row limit for pandas output
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')
from itertools import compress 
# NOTE(review): the wildcard imports below put every public name of these modules
# into the notebook namespace (PySpark functions such as sum/min/max share names
# with Python builtins). Left as-is because later cells may rely on the bare names.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import seaborn as sns
import matplotlib.pyplot as plt
# Silence all Python warnings for the rest of the notebook
warnings.filterwarnings(action='ignore')
import os
import shutil
# import sh

In [5]:
# !pip uninstall -y nltk
# !pip install nltk --upgrade --no-cache-dir
# %pip install nltk -U

In [6]:
import nltk
# nltk.download('popular', halt_on_error=False)

In [7]:
import re
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import CountVectorizer,  IDF, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Display Spark DataFrames as formatted tables

In [8]:
# Turn on REPL eager evaluation so a Spark DataFrame left as the last expression
# of a cell is rendered as a table instead of its schema summary.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [9]:
# Truncate displayed pandas cell text at 100 characters.
# The full option name is used rather than the abbreviated "max_colwidth" so the
# lookup cannot become ambiguous. This overrides the earlier
# pd.set_option('display.max_colwidth', None) from the imports cell.
pd.set_option("display.max_colwidth", 100)

In [10]:
from google.cloud import storage

In [11]:
# Print all files under a given GCS prefix
def list_blobs(bucket_name, folder_name):
    """Print one tab-separated "name, size" line for each blob under a prefix.

    Args:
        bucket_name: name of the bucket to list.
        folder_name: prefix passed to the bucket listing.
    """
    client = storage.Client()
    matching = list(client.bucket(bucket_name).list_blobs(prefix=folder_name))

    for entry in matching:
        print('\t'.join((entry.name, str(entry.size))))

In [12]:
# List all files under a given GCS prefix as a pandas DataFrame
def list_blobs_pd(bucket_name, folder_name):
    """Return the blobs found under a prefix as a DataFrame.

    Args:
        bucket_name: name of the bucket to list.
        folder_name: prefix passed to the bucket listing.

    Returns:
        pandas.DataFrame with one row per blob and the columns ``Name``
        (blob.name) and ``Size`` (blob.size). The frame is empty, but keeps
        both columns, when nothing matches.
    """
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)

    records = [(blob.name, blob.size) for blob in bucket.list_blobs(prefix=folder_name)]

    # The previous version called blobs_df.style.format(...) and discarded the
    # resulting Styler, so it never affected the returned frame; that no-op is
    # removed. Callers who want thousands separators can apply
    # .style.format({"Size": "{:,.0f}"}) to the returned DataFrame.
    return pd.DataFrame(records, columns=['Name', 'Size'])

In [13]:
# Delete everything under a given GCS prefix
def delete_folder(bucket_name, folder_name):
    """Delete each blob returned by listing ``bucket_name`` with prefix ``folder_name``.

    The listing is collected in full before the first delete is issued.

    Args:
        bucket_name: name of the bucket to delete from.
        folder_name: prefix passed to the bucket listing.
    """
    target_bucket = storage.Client().bucket(bucket_name)
    doomed = list(target_bucket.list_blobs(prefix=folder_name))

    for item in doomed:
        item.delete()

In [14]:
# Source bucket: open course data, available to all students
bucket_read = "msca-bdp-tweets"

# Destination bucket for results; students must change this to their own bucket
bucket_write = "msca-bdp-data-shared"

In [15]:
# Show the name and size of every object under the 'final_project' prefix of the read bucket
list_blobs_pd(bucket_read, 'final_project')

Unnamed: 0,Name,Size
0,final_project/,0
1,final_project/_SUCCESS,0
2,final_project/part-00000-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,4500466
3,final_project/part-00001-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,4107431
4,final_project/part-00002-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,4672123
...,...,...
50692,final_project/part-50690-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,11562361
50693,final_project/part-50691-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,9132693
50694,final_project/part-50692-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,15376390
50695,final_project/part-50693-aa6d3cb4-7022-4df2-9921-218307589ce2-c000.json,8586044


#### Read data and check the schema

In [18]:
%time
path = 'gs://msca-bdp-tweets/final_project'
twitter_raw = spark.read.json(path)

23/02/28 18:58:19 WARN org.apache.spark.sql.execution.datasources.SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
23/02/28 19:04:47 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [20]:
# Target number of rows for the exploration sample
SAMPLE_SIZE = 10000

# Get the total number of rows in the DataFrame
total_rows = twitter_raw.count()
if total_rows == 0:
    # Fail with a clear message instead of a ZeroDivisionError below
    raise ValueError('twitter_raw is empty; nothing to sample')

# Calculate the fraction of rows to sample. Clamped to 1.0 because sampling
# without replacement needs a fraction in [0, 1], which SAMPLE_SIZE / total_rows
# would exceed whenever the input has fewer than SAMPLE_SIZE rows.
fraction = min(1.0, SAMPLE_SIZE / total_rows)

# Sample the DataFrame using the fraction. sample() does not promise an exact
# row count, so limit() caps the result at SAMPLE_SIZE; the sample can still
# come out slightly smaller than SAMPLE_SIZE.
sampled_df = (
    twitter_raw
    .sample(withReplacement=False, fraction=fraction, seed=13)
    .limit(SAMPLE_SIZE)
)



                                                                                

In [21]:
# Preview 10 rows of the sample (displayed as the cell's last expression)
sampled_df.limit(10)

                                                                                

coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,possibly_sensitive,quote_count,quoted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_text,reply_count,retweet_count,retweeted,retweeted_from,retweeted_status,source,text,timestamp_ms,truncated,tweet_text,user,withheld_in_countries
,Thu Dec 08 16:15:...,,"{[], null, [], []...",,,0,False,low,,1600886905293656064,1600886905293656064,,,,,,False,en,,,0,,,,,,0,0,RT,DoreenNasaasira,"{null, Wed Dec 07...","<a href=""http://t...",RT @DoreenNasaasi...,1670516155919,False,I'm a graduate bt...,"{false, Fri Oct 2...",
,Wed May 25 04:45:...,,"{[], null, [], []...",,,0,False,low,,1529322821880381440,1529322821880381440,,,,,,False,en,,,0,,,,,,0,0,RT,AndrewPollackFL,"{null, Wed May 25...","<a href=""http://t...",RT @AndrewPollack...,1653453948837,False,-Armed guard -Sin...,"{false, Sat Apr 3...",
,Tue Jul 05 16:07:...,,"{[], null, [], []...",,,0,False,low,,1544352398239404033,1544352398239404033,,,,,,False,en,,,0,,,,,,0,0,RT,joncoopertweets,"{null, Tue Jul 05...","<a href=""http://t...",RT @joncoopertwee...,1657037279077,False,ICYMI: Ron DeSant...,"{false, Wed Apr 1...",
,Wed May 25 03:31:...,,"{[], null, [], []...",,,0,False,low,,1529304220259823616,1529304220259823616,,,,,,False,en,,,0,,,,,,0,0,RT,KareemRifai,"{null, Tue May 24...","<a href=""http://t...",RT @KareemRifai: ...,1653449513865,False,Black people tryi...,"{false, Fri Dec 2...",
,Mon Aug 29 00:19:...,,"{[], null, [], []...",,,0,False,low,,1564045067249192960,1564045067249192960,,,,,,False,en,,,0,,,,,,0,0,RT,Booker4KY,"{null, Thu Aug 25...","<a href=""http://t...",RT @Booker4KY: Wh...,1661732377241,False,When Mitch McConn...,"{false, Wed Mar 1...",
,Mon Apr 18 17:33:...,,"{[{[126, 130], py...",,,0,False,low,,1516107699573837824,1516107699573837824,,,,,,False,en,,,0,,,,,,0,0,RT,BGThePlug,"{null, Sun Apr 10...","<a href=""http://t...",RT @BGThePlug: Eb...,1650303218154,False,Ebony thot sent t...,"{false, Sun Sep 1...",
,Wed May 25 13:18:...,,"{[], null, [], [{...",,"{[0, 190], {[], n...",0,False,low,,1529451728797839362,1529451728797839362,,,,,,False,en,,,0,,,,,,0,0,,,,"<a href=""http://t...",To those who didn...,1653484682641,True,To those who didn...,"{false, Thu Apr 1...",
,Wed May 25 14:01:...,,"{[], null, [], []...",,,0,False,low,,1529462723771351042,1529462723771351042,,,,,,False,en,,,0,,,,,,0,0,RT,chasestrangio,"{null, Wed May 25...","<a href=""http://t...",RT @chasestrangio...,1653487304047,False,“Protecting” kids...,"{false, Sun Aug 0...",
,Sun May 29 13:25:...,,"{[{[18, 29], Janu...",,,0,False,low,,1530903116920590336,1530903116920590336,,,,,,True,en,,,0,"{null, Fri May 27...",1.5302248138091848e+18,1.5302248138091848e+18,{twitter.com/reps...,The baby formula ...,0,0,RT,kurtbardella,"{null, Sun May 29...","<a href=""http://t...",RT @kurtbardella:...,1653830720535,False,#January6th NEVER...,"{false, Fri Aug 3...",
,Mon Aug 01 05:20:...,,"{[], null, [], []...",,,0,False,low,,1553974019673755649,1553974019673755649,,,,,,False,en,,,0,,,,,,0,0,,,,"<a href=""http://t...",I’m almost at my ...,1659331252388,False,I’m almost at my ...,"{false, Sun Oct 0...",


In [22]:
# Persist the sample as Parquet, replacing whatever already exists at the target path
(
    sampled_df.write
    .mode('overwrite')
    .format("parquet")
    .save('gs://msca-bdp-students-bucket/shared_data/saikrishnaj/twitter_sample')
)

                                                                                

In [23]:
# List the files at the output path to confirm the Parquet write
!hadoop fs -ls 'gs://msca-bdp-students-bucket/shared_data/saikrishnaj/twitter_sample'

Found 2 items
-rwx------   3 root root          0 2023-02-28 20:13 gs://msca-bdp-students-bucket/shared_data/saikrishnaj/twitter_sample/_SUCCESS
-rwx------   3 root root   17102044 2023-02-28 20:13 gs://msca-bdp-students-bucket/shared_data/saikrishnaj/twitter_sample/part-00000-e3cc3432-8e6e-4e65-b49d-c79c10a55cb8-c000.snappy.parquet
