### Twitter Project - Analysis on Sampled Data

#### 1. Read the sampled data.
#### 2. Select the features that are of importance.
#### 3. Save the filtered data in a new parquet file.

In [1]:
# Sanity check: show the Python interpreter version and the Spark version of
# the kernel-provided `spark` session.
import sys

for version_string in (sys.version, spark.version):
    print(version_string)

3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:46:39) 
[GCC 10.4.0]
3.1.3


In [2]:
import time
import pyspark

#### Tuning Spark: restarting the context with a larger `spark.driver.maxResultSize`

In [3]:
# Restart Spark with a larger spark.driver.maxResultSize.
# The running context is stopped, a new SparkContext is created with the new
# setting, and `spark` is rebound to a session backed by that new context.

# Imported here so the cell does not depend on the kernel having pre-imported it.
from pyspark.sql import SparkSession

sc = spark.sparkContext
# Public getConf() instead of the private `_conf`; the default keeps the string
# concatenation from failing with a TypeError when the key was never set.
print('Original spark.driver.maxResultSize: ' + sc.getConf().get('spark.driver.maxResultSize', 'not set'))

# Stop existing Spark environment
sc.stop()

# Waiting for the environment to stop
sleep_time = 10
print(f'Waiting for {sleep_time} seconds for the enviroment to stop...')
time.sleep(sleep_time)

# Applying new configuration and restarting Spark
conf = pyspark.SparkConf().setAll([('spark.driver.maxResultSize', '8g')])
sc = pyspark.SparkContext(conf=conf)

print('New spark.driver.maxResultSize: ' + sc.getConf().get('spark.driver.maxResultSize', 'not set'))

# Starting Spark session with configs applied.
# `builder` is a class-level attribute, so no throw-away SparkSession(sc)
# instance is needed; getOrCreate() attaches to the context created above.
spark = SparkSession.builder.getOrCreate()

Original spark.driver.maxResultSize: 1920m
Waiting for 10 seconds for the enviroment to stop...


23/02/28 20:15:06 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/02/28 20:15:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/02/28 20:15:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/02/28 20:15:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


New spark.driver.maxResultSize: 8g


In [4]:
# Imports used by the analysis cells below.
import os
import shutil
import warnings  # was missing: warnings.filterwarnings() below needs it in scope
from itertools import compress

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: the wildcard imports shadow several Python builtins (e.g. sum, max, min,
# round) with their Spark column-function counterparts. Later cells rely on the
# unqualified names (lower, col, regexp_replace, to_timestamp), so they are kept.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Display configuration: never truncate long text columns, default row limit.
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')

# Silence library warnings in the rendered notebook output.
warnings.filterwarnings(action='ignore')
# import sh

In [5]:
# !pip uninstall -y nltk
# !pip install nltk --upgrade --no-cache-dir
# %pip install nltk -U

In [6]:
import nltk
# nltk.download('popular', halt_on_error=False)

In [7]:
# Text-processing / Spark ML imports, plus the NLTK stopword corpus.
import re
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import CountVectorizer,  IDF, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
import nltk
# Fetches the 'stopwords' corpus so that nltk.corpus.stopwords can be used.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
# Session-level Spark SQL options:
#   - eager evaluation, so a DataFrame left as the last expression of a cell is rendered as a table
#   - the legacy time parser policy, to use the legacy casting notation for dates
session_options = (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
)
for option_name, option_value in session_options:
    spark.conf.set(option_name, option_value)

In [10]:
# Load the sampled tweets from the shared bucket.
sample_path = 'gs://msca-bdp-students-bucket/shared_data/saikrishnaj/twitter_sample'
filtered_df = spark.read.parquet(sample_path)

23/02/28 20:15:54 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [12]:
# Preview the first 10 rows of the loaded sample.
filtered_df.limit(10)

                                                                                

coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,possibly_sensitive,quote_count,quoted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_text,reply_count,retweet_count,retweeted,retweeted_from,retweeted_status,source,text,timestamp_ms,truncated,tweet_text,user,withheld_in_countries
,Thu Aug 11 11:29:...,,"{[], null, [], []...",,,0,False,low,,1557690755761815553,1557690755761815553,,,,,,False,en,,,0,,,,,,0,0,RT,EdinUniAFAF,"{null, Thu Aug 11...","<a href=""https://...",RT @EdinUniAFAF: ...,1660217391325,False,"""Athena Swan has ...","{false, Mon Sep 2...",
,Wed May 25 02:30:...,,"{[], null, [], []...",,,0,False,low,,1529288780707815424,1529288780707815424,,,,,,False,en,,,0,,,,,,0,0,RT,ErieNotEerie,"{null, Tue May 24...","<a href=""http://t...",RT @ErieNotEerie:...,1653445832789,False,The 14 children k...,"{false, Mon Nov 2...",
,Sat Sep 17 18:41:...,"[13, 140]","{[], null, [], [{...",,"{[13, 208], {[], ...",0,False,low,,1571207619343171586,1571207619343171586,MTN_College,1.571194535186731e+18,1.571194535186731e+18,1.4592288921625723e+18,1.4592288921625723e+18,False,en,,,0,,,,,,0,0,,,,"<a href=""http://t...",@MTN_College I ha...,1663440062734,True,@MTN_College I ha...,"{false, Wed Nov 1...",
,Wed May 25 12:34:...,,"{[], null, [], []...",,,0,False,low,,1529440878854561792,1529440878854561792,,,,,,False,en,,,0,,,,,,0,0,RT,KareemRifai,"{null, Tue May 24...","<a href=""http://t...",RT @KareemRifai: ...,1653482095813,False,Black people tryi...,"{false, Sat Aug 0...",
,Thu Aug 25 11:47:...,"[14, 74]","{[], null, [], []...",,,0,False,low,,1562768536899973120,1562768536899973120,olumuyiwaayo,1.5627680089046057e+18,1.5627680089046057e+18,330097232.0,330097232.0,False,en,,,0,,,,,,0,0,,,,"<a href=""http://t...",@olumuyiwaayo Con...,1661428028689,False,@olumuyiwaayo Con...,"{false, Fri Nov 2...",
,Wed May 25 23:27:...,"[51, 140]","{[], null, [], [{...",,"{[51, 272], {[], ...",0,False,low,,1529604981309022209,1529604981309022209,AmyAtrebas,1.5296044427729838e+18,1.5296044427729838e+18,8.009380056827987e+17,8.009380056827987e+17,False,en,,,0,,,,,,0,0,,Jyates5 @DrewKhri...,,"<a href=""http://t...",@AmyAtrebas @Jyat...,1653521220887,True,@AmyAtrebas @Jyat...,"{false, Wed Sep 2...",
,Sat May 07 04:19:...,,"{[], null, [], []...",,,0,False,low,,1522793260065906695,1522793260065906695,,,,,,False,en,,,0,,,,,,0,0,RT,mattia_n,"{null, Fri May 06...","<a href=""http://t...",RT @mattia_n: Ukr...,1651897179987,False,Ukrainians are in...,"{false, Wed Aug 1...",
,Wed May 25 22:44:...,,"{[], null, [], []...",,,0,False,low,,1529594374916558853,1529594374916558853,,,,,,False,en,,,0,,,,,,0,0,RT,EnglishTeach07,"{null, Tue May 24...","<a href=""http://t...",RT @EnglishTeach0...,1653518692126,False,You can’t be prol...,"{false, Wed Jan 1...",
,Sun Jan 29 17:53:...,,"{[], null, [], []...",,,0,False,low,,1619755573230325760,1619755573230325760,,,,,,True,en,,,0,"{null, Sun Jan 29...",1.6197428980562e+18,1.6197428980562e+18,{twitter.com/gal_...,The leader of #Am...,0,0,RT,SaudiaPaige,"{null, Sun Jan 29...","<a href=""http://t...",RT @SaudiaPaige: ...,1675014796918,False,It is their way o...,"{false, Thu Jan 1...",
,Wed May 25 11:45:...,,"{[], null, [], []...",,,0,False,low,,1529428531062751233,1529428531062751233,,,,,,False,en,,,0,,,,,,0,0,RT,JP_1U,"{null, Tue May 24...","<a href=""http://t...",RT @JP_1U: More k...,1653479151870,False,More kids have di...,"{false, Thu May 2...",


### Selecting the K-12/Education related tweets

In [44]:
## Lower-case the tweet text and strip a fixed set of symbols ($ # , & % " .) into `stripped_text`.
## Note: only the '#' character of a hashtag is removed (the tag word stays), and '@' is not
## part of the character class, so mentions are left untouched.

sampled_df = (
    filtered_df
    .withColumn('tweet_text', lower('tweet_text'))
    # Raw string: identical pattern value, but without the invalid "\$" escape
    # sequence that a normal Python string literal warns about.
    .withColumn('stripped_text', regexp_replace(col('tweet_text'), r'[\$#,&%".]', ''))
)

In [48]:
## Dictionary for words similar to 'education/K-12'

# A tweet is kept when its text matches at least one of these keywords ...
# NOTE(review): 'mentorin...' looks like a truncated entry; as a regex its dots
# match any three characters. Left unchanged - confirm the intended keyword.
edu_keywords = ['digital learning', 'mathematics', 'campus', 'graduation', 'MOOC', 'learning', 'synchronous learning',
                'intellectual', 'study', 'literacy', 'certificate', 'library', 'academy', 'scholarship', 'asynchronous learning',
                'history', 'teaching', 'school', 'academician', 'instructor', 'academic', 'pedagogy', 'mentoring',
                'online education', 'degree', 'educationist', 'assignment', 'learning community', 'textbook', 'tutorial',
                'blended learning', 'college', 'educator', 'syllabus', 'diploma', 'cognitive', 'curriculum', 'e-learning',
                'student', 'lecture', 'coursework', 'virtual classroom', 'teacher', 'tuition', 'lesson', 'assessment',
                'distance learning', 'k12', 'educational technology', 'educational', 'university', 'grade', 'knowledge',
                'research', 'exam', 'classroom', 'mentorin...', 'online learning', 'professor', 'homework', 'educating', 'enrollment']

# ... and none of these. Matching is by substring, so e.g. 'gun' also matches 'begun'.
removal_words = ['guns', 'fashion', 'gaming', 'makeup', 'shooting', 'sports', 'business', 'gun', 'kill', 'food',
                 'news', 'travel', 'killed', 'murder', 'uvalde', 'health', 'shoot', 'deceased', 'movie', 'politics', 'beauty', 'horny',
                 'shootings', 'gunned', 'fitness', 'music', 'shopping', 'attack']

# Build one alternation pattern per list. The lists defined above are used here
# (the previous code referenced the undefined names `keywords` / `filter_words`).
# tweet_text is lower-cased, so the terms are lower-cased as well - otherwise an
# upper-case entry such as 'MOOC' could never match.
regex_edu = '|'.join('(' + term.lower() + ')' for term in edu_keywords)

regex_removal = '|'.join('(' + term.lower() + ')' for term in removal_words)

# Keep rows that match an education keyword and do not match any removal word.
sampled_filtered = (
    sampled_df
    .where(sampled_df['tweet_text'].rlike(regex_edu))
    .where(~sampled_df['tweet_text'].rlike(regex_removal))
)

sampled_filtered.count()

                                                                                

8331

In [53]:
# Checking the words which had the most effect on filtering the tweet_text
from pyspark.sql import functions as F

# Count, for every keyword, the rows whose tweet_text contains it.
# All counts are computed in ONE aggregation instead of one Spark job per
# keyword, and the predicate is a Column expression rather than a SQL string
# assembled by concatenation (no quoting / LIKE-wildcard surprises).
# Keywords are lower-cased because tweet_text is lower-cased.
tweet_text_col = F.col('tweet_text')
keyword_hits = sampled_filtered.agg(*[
    # coalesce: sum() yields NULL when there are no non-null inputs; report 0 instead.
    F.coalesce(F.sum(tweet_text_col.contains(keyword.lower()).cast('int')), F.lit(0)).alias(f'kw_{idx}')
    for idx, keyword in enumerate(edu_keywords)
]).first()

# Same result shape as before: a list of [word, count] pairs, largest count first.
res = [[keyword, keyword_hits[idx]] for idx, keyword in enumerate(edu_keywords)]
res = sorted(res, key=lambda pair: pair[1], reverse=True)
res[:20]

                                                                                

[['school', 5482],
 ['college', 1529],
 ['university', 1158],
 ['student', 908],
 ['teacher', 431],
 ['professor', 425],
 ['history', 122],
 ['grade', 120],
 ['exam', 112],
 ['teaching', 106],
 ['study', 104],
 ['learning', 99],
 ['degree', 95],
 ['campus', 90],
 ['classroom', 69],
 ['research', 66],
 ['academic', 64],
 ['scholarship', 55],
 ['graduation', 48],
 ['tuition', 46]]

In [61]:
# Show the row count before and after the keyword filtering, in that order.
count_queries = (
    (sampled_df, 'count(*) as before_filtering'),
    (sampled_filtered, 'count(*) as after_filtering'),
)
for frame, count_expr in count_queries:
    frame.selectExpr(count_expr).show()

+----------------+
|before_filtering|
+----------------+
|           10000|
+----------------+





+---------------+
|after_filtering|
+---------------+
|           8331|
+---------------+



                                                                                

### Filtering the features - Feature Reduction

#### Screening the columns for missing data, i.e. the fraction of null values in each column

In [65]:
from pyspark.sql import functions as F

# For every column: (number of rows where the column is NULL) / (total number of rows).
total_rows = F.count(F.lit(1))
null_fraction_exprs = []
for column_name in sampled_filtered.schema.names:
    # when() without otherwise() yields NULL for non-matching rows, and count() skips NULLs,
    # so this counts exactly the rows where the column is NULL.
    null_rows = F.count(F.when(F.col(column_name).isNull(), column_name))
    null_fraction_exprs.append((null_rows / total_rows).alias(column_name))

sampled_filtered.select(null_fraction_exprs).limit(1)


                                                                                

coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,possibly_sensitive,quote_count,quoted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_text,reply_count,retweet_count,retweeted,retweeted_from,retweeted_status,source,text,timestamp_ms,truncated,tweet_text,user,withheld_in_countries,stripped_text
0.9995198655623576,0.0,0.7919817548913696,0.0,0.9073340535349896,0.7935421918137079,0.0,0.0,0.0,0.9995198655623576,0.0,0.0,0.8228303925099028,0.8273916696675069,0.8273916696675069,0.8228303925099028,0.8228303925099028,0.0,0.0,0.9899171768095066,0.7774576881526828,0.0,0.9140559356619854,0.9140559356619854,0.9140559356619854,0.9140559356619854,0.9140559356619854,0.0,0.0,0.0,0.3182090985475933,0.3917897011163125,0.0,0.0,0.0,0.0,0.0,0.0,0.999639899171768,0.0


#### Selecting only the columns which do not have too much missing data in them

In [71]:
# Fields kept from each part of the tweet record.
# Top-level tweet fields (aliased with a 'tweet_' prefix).
tweet_cols = [
    "coordinates", "created_at", "id_str", "lang", "possibly_sensitive",
    "retweeted_status", "tweet_text", "timestamp_ms", "quoted_status", "text",
]

# Fields of the nested `user` struct (aliased with a 'user_' prefix).
user_cols = [
    "created_at", "description", "favourites_count", "followers_count",
    "friends_count", "id_str", "name", "protected", "screen_name",
    "statuses_count", "verified", "withheld_in_countries", "location",
]

# Fields of the nested `entities` struct (name kept as is).
ent_cols = ["hashtags"]

# Fields of the nested `retweeted_status` struct (name kept as is).
retweet_cols = ["retweet_count", "favorite_count", "reply_count", "quote_count"]

quoted_cols = ["quote_count"]

# Build the projection in the output column order: user, tweet, entities, retweeted_status.
selected_columns = []
for field in user_cols:
    selected_columns.append(col('user.' + field).alias('user_' + field))
for field in tweet_cols:
    selected_columns.append(col(field).alias('tweet_' + field))
for field in ent_cols:
    selected_columns.append(col('entities.' + field).alias(field))
for field in retweet_cols:
    selected_columns.append(col('retweeted_status.' + field).alias(field))

df = sampled_filtered.select(selected_columns)

# Convert both created_at string columns to timestamps using the same pattern.
created_at_format = 'EEE MMM dd HH:mm:ss zzzzz yyyy'
for created_at_col in ('user_created_at', 'tweet_created_at'):
    df = df.withColumn(created_at_col, to_timestamp(col(created_at_col), created_at_format))

df.limit(10)

                                                                                

user_created_at,user_description,user_favourites_count,user_followers_count,user_friends_count,user_id_str,user_name,user_protected,user_screen_name,user_statuses_count,user_verified,user_withheld_in_countries,user_location,tweet_coordinates,tweet_created_at,tweet_id_str,tweet_lang,tweet_possibly_sensitive,tweet_retweeted_status,tweet_tweet_text,tweet_timestamp_ms,tweet_quoted_status,tweet_text,hashtags,retweet_count,favorite_count,reply_count,quote_count
2010-11-17 08:57:35,I’m a fan of a so...,48776,358,2152,216631784,Will Dickerson,False,wbdickerson3,22318,False,[],"Atlanta, GA",,2022-09-17 18:41:02,1571207619343171586,en,,,@mtn_college i ha...,1663440062734,,@MTN_College I ha...,[],,,,
2012-08-04 18:48:30,,86274,817,709,737224435,tri tri,False,Trinalauren,59655,False,[],"Gravesend, Brooklyn",,2022-05-25 12:34:55,1529440878854561792,en,,"{null, Tue May 24...",black people tryi...,1653482095813,,RT @KareemRifai: ...,[],41912.0,185368.0,261.0,807.0
2019-11-22 15:19:47,Bsc Finance|Fashi...,955,87,144,1197897410997051392,Funke Bamgbose,False,funke_olubunmi,581,False,[],,,2022-08-25 11:47:08,1562768536899973120,en,,,@olumuyiwaayo con...,1661428028689,,@olumuyiwaayo Con...,[],,,,
2017-09-27 00:44:56,"Native Texan, tac...",70097,2064,2777,912840443473342465,LiberalGatorNTX ?...,False,LiberalGatorNTX,7586,False,[],"Frisco, TX",,2022-05-25 23:27:00,1529604981309022209,en,,,@amyatrebas @jyat...,1653521220887,,@AmyAtrebas @Jyat...,[],,,,
2011-08-10 10:48:18,Vigilante Etymolo...,78947,1052,2284,352252700,Neal Shepperson.?...,False,NealShepperson,117912,False,[],Tigguo Cobauc,,2022-05-07 04:19:39,1522793260065906695,en,,"{null, Fri May 06...",ukrainians are in...,1651897179987,,RT @mattia_n: Ukr...,[],4252.0,27871.0,234.0,429.0
2017-01-19 00:16:10,#RESIST #KEEPKRAT...,212765,11205,11541,821873849176358912,Kathy Huffman☮️💙,False,KathyHoldshope,354261,False,[],,,2023-01-29 17:53:16,1619755573230325760,en,,"{null, Sun Jan 29...",it is their way o...,1675014796918,"{null, Sun Jan 29...",RT @SaudiaPaige: ...,[],8.0,10.0,0.0,0.0
2011-05-26 18:37:57,European Academic...,173,3398,4850,305741660,DrAnnaNotaro,False,notanna1,217645,False,[],Dundee (Scotland),,2022-05-25 11:45:51,1529428531062751233,en,,"{null, Tue May 24...",more kids have di...,1653479151870,,RT @JP_1U: More k...,[],2651.0,13675.0,27.0,50.0
2018-02-01 18:56:52,Talkin Truth is a...,3458,202,618,959138497536946176,Talkin Trut[h],False,TaIkintruth,2756,False,[],"Chicago, IL",,2022-05-25 16:56:11,1529506625421180928,en,,"{null, Tue May 24...",texas doesn't hav...,1653497771016,,RT @BettyBowers: ...,[],16450.0,58202.0,438.0,510.0
2022-03-04 06:17:29,NFT TREY AKA T M...,51,9,36,1499630046100738049,NFT TREY,False,NFTTREY_AMGANG,8,False,[],,,2022-04-07 01:51:32,1511884346797527041,en,,"{null, Wed Mar 02...",freaks university...,1649296292386,,RT @nftfreaks: Fr...,[],895.0,1357.0,975.0,58.0
2022-09-12 18:04:52,,894,91,870,1569386536164016133,ElectricChairman,False,MondoMegaman,628,False,[],,,2022-09-20 11:50:37,1572191497029844995,en,,"{null, Mon Sep 19...",our campaign agai...,1663674637454,,RT @realchrisrufo...,[],946.0,4009.0,126.0,80.0


In [75]:
# Report how many columns exist before and after the feature selection.
n_features_before = len(sampled_filtered.columns)
n_features_after = len(df.columns)
print(f"Features before removing the columns: {n_features_before}")
print(f"Features after removing the columns: {n_features_after}")

Features before removing the columns: 40
Features after removing the columns: 28


## Save the filtered and feature selected data

In [77]:
# Write both results as parquet, overwriting any existing output:
# first the keyword-filtered rows (all columns), then the feature-reduced frame.
parquet_outputs = [
    (sampled_filtered, 'gs://msca-bdp-students-bucket/shared_data/saikrishnaj/sampled_filtered_data'),
    (df, 'gs://msca-bdp-students-bucket/shared_data/saikrishnaj/sampled_feature_reduced_data'),
]
for frame, destination in parquet_outputs:
    frame.write.format("parquet").mode('overwrite').save(destination)

                                                                                

### Now apply the same methods of filtering and feature reduction on the entire dataset and take a sample out of the filtered data for analysis