# Twitter Sentiment Analysis Big Data Project on AWS using PySpark

# 1. Mount the Storage

In [0]:
def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
  ACCESS_KEY_ID = access_key
  SECRET_ACCESS_KEY = secret_key
  ENCODED_SECRET_KEY = SECRET_ACCESS_KEY.replace("/", "%2F")

  print ("Mounting", bucket_name)

  try:
    # Unmount the data in case it was already mounted.
    dbutils.fs.unmount("/mnt/%s" % mount_folder)
    
  except:
    # If it fails to unmount it most likely wasn't mounted in the first place
    print ("Directory not unmounted: ", mount_folder)
    
  finally:
    # Lastly, mount our bucket.
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY_ID, ENCODED_SECRET_KEY, bucket_name), "/mnt/%s" % mount_folder)
    #dbutils.fs.mount("s3a://"+ ACCESS_KEY_ID + ":" + ENCODED_SECRET_KEY + "@" + bucket_name, mount_folder)
    print ("The bucket", bucket_name, "was mounted to", mount_folder, "\n")

In [0]:
# Set AWS programmatic access credentials
ACCESS_KEY = "AKIA6JFAL64VYFJQ2JMY"
SECRET_ACCESS_KEY = "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"

In [0]:
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'weclouddata/twitter','topics')

Mounting weclouddata/twitter
Directory not unmounted:  topics
The bucket weclouddata/twitter was mounted to topics 



In [0]:
%fs ls /mnt/topics

path,name,size,modificationTime
dbfs:/mnt/topics/AI/,AI/,0,0
dbfs:/mnt/topics/BankofCanada/,BankofCanada/,0,0
dbfs:/mnt/topics/BlackFriday/,BlackFriday/,0,0
dbfs:/mnt/topics/CERB/,CERB/,0,0
dbfs:/mnt/topics/CSIS/,CSIS/,0,0
dbfs:/mnt/topics/CanadaHousing/,CanadaHousing/,0,0
dbfs:/mnt/topics/ElonMusk/,ElonMusk/,0,0
dbfs:/mnt/topics/Flames/,Flames/,0,0
dbfs:/mnt/topics/Inflation/,Inflation/,0,0
dbfs:/mnt/topics/Interest_rate/,Interest_rate/,0,0


In [0]:
path = '/mnt/topics/ElonMusk/*/*/*/*/*'

## 2. Initiate a Spark session

In [0]:
from pyspark.sql import SparkSession
spark = (SparkSession
         .builder
         .appName('Twitter Big Data Sentiment/Topic Modelling')
         .getOrCreate()
        )
print('Session created')
sc = spark.sparkContext

Session created


## 3. Create a Schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType
schema = StructType([
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('screen_name', StringType(), True),
    StructField('tweet', StringType(), True),
    StructField('followers_count', StringType(), True),
    StructField('location', StringType(), True),
    StructField('geo', StringType(), True),
    StructField('created_at', StringType(), True)
])

### 4. Read the Data into the Schema

In [0]:
df = (spark
     .read
     .option('header', 'false')
     .option('delimiter','\t')
     .schema(schema)
     .csv(path))

In [0]:
display(df.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at
1594755513945755648,J.E. Dyer ☘️,OptimisticCon,"Keep in mind, ""Trump banned on Twitter!"" is an integral element of the ""J6"" narrative that Trump fomented insurrect… https://t.co/ov7iRF3YSn",9016,,,Mon Nov 21 18:11:58 +0000 2022
1594755515485143052,Javier Perdomo,Javierperdomo,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",469,,,Mon Nov 21 18:11:58 +0000 2022
1594755517574164481,Casey Reilley,caseyreilley,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022
1594755519868043264,Val Ornelas,_surfcowgirl,"RT @elizableu: I’d like to make something else clear, I don’t work for Twitter, Elon Musk, any government, political party, group etc. I ru…",195,Merica,,Mon Nov 21 18:11:59 +0000 2022
1594755519981436961,Name Can't be Blank,adrenaline1073,RT @disclosetv: JUST IN - Elon Musk has reinstated Rep. Marjorie Taylor Greene's (R-GA) personal Twitter account.,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022


In [0]:
#Mount your own buckets
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'bootcamp-shadman/big_data_project/','my_bucket')

Mounting bootcamp-shadman/big_data_project/
Directory not unmounted:  my_bucket
The bucket bootcamp-shadman/big_data_project/ was mounted to my_bucket 



In [0]:
df

Out[146]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: string, location: string, geo: string, created_at: string]

In [0]:
df.columns

Out[148]: ['id',
 'name',
 'screen_name',
 'tweet',
 'followers_count',
 'location',
 'geo',
 'created_at']

##### Checking for Null Values & drop the corresponding null rows

In [0]:
df.filter(col('tweet').isNull()).count()

Out[149]: 204

In [0]:
df2 = df.dropna()

In [0]:
df2.filter(col('tweet').isNull()).count()

Out[151]: 0

### 5. Data Labelling

In [0]:
# %pip install textblob

In [0]:
from textblob import TextBlob

def get_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    if sentiment > 0:
        return 'positive'
    elif sentiment < 0:
        return 'negative'
    else:
        return 'neutral'

In [0]:
sentiment_detection_udf = udf(get_sentiment, StringType())

In [0]:
#words = words.withColumn("polarity", polarity_detection_udf("word"))
df2 = df2.withColumn("sentiment", sentiment_detection_udf("tweet"))

In [0]:
display(df2.take(20))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment
1594755513945755648,J.E. Dyer ☘️,OptimisticCon,"Keep in mind, ""Trump banned on Twitter!"" is an integral element of the ""J6"" narrative that Trump fomented insurrect… https://t.co/ov7iRF3YSn",9016,,,Mon Nov 21 18:11:58 +0000 2022,neutral
1594755515485143052,Javier Perdomo,Javierperdomo,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",469,,,Mon Nov 21 18:11:58 +0000 2022,neutral
1594755517574164481,Casey Reilley,caseyreilley,"RT @MattGertz: Elon Musk interacting with sycophantic right-wing influencers this weekend, a thread.",180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755519868043264,Val Ornelas,_surfcowgirl,"RT @elizableu: I’d like to make something else clear, I don’t work for Twitter, Elon Musk, any government, political party, group etc. I ru…",195,Merica,,Mon Nov 21 18:11:59 +0000 2022,positive
1594755519981436961,Name Can't be Blank,adrenaline1073,RT @disclosetv: JUST IN - Elon Musk has reinstated Rep. Marjorie Taylor Greene's (R-GA) personal Twitter account.,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755520526778368,The Original Johnboy 🇺🇸uLtRA MAgA🇺🇸 #WPS,johnboy02131989,RT @BehizyTweets: BREAKING: Elon Musk just reinstated Marjorie Taylor Greene's account https://t.co/4hwc46k6Yx,203,,,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755520790937611,⚡️Gideon Henry⚡️🇺🇸🍊,GideonHenry,RT @w_terrence: Elon Musk should purchase the rights to the “ THE VIEW “on ABC. I would love to see the look on their faces and watch them…,3684,America,,Mon Nov 21 18:11:59 +0000 2022,positive
1594755521197785106,Leora Smoot,MomOfTwins57,"RT @DashDobrofsky: CNN's Jake Tapper asked Hakeem Jeffries what his ""reaction"" was to Donald Trump being reinstated on Twitter by Elon Musk…",772,,,Mon Nov 21 18:12:00 +0000 2022,neutral
1594755522191917056,Howard Lovy,Howard_Lovy,RT @JTAnews: Elon Musk bantered with Kanye West and trolled the Anti-Defamation League this weekend. https://t.co/zvKrrAOP5C,8820,"Traverse City, MI",,Mon Nov 21 18:12:00 +0000 2022,neutral
1594755524024746055,CNM MERCOSUL,CNMMERCOSUL,"RT @TradutordoBR: Jair Bolsonaro: ""Elon Musk, here they call me a myth, I don't know why, but you really are the myth of our freedom."" 🇧🇷🇺🇸…",1598,America do Sul,,Mon Nov 21 18:12:00 +0000 2022,positive


In [0]:
df2.count()

Out[157]: 328212

In [0]:
df2.select('sentiment').distinct().collect()

Out[160]: [Row(sentiment='positive'),
 Row(sentiment='neutral'),
 Row(sentiment='negative')]

In [0]:
df2.cache()

Out[161]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: string, location: string, geo: string, created_at: string, sentiment: string]

### 6. Tweets Cleaning and Pre-processing

In [0]:
df_clean = df2.withColumn('tweet', F.regexp_replace('tweet', r"http\S+", "")) \
                    .withColumn('tweet', F.regexp_replace('tweet', '@\w+', '')) \
                    .withColumn('tweet', F.regexp_replace('tweet', '#', '')) \
                    .withColumn('tweet', F.regexp_replace('tweet', '#', '')) \
                    .withColumn('tweet', F.regexp_replace('tweet', 'RT', '')) \
                    .withColumn('tweet', F.regexp_replace('tweet', ':', '')) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"[^a-zA-z]", " ")) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"\s+", " ")) \
                    .withColumn('tweet', F.lower('tweet')) \
                    .withColumn('tweet', F.trim('tweet')) 
df_clean.cache()
display(df_clean.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment
1594755513945755648,J.E. Dyer ☘️,OptimisticCon,keep in mind trump banned on twitter is an integral element of the j narrative that trump fomented insurrect,9016,,,Mon Nov 21 18:11:58 +0000 2022,neutral
1594755515485143052,Javier Perdomo,Javierperdomo,elon musk interacting with sycophantic right wing influencers this weekend a thread,469,,,Mon Nov 21 18:11:58 +0000 2022,neutral
1594755517574164481,Casey Reilley,caseyreilley,elon musk interacting with sycophantic right wing influencers this weekend a thread,180,"Hawaii, USA",,Mon Nov 21 18:11:59 +0000 2022,neutral
1594755519868043264,Val Ornelas,_surfcowgirl,i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195,Merica,,Mon Nov 21 18:11:59 +0000 2022,positive
1594755519981436961,Name Can't be Blank,adrenaline1073,just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915,"North Coast, Ohio USA",,Mon Nov 21 18:11:59 +0000 2022,neutral


In [0]:
type(df_clean)

Out[191]: pyspark.sql.dataframe.DataFrame

In [0]:
df_clean.count()

Out[164]: 328212

#### Dropping the duplicate rows/tweets i.e Retweets

In [0]:
from pyspark.sql.functions import col
df_clean = df_clean.withColumn('tweet',col('tweet').cast('string'))
df_clean.drop_duplicates(subset=['tweet']).count()

Out[192]: 70623

In [0]:
df_clean_wd = df_clean.drop_duplicates(subset=['tweet'])
df_clean_wd.count()

Out[193]: 70623

In [0]:
display(df_clean_wd.take(10))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral
1594758024136359936,Marker Animations,MarkerAnimation,alright folks time for a story when my account has been nuked people have been tricked that there s elon musk around a,657,Pillary Ruins,,Mon Nov 21 18:21:56 +0000 2022,neutral
1594828408692908039,Daniel Rubin,DanielYRubin,also recruiting for the content moderation council one can only assume,731,Queens via Toronto,,Mon Nov 21 23:01:37 +0000 2022,neutral
1594870566183653377,VINnews,VINNews,although his twitter account has been reinstated by elon musk president trump may face a big obstacle preventing h,14784,New York,,Tue Nov 22 01:49:08 +0000 2022,neutral
1595060838427156481,Bill Haverland,wjhaverland,and have him run apple into the ground too i still think you re a troll bot,16,"Raleigh, NC",,Tue Nov 22 14:25:13 +0000 2022,neutral
1595169963836379140,Cam30 🇺🇸 🙏 🇺🇸,pccote66,as all the hateful radical left s heads explode marjorietaylorgreene elon musk s twitter reinstates rep marjor,2609,USA 🇺🇸,,Tue Nov 22 21:38:50 +0000 2022,neutral


In [0]:
df_clean_wd.count()

Out[196]: 70623

In [0]:
df_clean_wd.cache()

Out[194]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: string, location: string, geo: string, created_at: string, sentiment: string]

In [0]:
df_clean_wd.columns

Out[197]: ['id',
 'name',
 'screen_name',
 'tweet',
 'followers_count',
 'location',
 'geo',
 'created_at',
 'sentiment']

#### Feature Transformer: Tokenizer

In [0]:
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
tweets_tokenized = tokenizer.transform(df_clean_wd)

display(tweets_tokenized.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)"
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)"
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)"
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,"List(agreed, let, s, get, out, the, popcorn)"
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)"


#### Feature Transformer: StopWord Removal

In [0]:
#now remove stopwords from the review(list of words)    
from pyspark.ml.feature import StopWordsRemover

stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tweets_stopword = stopword_remover.transform(tweets_tokenized)

display(tweets_stopword.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens,filtered
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)"
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)"
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)"
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)"
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)"


#### Feature Transformer: CountVectorizer (TF - Term Frequency)

In [0]:
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
cv_model = cv.fit(tweets_stopword)
tweets_cv = cv_model.transform(tweets_stopword)

display(tweets_cv.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens,filtered,cv
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(1.0, 1.0, 1.0, 1.0))"
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))"


#### Feature Transformer: TF-IDF Vectorization

In [0]:
from pyspark.ml.feature import HashingTF, IDF

idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
idf_model = idf.fit(tweets_cv)
tweets_idf = idf_model.transform(tweets_cv)

display(tweets_idf)

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens,filtered,cv,features
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(0.5391502441837212, 0.5480508967229336, 4.361620051448125, 4.449741922721782, 5.616049224161243, 6.482994081932243, 6.492296474594557, 6.632525815903207, 7.006242225696791, 8.680218659268462))"
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.860609362596308, 6.259850530618033, 8.029631093127314))"
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 3.7517579733612227, 3.8249384737363474, 4.82653123085328, 5.312922829281988, 5.881921580318474, 7.293924298148572, 0.0, 0.0))"
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(3.6954711361243344, 4.2806386570136805, 7.380935675138201, 8.862540216062417))"
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(5.193863469266001, 5.705539794912304, 6.305312904694791, 6.611248417455922, 7.427455690773095))"
1594758024136359936,Marker Animations,MarkerAnimation,alright folks time for a story when my account has been nuked people have been tricked that there s elon musk around a,657,Pillary Ruins,,Mon Nov 21 18:21:56 +0000 2022,neutral,"List(alright, folks, time, for, a, story, when, my, account, has, been, nuked, people, have, been, tricked, that, there, s, elon, musk, around, a)","List(alright, folks, time, story, account, nuked, people, tricked, elon, musk, around)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(0.5391502441837212, 0.5480508967229336, 2.9839644510330534, 3.6123632248423156, 3.869390236307181, 5.348014149093259, 5.592971276878698, 6.006070009841934, 7.763927927394308, 8.680218659268462, 0.0))"
1594828408692908039,Daniel Rubin,DanielYRubin,also recruiting for the content moderation council one can only assume,731,Queens via Toronto,,Mon Nov 21 23:01:37 +0000 2022,neutral,"List(also, recruiting, for, the, content, moderation, council, one, can, only, assume)","List(also, recruiting, content, moderation, council, one, assume)","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(3.427073011367147, 4.356086003013483, 4.987181195005863, 5.934016692201876, 6.420193180693213, 7.554207396412238, 7.638764784440301))"
1594870566183653377,VINnews,VINNews,although his twitter account has been reinstated by elon musk president trump may face a big obstacle preventing h,14784,New York,,Tue Nov 22 01:49:08 +0000 2022,neutral,"List(although, his, twitter, account, has, been, reinstated, by, elon, musk, president, trump, may, face, a, big, obstacle, preventing, h)","List(although, twitter, account, reinstated, elon, musk, president, trump, may, face, big, obstacle, preventing, h)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.6123632248423156, 4.779930910058738, 4.759896851025622, 4.857026867546932, 4.956535282959834, 5.243546889412648, 5.902435120151577, 7.581606370600353, 8.767230036258093, 0.0))"
1595060838427156481,Bill Haverland,wjhaverland,and have him run apple into the ground too i still think you re a troll bot,16,"Raleigh, NC",,Tue Nov 22 14:25:13 +0000 2022,neutral,"List(and, have, him, run, apple, into, the, ground, too, i, still, think, you, re, a, troll, bot)","List(run, apple, ground, still, think, re, troll, bot)","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(3.6346453138109265, 3.696612037560126, 4.245441459209053, 4.892248302510295, 5.096699720812353, 6.174692722277727, 6.420193180693213, 6.621830526786459))"
1595169963836379140,Cam30 🇺🇸 🙏 🇺🇸,pccote66,as all the hateful radical left s heads explode marjorietaylorgreene elon musk s twitter reinstates rep marjor,2609,USA 🇺🇸,,Tue Nov 22 21:38:50 +0000 2022,neutral,"List(as, all, the, hateful, radical, left, s, heads, explode, marjorietaylorgreene, elon, musk, s, twitter, reinstates, rep, marjor)","List(hateful, radical, left, heads, explode, marjorietaylorgreene, elon, musk, twitter, reinstates, rep, marjor)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 4.449741922721782, 5.318686533998739, 5.918101236895977, 6.960432689665497, 7.2332996763321376, 7.293924298148572, 8.526067979441205, 0.0, 0.0))"


#### Label Encoder

In [0]:
from pyspark.ml.feature import StringIndexer

label_encoder = StringIndexer(inputCol = "sentiment", outputCol = "label")
le_model = label_encoder.fit(tweets_idf)
tweets_label = le_model.transform(tweets_idf)

display(tweets_label)

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,tokens,filtered,cv,features,label
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(0.5391502441837212, 0.5480508967229336, 4.361620051448125, 4.449741922721782, 5.616049224161243, 6.482994081932243, 6.492296474594557, 6.632525815903207, 7.006242225696791, 8.680218659268462))",0.0
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.860609362596308, 6.259850530618033, 8.029631093127314))",0.0
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 3.7517579733612227, 3.8249384737363474, 4.82653123085328, 5.312922829281988, 5.881921580318474, 7.293924298148572, 0.0, 0.0))",1.0
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(3.6954711361243344, 4.2806386570136805, 7.380935675138201, 8.862540216062417))",0.0
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(5.193863469266001, 5.705539794912304, 6.305312904694791, 6.611248417455922, 7.427455690773095))",0.0
1594758024136359936,Marker Animations,MarkerAnimation,alright folks time for a story when my account has been nuked people have been tricked that there s elon musk around a,657,Pillary Ruins,,Mon Nov 21 18:21:56 +0000 2022,neutral,"List(alright, folks, time, for, a, story, when, my, account, has, been, nuked, people, have, been, tricked, that, there, s, elon, musk, around, a)","List(alright, folks, time, story, account, nuked, people, tricked, elon, musk, around)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(0.5391502441837212, 0.5480508967229336, 2.9839644510330534, 3.6123632248423156, 3.869390236307181, 5.348014149093259, 5.592971276878698, 6.006070009841934, 7.763927927394308, 8.680218659268462, 0.0))",0.0
1594828408692908039,Daniel Rubin,DanielYRubin,also recruiting for the content moderation council one can only assume,731,Queens via Toronto,,Mon Nov 21 23:01:37 +0000 2022,neutral,"List(also, recruiting, for, the, content, moderation, council, one, can, only, assume)","List(also, recruiting, content, moderation, council, one, assume)","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(3.427073011367147, 4.356086003013483, 4.987181195005863, 5.934016692201876, 6.420193180693213, 7.554207396412238, 7.638764784440301))",0.0
1594870566183653377,VINnews,VINNews,although his twitter account has been reinstated by elon musk president trump may face a big obstacle preventing h,14784,New York,,Tue Nov 22 01:49:08 +0000 2022,neutral,"List(although, his, twitter, account, has, been, reinstated, by, elon, musk, president, trump, may, face, a, big, obstacle, preventing, h)","List(although, twitter, account, reinstated, elon, musk, president, trump, may, face, big, obstacle, preventing, h)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.6123632248423156, 4.779930910058738, 4.759896851025622, 4.857026867546932, 4.956535282959834, 5.243546889412648, 5.902435120151577, 7.581606370600353, 8.767230036258093, 0.0))",0.0
1595060838427156481,Bill Haverland,wjhaverland,and have him run apple into the ground too i still think you re a troll bot,16,"Raleigh, NC",,Tue Nov 22 14:25:13 +0000 2022,neutral,"List(and, have, him, run, apple, into, the, ground, too, i, still, think, you, re, a, troll, bot)","List(run, apple, ground, still, think, re, troll, bot)","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(3.6346453138109265, 3.696612037560126, 4.245441459209053, 4.892248302510295, 5.096699720812353, 6.174692722277727, 6.420193180693213, 6.621830526786459))",0.0
1595169963836379140,Cam30 🇺🇸 🙏 🇺🇸,pccote66,as all the hateful radical left s heads explode marjorietaylorgreene elon musk s twitter reinstates rep marjor,2609,USA 🇺🇸,,Tue Nov 22 21:38:50 +0000 2022,neutral,"List(as, all, the, hateful, radical, left, s, heads, explode, marjorietaylorgreene, elon, musk, s, twitter, reinstates, rep, marjor)","List(hateful, radical, left, heads, explode, marjorietaylorgreene, elon, musk, twitter, reinstates, rep, marjor)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 4.449741922721782, 5.318686533998739, 5.918101236895977, 6.960432689665497, 7.2332996763321376, 7.293924298148572, 8.526067979441205, 0.0, 0.0))",0.0


In [0]:
tweets_label.columns

Out[204]: ['id',
 'name',
 'screen_name',
 'tweet',
 'followers_count',
 'location',
 'geo',
 'created_at',
 'sentiment',
 'tokens',
 'filtered',
 'cv',
 'features',
 'label']

## 7. Model Training: Logistic Regression Classifier

In [0]:
# Final DataFrame for Sentiment Analysis
import pyspark.sql.functions as F

tweets_label_ml = tweets_label.select('tweet', 'sentiment', 'tokens', 'filtered', 'cv', 'features', 'label')
display(tweets_label_ml.take(5))

tweet,sentiment,tokens,filtered,cv,features,label
a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(0.5391502441837212, 0.5480508967229336, 4.361620051448125, 4.449741922721782, 5.616049224161243, 6.482994081932243, 6.492296474594557, 6.632525815903207, 7.006242225696791, 8.680218659268462))",0.0
a timeline of the elon musk donald trump twitter saga,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.860609362596308, 6.259850530618033, 8.029631093127314))",0.0
after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 3.7517579733612227, 3.8249384737363474, 4.82653123085328, 5.312922829281988, 5.881921580318474, 7.293924298148572, 0.0, 0.0))",1.0
agreed let s get out the popcorn,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(3.6954711361243344, 4.2806386570136805, 7.380935675138201, 8.862540216062417))",0.0
already on tribel and will consider that home in the future,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(5.193863469266001, 5.705539794912304, 6.305312904694791, 6.611248417455922, 7.427455690773095))",0.0


In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=100)

lr_model = lr.fit(tweets_label_ml)

predictions = lr_model.transform(tweets_label_ml)

display(predictions)

tweet,sentiment,tokens,filtered,cv,features,label,rawPrediction,probability,prediction
a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,neutral,"List(a, i, have, been, thinking, about, this, i, d, miss, the, community, of, teachers, here, but, recent, elon, musk, decisions, have, left, me)","List(thinking, d, miss, community, teachers, recent, elon, musk, decisions, left)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 53, 71, 351, 865, 876, 998, 1433, 5009), values -> List(0.5391502441837212, 0.5480508967229336, 4.361620051448125, 4.449741922721782, 5.616049224161243, 6.482994081932243, 6.492296474594557, 6.632525815903207, 7.006242225696791, 8.680218659268462))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(6.484675999591623, 1.4442868499623307, -7.928962849553954))","Map(vectorType -> dense, length -> 3, values -> List(0.9935698352164646, 0.006429618479282346, 5.463042529914101E-7))",0.0
a timeline of the elon musk donald trump twitter saga,neutral,"List(a, timeline, of, the, elon, musk, donald, trump, twitter, saga)","List(timeline, elon, musk, donald, trump, twitter, saga)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 21, 659, 3305), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.860609362596308, 6.259850530618033, 8.029631093127314))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(4.395944941203777, -0.853811422187606, -3.542133519016171))","Map(vectorType -> dense, length -> 3, values -> List(0.9944255599316034, 0.005219537937978928, 3.5490213041757337E-4))",0.0
after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,positive,"List(after, another, wave, of, layoffs, on, twitter, elon, musk, assured, that, he, will, hire, a, new, batch, of, employees)","List(another, wave, layoffs, twitter, elon, musk, assured, hire, new, batch, employees)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 17, 20, 123, 240, 448, 1823, 10605, 12831), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 3.7517579733612227, 3.8249384737363474, 4.82653123085328, 5.312922829281988, 5.881921580318474, 7.293924298148572, 0.0, 0.0))",1.0,"Map(vectorType -> dense, length -> 3, values -> List(-3.6474952349060428, 3.773038699514242, -0.12554346460819898))","Map(vectorType -> dense, length -> 3, values -> List(5.865875584746385E-4, 0.979557170005906, 0.019856242435619304))",1.0
agreed let s get out the popcorn,neutral,"List(agreed, let, s, get, out, the, popcorn)","List(agreed, let, get, popcorn)","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(12, 47, 1982, 5880), values -> List(3.6954711361243344, 4.2806386570136805, 7.380935675138201, 8.862540216062417))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(8.441351870923373, -3.063485703458451, -5.377866167464921))","Map(vectorType -> dense, length -> 3, values -> List(0.9999889226164665, 1.0081095185914276E-5, 9.96288347405531E-7))",0.0
already on tribel and will consider that home in the future,neutral,"List(already, on, tribel, and, will, consider, that, home, in, the, future)","List(already, tribel, consider, home, future)","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(206, 378, 711, 990, 2053), values -> List(5.193863469266001, 5.705539794912304, 6.305312904694791, 6.611248417455922, 7.427455690773095))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.920453833705979, 0.1921462426666798, -3.112600076372659))","Map(vectorType -> dense, length -> 3, values -> List(0.9365681850255099, 0.06118577553169998, 0.0022460394427902507))",0.0
alright folks time for a story when my account has been nuked people have been tricked that there s elon musk around a,neutral,"List(alright, folks, time, for, a, story, when, my, account, has, been, nuked, people, have, been, tricked, that, there, s, elon, musk, around, a)","List(alright, folks, time, story, account, nuked, people, tricked, elon, musk, around)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 5, 9, 22, 255, 336, 525, 2695, 5255, 10274), values -> List(0.5391502441837212, 0.5480508967229336, 2.9839644510330534, 3.6123632248423156, 3.869390236307181, 5.348014149093259, 5.592971276878698, 6.006070009841934, 7.763927927394308, 8.680218659268462, 0.0))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.2554190524355686, -0.6728549826136969, -1.5825640698218715))","Map(vectorType -> dense, length -> 3, values -> List(0.9302098227753258, 0.049756252870104524, 0.02003392435456957))",0.0
also recruiting for the content moderation council one can only assume,neutral,"List(also, recruiting, for, the, content, moderation, council, one, can, only, assume)","List(also, recruiting, content, moderation, council, one, assume)","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(7, 59, 150, 470, 803, 2276, 2457), values -> List(3.427073011367147, 4.356086003013483, 4.987181195005863, 5.934016692201876, 6.420193180693213, 7.554207396412238, 7.638764784440301))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(1.5970666859277993, 0.6143992695338403, -2.211465955461639))","Map(vectorType -> dense, length -> 3, values -> List(0.7160799938748421, 0.2680368490769862, 0.015883157048171635))",0.0
although his twitter account has been reinstated by elon musk president trump may face a big obstacle preventing h,neutral,"List(although, his, twitter, account, has, been, reinstated, by, elon, musk, president, trump, may, face, a, big, obstacle, preventing, h)","List(although, twitter, account, reinstated, elon, musk, president, trump, may, face, big, obstacle, preventing, h)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 4, 9, 109, 110, 125, 145, 217, 462, 2328, 5595, 15909), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 2.826580429067884, 3.6123632248423156, 4.779930910058738, 4.759896851025622, 4.857026867546932, 4.956535282959834, 5.243546889412648, 5.902435120151577, 7.581606370600353, 8.767230036258093, 0.0))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(3.679054448977981, -1.3498233025924247, -2.3292311463855557))","Map(vectorType -> dense, length -> 3, values -> List(0.9910759046391162, 0.006487734367242794, 0.0024363609936409996))",0.0
and have him run apple into the ground too i still think you re a troll bot,neutral,"List(and, have, him, run, apple, into, the, ground, too, i, still, think, you, re, a, troll, bot)","List(run, apple, ground, still, think, re, troll, bot)","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(10, 11, 45, 121, 180, 616, 791, 1001), values -> List(3.6346453138109265, 3.696612037560126, 4.245441459209053, 4.892248302510295, 5.096699720812353, 6.174692722277727, 6.420193180693213, 6.621830526786459))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(2.608085343556803, -1.1776997198003003, -1.4303856237565027))","Map(vectorType -> dense, length -> 3, values -> List(0.9612469061234653, 0.021811693800856256, 0.016941400075678337))",0.0
as all the hateful radical left s heads explode marjorietaylorgreene elon musk s twitter reinstates rep marjor,neutral,"List(as, all, the, hateful, radical, left, s, heads, explode, marjorietaylorgreene, elon, musk, s, twitter, reinstates, rep, marjor)","List(hateful, radical, left, heads, explode, marjorietaylorgreene, elon, musk, twitter, reinstates, rep, marjor)","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 33920, indices -> List(0, 1, 2, 71, 244, 481, 1360, 1731, 1757, 4817, 10328, 30873), values -> List(0.5391502441837212, 0.5480508967229336, 1.3782839386396553, 4.449741922721782, 5.318686533998739, 5.918101236895977, 6.960432689665497, 7.2332996763321376, 7.293924298148572, 8.526067979441205, 0.0, 0.0))",0.0,"Map(vectorType -> dense, length -> 3, values -> List(5.731876484515512, -6.778940220490224, 1.047063735974714))","Map(vectorType -> dense, length -> 3, values -> List(0.9908464120427851, 3.6528150375108238E-6, 0.00914993514217733))",0.0


## 8. Model Evaluation

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
roc_auc = evaluator.evaluate(predictions)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
roc_auc = evaluator.evaluate(predictions)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(tweets_label_ml.count())
print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))
evaluator.evaluate(predictions)

Accuracy Score: 0.9574
ROC-AUC: 0.9573
Out[210]: 0.9573264953904316

### 8. Saving the predictions and twitter clean dataset to My S3 Bucket

In [0]:
%fs ls /mnt/

path,name,size,modificationTime
dbfs:/mnt/anonymous_telecom/,anonymous_telecom/,0,0
dbfs:/mnt/bikeshare/,bikeshare/,0,0
dbfs:/mnt/cdr/,cdr/,0,0
dbfs:/mnt/movie/,movie/,0,0
dbfs:/mnt/my_bucket/,my_bucket/,0,0
dbfs:/mnt/paysim_fraud/,paysim_fraud/,0,0
dbfs:/mnt/shakespeare/,shakespeare/,0,0
dbfs:/mnt/topics/,topics/,0,0
dbfs:/mnt/twitter/,twitter/,0,0
dbfs:/mnt/wikipedia/,wikipedia/,0,0


In [0]:
(predictions.write
 .parquet('/mnt/my_bucket/twitter_predictions.parquet')
)

In [0]:
dataset = tweets_label.select('id', 'name', 'screen_name', 'tweet', 'followers_count', 'location', 'geo', 'created_at', 'sentiment', 'label')
display(dataset.take(5))

id,name,screen_name,tweet,followers_count,location,geo,created_at,sentiment,label
1594873952895176709,Kenneth C. Davis,kennethcdavis,a i have been thinking about this i d miss the community of teachers here but recent elon musk decisions have left me,7942,NYC,,Tue Nov 22 02:02:36 +0000 2022,neutral,0.0
1595103961673269250,Elon Musk Now,EMuskNow,a timeline of the elon musk donald trump twitter saga,3218,USA,,Tue Nov 22 17:16:34 +0000 2022,neutral,0.0
1595061999838240768,D1SoftBall News,D1softballN,after another wave of layoffs on twitter elon musk assured that he will hire a new batch of employees,19,,,Tue Nov 22 14:29:50 +0000 2022,positive,1.0
1594896718877069312,Brian,bjs53bl,agreed let s get out the popcorn,2920,"Montana, USA",,Tue Nov 22 03:33:04 +0000 2022,neutral,0.0
1595075743121383424,Anita Loch,anita_loch,already on tribel and will consider that home in the future,375,,,Tue Nov 22 15:24:26 +0000 2022,neutral,0.0


In [0]:
(dataset
 .write
 .option('header', 'true')
 .option('delimiter', '\t')
 .csv('/mnt/my_bucket/clean_twitter_dataset.csv'))

In [0]:
display(predictions)

In [0]:
# date_parse(created_at, ‘%a %b %d %H:%I:%s +000 %Y’) as new_date