# Analysis of Global Warming Tweets from January 2023
This project is based on global warming tweets posted in January 2023 and stored in Parquet format. I have added an emotion (anger, joy, optimism, or sadness) and a gender at the end of each tweet. Emotion analysis was performed using a pre-trained model from Hugging Face (Twitter-roBERTa-base for Emotion Recognition). This is a roBERTa-base model trained on ~58M tweets and fine-tuned for emotion recognition with the TweetEval benchmark. Each tweet is classified into one of four emotions (joy, optimism, anger, and sadness) with a confidence score. In addition, gender is extracted from the first name of the user account, provided the account has a real first name and the gender can be identified by the Python package gender-guesser.

# Read tweets into a data frame

In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from helper_functions import displayByGroup
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, explode, desc

# check if the Spark session is active. If it is activate, close it

try:
    if spark:
        spark.stop()
except:
    pass    

spark = (SparkSession.builder.appName("Global Warming Tweets Analysis")
        .config("spark.port.maxRetries", "100")
        .config("spark.sql.mapKeyDedupPolicy", "LAST_WIN")  # This configuration allow the duplicate keys in the map data type.
#        .config("spark.driver.memory", "16g")
        .getOrCreate())

# confiture the log level (defaulty is WWARN)
spark.sparkContext.setLogLevel('ERROR')

# read the global warming tweets

df=spark.read.parquet('/opt/shared/globalwarming_202301')

# In class exercise

## Extract mentions of each Twitter account given.

In [2]:
# Inspect the nested structure of the `author` struct column.
df.select(col('author')).printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- description: struct (nullable = true)
 |    |    |    |-- cashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- hashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- mentions: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)

In [3]:
# Inspect the schema of the mentions array nested in the author's profile description entities.
df.select(col('author.entities.description.mentions')).printSchema()

root
 |-- mentions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- end: long (nullable = true)
 |    |    |-- start: long (nullable = true)
 |    |    |-- username: string (nullable = true)



In [9]:
# Build the (username, [mentioned usernames]) frame inside this cell instead of
# reading `df1`: that name is only assigned in cells further down the notebook,
# so on a fresh kernel (Restart & Run All) it does not exist yet at this point.
user_mentions = (
    df.select(
        'author.username',
        col('author.entities.description.mentions.username').alias('mentions'),
    )
    .filter(col('mentions').isNotNull())
)

# One output row per (author, mentioned account) pair.
df2 = user_mentions.select('username', explode('mentions').alias('mentions'))

df2.orderBy('username').show()



+-----------+---------------+
|   username|       mentions|
+-----------+---------------+
|   00Meeach|           USMC|
|   0912abhi|  MmmutOfficial|
| 0Cuculainn|           jack|
|  0HG0D0HN0|    domina_jinx|
|   0mega001|     KaCyberApp|
|0rvicBuilds|    TheDailyBxB|
| 0xGamer030|      Solar_Dex|
| 0xGamer030|      Solar_Dex|
|      0xLMC|TsunamiFinance_|
|      0xLMC| thewolvesgroup|
|      0xLMC|  emptyholdings|
|   0xMojojo|      Pak_Gregg|
|   0xMojojo|       jayefunk|
|   0xMojojo|     Goblintown|
|    0xPetra|         zkMaps|
|   0xVodnik|  cryptotitvags|
|   0xVodnik| cryptobirbsnft|
|   0xVodnik|    shroomionft|
|      0xak_|       stanford|
|      0xak_|     cyberkongz|
+-----------+---------------+
only showing top 20 rows



                                                                                

In [7]:
# For each tweet author, collect the usernames mentioned in the author's
# profile description; authors without any mentions are dropped.
df1 = (
    df.select(
        F.col('author.username'),
        F.col('author.entities.description.mentions.username').alias('mentions'),
    )
    .where(F.col('mentions').isNotNull())
)

df1.show()

+---------------+--------------------+
|       username|            mentions|
+---------------+--------------------+
|  SueMasonBurns|   [jayjayjjetplane]|
|     thebeachll|        [DAlSlESSSS]|
|brucecharityorg|       [BrucewilzUg]|
|   gabi_spilker|[EXCInequality, U...|
|       tribblez|          [theauxdj]|
| JavidGillaniMH|     [muslimhandspk]|
|   AngryGustavo|      [Polandballbr]|
|  dwallacewells|[nytopinion, NYTmag]|
| WillKochtitzky|         [unetweets]|
|      AbeeraMR_|          [sosart90]|
|          Tyrnn|     [BoomerExpress]|
|   sapnadeora93|[digvijaya_28, Ka...|
|  JacobsOSeaman|      [DailyMonitor]|
|DiamondGBraxton|[bestmicrofic, fo...|
|   Jane_Munroe_|      [jane_munroe_]|
|       trumphop|             [pbump]|
|    SeaPupEllie|   [Nina_once_again]|
|    nj_morrison|[AFP, EUDataNewsH...|
|       mmm_soup|           [dvd_mcf]|
|     PtlbSchool|[PTLBSchools, Str...|
+---------------+--------------------+
only showing top 20 rows



### In Class Top 5 users who mentioned most people

### In Class Top 5 users who received most mentions

## Extract user location from tweets

In [15]:
# Preview the first 10 author locations without truncating long values.
df.select(F.col('author.location')).show(n=10, truncate=False)

+------------------------------+
|location                      |
+------------------------------+
|Manchester, England           |
|Down very long track.         |
|null                          |
|Santa Ana, California         |
|null                          |
|Morgantown, WV                |
|null                          |
|null                          |
|U.S.A.                        |
|LORD HIS EXCELLENCY JAMES HRMH|
+------------------------------+
only showing top 10 rows



### In Class Display Top 5 user location

In [18]:
# Print the schema of the `author` struct to locate the available fields.
df.select(F.col('author')).printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- description: struct (nullable = true)
 |    |    |    |-- cashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- hashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- mentions: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)

## Extract hashtags from tweets

In [25]:
# Show the hashtag text array attached to each tweet (null when the tweet has none).
df.select(F.col('entities.hashtags.tag')).show()

+--------------------+
|                 tag|
+--------------------+
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|[GretaThunberg, G...|
|                null|
|                null|
|                null|
|                null|
|                null|
|[climate, change,...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
+--------------------+
only showing top 20 rows



## Extract Entity (place, person, organizations) from tweets

In [26]:
# Show the annotated entity texts next to their entity types for each tweet.
df.select(
    F.col('entities.annotations.normalized_text'),
    F.col('entities.annotations.type'),
).show()

+--------------------+--------------+
|     normalized_text|          type|
+--------------------+--------------+
|                null|          null|
|                null|          null|
|[America, Pikas N...|[Place, Other]|
|              [Elon]|      [Person]|
|                null|          null|
|                null|          null|
|     [GretaThunberg]|      [Person]|
| [Bible, Revelation]|[Other, Other]|
|           [Florida]|       [Place]|
|        [Ice-Age, -]|[Other, Other]|
|                null|          null|
|              [IPCC]|[Organization]|
|     [MammothSteppe]|       [Place]|
|                null|          null|
|                null|          null|
|[Ice Road Trucker...|[Other, Other]|
|                null|          null|
|    [Global Warming]|       [Other]|
|                null|          null|
|                null|          null|
+--------------------+--------------+
only showing top 20 rows



In [27]:
# Pair the two parallel annotation arrays into a single map column
# (entity text -> entity type).

df1 = df.select(
    F.map_from_arrays(
        F.col('entities.annotations.normalized_text'),
        F.col('entities.annotations.type'),
    ).alias('entities')
)

# Explode the map so that each entry becomes its own (key, value) row.

df1.select(F.explode(F.col('entities'))).show()

+--------------------+------------+
|                 key|       value|
+--------------------+------------+
|             America|       Place|
|       Pikas Now Pre|       Other|
|                Elon|      Person|
|       GretaThunberg|      Person|
|               Bible|       Other|
|          Revelation|       Other|
|             Florida|       Place|
|             Ice-Age|       Other|
|                   -|       Other|
|                IPCC|Organization|
|       MammothSteppe|       Place|
|Ice Road Truckers...|       Other|
|Don’t Call It Glo...|       Other|
|      Global Warming|       Other|
|               Greta|      Person|
|       GretaThunberg|      Person|
|            Libtards|       Other|
|         mumbo-jumbo|       Other|
|The Age of Aquari...|       Other|
|          Solar Myth|       Other|
+--------------------+------------+
only showing top 20 rows



### In Class Top 5 person mentions in tweets