# Analysis of Global Warming Tweets on January 2023.
This proejct is based on global warming tweets tweeted on January 2023 and stored in parquet format. I have added emotion (anger, joy, opotimism and sadness) and gender at the end of each tweets. Emotion analysis was performed using a pre-trained model from Hugging Face (Twitter-roBERTa-base for Emotion Recognition). This is a roBERTa-base model trained on ~58M tweets and finetuned for emotion recognition with the TweetEval benchmark. Each tweet is classified into four emotions (joy, optimism, anger, and sadness) with a confidence score. In addition, gender is extracted based on first name of user account if a user account has a real first name and the gender can be identified by python package gender guesser. 

# Read tweets into a data frame

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from helper_functions import displayByGroup
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, explode, desc

# check if the Spark session is active. If it is activate, close it

try:
    if spark:
        spark.stop()
except:
    pass    

spark = (SparkSession.builder.appName("Global Warming Tweets Analysis")
        .config("spark.port.maxRetries", "100")
        .config("spark.sql.mapKeyDedupPolicy", "LAST_WIN")  # This configuration allow the duplicate keys in the map data type.
#        .config("spark.driver.memory", "16g")
        .getOrCreate())

# confiture the log level (defaulty is WWARN)
spark.sparkContext.setLogLevel('ERROR')

# read the global warming tweets

df=spark.read.parquet('/opt/shared/globalwarming_202301')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/16 07:47:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/10/16 07:47:45 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
25/10/16 07:47:45 WARN Utils: Serv

# In class exercise

## Extract mentions of each Twitter user has

In [3]:
df.columns

['id',
 '__twarc',
 'attachments',
 'author',
 'author_id',
 'context_annotations',
 'conversation_id',
 'created_at',
 'edit_history_tweet_ids',
 'entities',
 'geo',
 'in_reply_to_user',
 'in_reply_to_user_id',
 'lang',
 'possibly_sensitive',
 'public_metrics',
 'referenced_tweets',
 'reply_settings',
 'text',
 'withheld',
 'anger',
 'joy',
 'optimism',
 'sadness',
 'first_name',
 'gender',
 'year']

In [14]:
df.select('author').printSchema()

root
 |-- author: struct (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- entities: struct (nullable = true)
 |    |    |-- description: struct (nullable = true)
 |    |    |    |-- cashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- hashtags: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |    |-- tag: string (nullable = true)
 |    |    |    |-- mentions: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- end: long (nullable = true)

In [15]:
df.select('author.entities.description.mentions').printSchema()

root
 |-- mentions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- end: long (nullable = true)
 |    |    |-- start: long (nullable = true)
 |    |    |-- username: string (nullable = true)



In [4]:
df1=df.select('author.username', col('author.entities.description.mentions.username').alias('mentions')).filter(F.col('mentions').isNotNull())

df1.show()

+---------------+--------------------+
|       username|            mentions|
+---------------+--------------------+
|  SueMasonBurns|   [jayjayjjetplane]|
|     thebeachll|        [DAlSlESSSS]|
|brucecharityorg|       [BrucewilzUg]|
|   gabi_spilker|[EXCInequality, U...|
|       tribblez|          [theauxdj]|
| JavidGillaniMH|     [muslimhandspk]|
|   AngryGustavo|      [Polandballbr]|
|  dwallacewells|[nytopinion, NYTmag]|
| WillKochtitzky|         [unetweets]|
|      AbeeraMR_|          [sosart90]|
|          Tyrnn|     [BoomerExpress]|
|   sapnadeora93|[digvijaya_28, Ka...|
|  JacobsOSeaman|      [DailyMonitor]|
|DiamondGBraxton|[bestmicrofic, fo...|
|   Jane_Munroe_|      [jane_munroe_]|
|       trumphop|             [pbump]|
|    SeaPupEllie|   [Nina_once_again]|
|    nj_morrison|[AFP, EUDataNewsH...|
|       mmm_soup|           [dvd_mcf]|
|     PtlbSchool|[PTLBSchools, Str...|
+---------------+--------------------+
only showing top 20 rows



In [5]:
df2=df1.select('username', explode('mentions').alias('mentions'))

df2.orderBy('username').show()



+-----------+---------------+
|   username|       mentions|
+-----------+---------------+
|   00Meeach|           USMC|
|   0912abhi|  MmmutOfficial|
| 0Cuculainn|           jack|
|  0HG0D0HN0|    domina_jinx|
|   0mega001|     KaCyberApp|
|0rvicBuilds|    TheDailyBxB|
| 0xGamer030|      Solar_Dex|
| 0xGamer030|      Solar_Dex|
|      0xLMC|TsunamiFinance_|
|      0xLMC| thewolvesgroup|
|      0xLMC|  emptyholdings|
|   0xMojojo|      Pak_Gregg|
|   0xMojojo|       jayefunk|
|   0xMojojo|     Goblintown|
|    0xPetra|         zkMaps|
|   0xVodnik|  cryptotitvags|
|   0xVodnik| cryptobirbsnft|
|   0xVodnik|    shroomionft|
|      0xak_|       stanford|
|      0xak_|     cyberkongz|
+-----------+---------------+
only showing top 20 rows



                                                                                

### Top 5 users who mentioned most people

In [6]:
df2.groupBy('username').count().orderBy(desc('count')).limit(5).show()



+---------------+-----+
|       username|count|
+---------------+-----+
| IMPraveenDalal| 1128|
|          _PTLB|  678|
| _DigitalPolice|  410|
|  FurusetGerden|  308|
|DisasterReliefs|  306|
+---------------+-----+



                                                                                

### Top 5 users who received most mentions

In [7]:
df2.groupBy('mentions').count().orderBy(desc('count')).show(5)



+-------------+-----+
|     mentions|count|
+-------------+-----+
|    P4LOIndia| 1033|
|         PTLB|  730|
| PTLBProjects|  619|
|      TeleLaw|  488|
|WeMeanToClean|  360|
+-------------+-----+
only showing top 5 rows



                                                                                

## extract user location from tweets

In [8]:
df.select('author.location').show(10, False)

+------------------------------+
|location                      |
+------------------------------+
|Manchester, England           |
|Down very long track.         |
|NULL                          |
|Santa Ana, California         |
|NULL                          |
|Morgantown, WV                |
|NULL                          |
|NULL                          |
|U.S.A.                        |
|LORD HIS EXCELLENCY JAMES HRMH|
+------------------------------+
only showing top 10 rows



### Display Top 5 user location

In [9]:
result1=df.select('author.name', 'author.location').distinct()

result1.groupBy('location').count().orderBy(desc('count')).limit(5).show()



+---------------+-----+
|       location|count|
+---------------+-----+
|           NULL|74798|
|  United States| 2554|
|            USA| 1330|
|         Canada|  881|
|London, England|  830|
+---------------+-----+



                                                                                

In [10]:
result1.filter(col('location').isNotNull()).groupBy('location').count().orderBy(desc('count')).limit(5).show()



+---------------+-----+
|       location|count|
+---------------+-----+
|  United States| 2554|
|            USA| 1330|
|         Canada|  881|
|London, England|  830|
|   Florida, USA|  805|
+---------------+-----+



                                                                                

## Extract hashtags from tweets

In [23]:
df.select('entities.hashtags.tag').show()

+--------------------+
|                 tag|
+--------------------+
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|[GretaThunberg, G...|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|[climate, change,...|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
|                NULL|
+--------------------+
only showing top 20 rows



## Extract Entity (place, person, organizations) from tweets

In [11]:
df.select('entities.annotations.normalized_text', 'entities.annotations.type').show()

+--------------------+--------------+
|     normalized_text|          type|
+--------------------+--------------+
|                NULL|          NULL|
|                NULL|          NULL|
|[America, Pikas N...|[Place, Other]|
|              [Elon]|      [Person]|
|                NULL|          NULL|
|                NULL|          NULL|
|     [GretaThunberg]|      [Person]|
| [Bible, Revelation]|[Other, Other]|
|           [Florida]|       [Place]|
|        [Ice-Age, -]|[Other, Other]|
|                NULL|          NULL|
|              [IPCC]|[Organization]|
|     [MammothSteppe]|       [Place]|
|                NULL|          NULL|
|                NULL|          NULL|
|[Ice Road Trucker...|[Other, Other]|
|                NULL|          NULL|
|    [Global Warming]|       [Other]|
|                NULL|          NULL|
|                NULL|          NULL|
+--------------------+--------------+
only showing top 20 rows



In [8]:
#create a map from key value pair

df1=df.select(F.map_from_arrays('entities.annotations.type', 'entities.annotations.normalized_text').alias('entities'))

# explode key value pair

df2=df1.select(explode('entities'))

df2.show()

+------------+--------------------+
|         key|               value|
+------------+--------------------+
|       Place|             America|
|       Other|       Pikas Now Pre|
|      Person|                Elon|
|      Person|       GretaThunberg|
|       Other|          Revelation|
|       Place|             Florida|
|       Other|                   -|
|Organization|                IPCC|
|       Place|       MammothSteppe|
|       Other|Don’t Call It Glo...|
|       Other|      Global Warming|
|      Person|               Greta|
|      Person|       GretaThunberg|
|       Other|            Libtards|
|       Other|         mumbo-jumbo|
|       Other|          Solar Myth|
|       Other|                 CO2|
|      Person|                 Ben|
|Organization|Bureau of Meteoro...|
|       Place|              Europe|
+------------+--------------------+
only showing top 20 rows



### Top 5 person mentions in tweets

In [9]:
df2.filter(col('key')=='Person').groupBy('value').count().orderBy(desc('count')).show(5)



+------------+-----+
|       value|count|
+------------+-----+
|Paul Ehrlich| 4624|
|     Al Gore| 2449|
|  John Kerry| 1836|
|   IanPlimer| 1818|
|  Bill Gates| 1668|
+------------+-----+
only showing top 5 rows



                                                                                