# Task 1:
<b>Objective : The objective is to perform a country-wise analysis of emissions spanning all sectors. This analysis aims to pinpoint   the primary sectors contributing to emissions globally, along with their associated gases.</b>

<b> `Initiating Spark Session`</b>

#### <font color="green"> Spark application is running in standalone mode on the local machine</font>

In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\spark-3.2.4-bin-hadoop2.7\\spark-3.2.4-bin-hadoop2.7'

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Check_User").getOrCreate()

# Get the user running the Spark job
user = spark.sparkContext.sparkUser()
print("Spark job user:", user)


Spark job user: Surya


In [3]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Get_Application_ID").getOrCreate()

# Get the Spark application ID
app_id = spark.sparkContext.applicationId
print("Spark Application ID:", app_id)



Spark Application ID: local-1701873538400


In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MySparkApp").getOrCreate()

spark_master = spark._sc._conf.get("spark.master")
print("Spark Master:", spark_master)


Spark Master: local[*]


<b> Files are stored in local HDFS </b>

In [5]:
import subprocess

# Hadoop project data directory
hadoop_directory = "/user/username/project_data/Data"

# Run the 'hadoop fs -ls' command using subprocess
try:
    ls_command = f"hadoop fs -ls {hadoop_directory}"
    output = subprocess.check_output(ls_command, shell=True).decode('utf-8')
    print(output)
except subprocess.CalledProcessError as e:
    print(f"Error executing command: {e}")


                       note: please use "yarn jar" to launch
                             YARN applications, not this command.
Found 9 items
-rw-r--r--   1 Surya supergroup       1158 2023-11-29 06:07 /user/username/project_data/Data/HDFS Commands.txt
drwxr-xr-x   - Surya supergroup          0 2023-11-29 06:12 /user/username/project_data/Data/agriculture
drwxr-xr-x   - Surya supergroup          0 2023-11-29 06:12 /user/username/project_data/Data/buildings
drwxr-xr-x   - Surya supergroup          0 2023-11-29 06:12 /user/username/project_data/Data/fluorinated_gases
drwxr-xr-x   - Surya supergroup          0 2023-11-29 06:12 /user/username/project_data/Data/fossil_fuel_operations
drwxr-xr-x   - Surya supergroup          0 2023-12-02 01:32 /user/username/project_data/Data/manufacturing
drwxr-xr-x   - Surya supergroup          0 2023-11-29 06:12 /user/username/project_data/Data/mineral_extraction
drwxr-xr-x   - Surya supergroup          0 2023-12-02 01:33 /user/username/project_d

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType

import os

# Create a Spark session
spark = SparkSession.builder.appName("Task 1") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

<b> Filtering for files starting with name Country_ and storing their respective file paths </b>

In [7]:
# Define the schema for the CSV files
schema = StructType([
    StructField("iso3_country", StringType(), True),
    StructField("start_time", TimestampType(), True),
    StructField("end_time", TimestampType(), True),
    StructField("original_inventory_sector", StringType(), True),
    StructField("gas", StringType(), True),
    StructField("emissions_quantity", DoubleType(), True),
    StructField("emissions_quantity_units", StringType(), True),
    StructField("temporal_granularity", StringType(), True),
    StructField("created_date", StringType(), True),
    StructField("modified_date", StringType(), True)
])


# Define the HDFS path
hdfs_path = "hdfs://localhost:9000/user/username/project_data/Data/"


# List of folders to explore 
folders = ["agriculture", "buildings", "fluorinated_gases", "fossil_fuel_operations", 
           "manufacturing", "mineral_extraction", "power", "waste"]

file_paths = []  # Initialize list to store file paths


for folder in folders:
    folder_path = os.path.join(hdfs_path, folder)
    files = spark.read.format("csv").schema(schema).load(folder_path)
    
    # Extract filename from the file path using input_file_name()
    files_with_path = files.withColumn("filename", input_file_name())
    
    # Filter files with filenames starting with "country_"
    country_files = files_with_path.filter(files_with_path["filename"].contains("country_"))
    
    # Collect the paths of the identified files
    country_paths = country_files.select(input_file_name()).distinct().rdd.flatMap(lambda x: x).collect()
    
    # Append the paths to the file_paths list
    file_paths.extend(country_paths)


In [8]:
len(file_paths)

31

In [9]:
file_paths

['hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_manure-management_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_synthetic-fertilizer-application_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_enteric-fermentation_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_cropland-fires_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_rice-cultivation_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/agriculture/country_other-agricultural-soil-emissions_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/buildings/country_residential-and-commercial-onsite-fuel-usage_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/buildings/country_other-onsite-fuel-usage_emissions.csv',
 'hdfs://localhost:9000/user/username/project_data/Data/fluor

#### Creating a merged dataframe

In [10]:
from functools import reduce
from pyspark.sql import DataFrame

#selected_columns = ["iso3_country", "original_inventory_sector", "gas", "emissions_quantity"]

# Read and merge the identified CSV files into a single DataFrame with selected columns
dataframes = []  

for file_path in file_paths:
    
    df_chunks = spark.read.format("csv").schema(schema).option("header", "true").load(file_paths)
        
    # Select desired columns and handle missing values
    #selected_df = df_chunks.select(selected_columns)

    #selected_df.show()
        
    # Append the processed DataFrame to the list
    dataframes.append(df_chunks)   
        
# Merge all DataFrames into a single DataFrame
merged_df = reduce(DataFrame.unionAll, dataframes)

# Show the merged DataFrame
merged_df.show()

+------------+-------------------+-------------------+-------------------------+---------+------------------+------------------------+--------------------+--------------------+--------------------+
|iso3_country|         start_time|           end_time|original_inventory_sector|      gas|emissions_quantity|emissions_quantity_units|temporal_granularity|        created_date|       modified_date|
+------------+-------------------+-------------------+-------------------------+---------+------------------+------------------------+--------------------+--------------------+--------------------+
|         MNE|2020-01-01 00:00:00|2020-12-31 00:00:00|     residential-and-c...|      co2|              null|                  tonnes|                null|2022-09-06 12:39:...|                null|
|         AND|2020-01-01 00:00:00|2020-12-31 00:00:00|     residential-and-c...|      co2|              null|                  tonnes|                null|2022-09-06 12:39:...|                null|
|         

In [11]:
merged_df.count()

8509810

In [12]:
cleaned_df = merged_df.dropna(subset=["emissions_quantity"])
cleaned_df.count()

7578663

<b> Mapping the subsectors with their respective sectors </b>

In [13]:
from pyspark.sql.functions import col, when

#distinct_sectors = cleaned_df.select("original_inventory_sector").distinct()
#distinct_sectors.show()

sector_mapping = {
    "incineration-and-open-burning-of-waste": "Waste",
    "other-fossil-fuel-operations": "Fossil Fuels",
    "residential-and-commercial-onsite-fuel-usage": "Buildings",
    "wastewater-treatment-and-discharge": "Waste",
    "cropland-fires": "Agriculture",
    "biological-treatment-of-solid-waste-&-biogenic": "Waste",
    "oil-and-gas-production-and-transport": "Fossil Fuels",
    "other-manufacturing": "Manufacturing",
    "other-onsite-fuel-usage": "Buildings",
    "other-energy-use": "Power",
    "electricity-generation": "Power",
    "solid-fuel-transformation": "Fossil Fuels",
    "solid-waste-disposal": "Waste",
    "synthetic-fertilizer-application": "Agriculture",
    "other-agricultural-soil-emissions": "Agriculture",
    "oil-and-gas-refining": "Fossil Fuels",
    "enteric-fermentation": "Agriculture",
    "rice-cultivation": "Agriculture",
    "coal-mining": "Fossil Fuels",
    "cement": "Manufacturing",
    "steel": "Manufacturing",
    "manure-management": "Agriculture",
    "pulp-and-paper": "Manufacturing",
    "fluorinated-gases": "Fluorinated Gas",
    "rock-quarrying": "Mineral Extraction",
    "bauxite-mining": "Mineral Extraction",
    "sand-quarrying": "Mineral Extraction",
    "copper-mining": "Mineral Extraction",
    "aluminum": "Manufacturing",
    "chemicals": "Manufacturing",
    "iron-mining": "Mineral Extraction"
}


In [14]:
from pyspark.sql.functions import expr

# Construct the SQL expression for mapping original_inventory_sector to corresponding sector
sql_expr = "CASE "
for sector, mapping in sector_mapping.items():
    sql_expr += f"WHEN original_inventory_sector = '{sector}' THEN '{mapping}' "

# Add the 'Other' condition at the end
sql_expr += "ELSE 'Other' END"

# Apply the SQL expression to create the 'sector' column
cleaned_df = cleaned_df.withColumn("sector", expr(sql_expr))

cleaned_df.show(5)

+------------+-------------------+-------------------+-------------------------+---+------------------+------------------------+--------------------+--------------------+--------------------+---------+
|iso3_country|         start_time|           end_time|original_inventory_sector|gas|emissions_quantity|emissions_quantity_units|temporal_granularity|        created_date|       modified_date|   sector|
+------------+-------------------+-------------------+-------------------------+---+------------------+------------------------+--------------------+--------------------+--------------------+---------+
|         SHN|2019-01-01 00:00:00|2019-12-31 00:00:00|     residential-and-c...|n2o|           7.41E-5|                  tonnes|                null|2022-09-06 12:39:...|2022-10-05 21:26:...|Buildings|
|         ABW|2018-01-01 00:00:00|2018-12-31 00:00:00|     residential-and-c...|co2|       154132.0457|                  tonnes|                null|2022-09-06 12:39:...|2022-10-05 21:26:...|B

In [15]:
cleaned_df.printSchema()

root
 |-- iso3_country: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- original_inventory_sector: string (nullable = true)
 |-- gas: string (nullable = true)
 |-- emissions_quantity: double (nullable = true)
 |-- emissions_quantity_units: string (nullable = true)
 |-- temporal_granularity: string (nullable = true)
 |-- created_date: string (nullable = true)
 |-- modified_date: string (nullable = true)
 |-- sector: string (nullable = false)



<b>Processing supporting file  to get the country name from the iso3_codes</b>

In [17]:
# Define the HDFS path to your CSV file
hdfs_path = "hdfs://localhost:9000/user/username/project_data/countries/countries_codes_and_coordinates.csv"

# Read the CSV file into a DataFrame
country_df = spark.read.format("csv").option("header", "true").load(hdfs_path)

country_df.show(5)

+--------------+------------+------------+------------+------------------+-------------------+
|       Country|Alpha-2 code|Alpha-3 code|Numeric code|Latitude (average)|Longitude (average)|
+--------------+------------+------------+------------+------------------+-------------------+
|   Afghanistan|        "AF"|       "AFG"|         "4"|              "33"|               "65"|
|       Albania|        "AL"|       "ALB"|         "8"|              "41"|               "20"|
|       Algeria|        "DZ"|       "DZA"|        "12"|              "28"|                "3"|
|American Samoa|        "AS"|       "ASM"|        "16"|        "-14.3333"|             "-170"|
|       Andorra|        "AD"|       "AND"|        "20"|            "42.5"|              "1.6"|
+--------------+------------+------------+------------+------------------+-------------------+
only showing top 5 rows



In [18]:
country_df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Alpha-2 code: string (nullable = true)
 |-- Alpha-3 code: string (nullable = true)
 |-- Numeric code: string (nullable = true)
 |-- Latitude (average): string (nullable = true)
 |-- Longitude (average): string (nullable = true)



In [19]:
country_df.filter(col("Alpha-3 code").isNull()).count()

0

In [20]:
cleaned_df.filter(col("iso3_country").isNull()).count()

0

In [21]:
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace

#cleaning the data

country_df = country_df.withColumn("Alpha-3 code", regexp_replace(col("Alpha-3 code"), '"', ''))

#country_df.show()

In [22]:
differences = cleaned_df.select("iso3_country").exceptAll(country_df.select("Alpha-3 code"))

<b> Joining both the dataframes </b>

In [23]:
from pyspark.sql.functions import lower, trim


# Rename columns in country_df for simplicity
country_df = country_df.withColumnRenamed("Latitude (average)", "Latitude") \
    .withColumnRenamed("Longitude (average)", "Longitude")


# Convert columns to lowercase for consistency in the join condition
cleaned_df = cleaned_df.withColumn("iso3_country_lower", lower(cleaned_df["iso3_country"]))
country_df = country_df.withColumn("Alpha-3 code_lower", lower(country_df["Alpha-3 code"]))
 
joined_df1 = cleaned_df.join(country_df, trim(cleaned_df["iso3_country_lower"]) == trim(country_df["Alpha-3 code_lower"]), how="inner")

In [24]:
columns_to_drop = ['iso3_country_lower', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code', 'Alpha-3 code_lower']

joined_df1 = joined_df1.drop(*columns_to_drop)

In [25]:
joined_df1.show(10)

+------------+-------------------+-------------------+-------------------------+---+------------------+------------------------+--------------------+--------------------+--------------------+---------+--------------------+-----------+-----------+
|iso3_country|         start_time|           end_time|original_inventory_sector|gas|emissions_quantity|emissions_quantity_units|temporal_granularity|        created_date|       modified_date|   sector|             Country|   Latitude|  Longitude|
+------------+-------------------+-------------------+-------------------------+---+------------------+------------------------+--------------------+--------------------+--------------------+---------+--------------------+-----------+-----------+
|         SHN|2019-01-01 00:00:00|2019-12-31 00:00:00|     residential-and-c...|n2o|           7.41E-5|                  tonnes|                null|2022-09-06 12:39:...|2022-10-05 21:26:...|Buildings|Saint Helena, Asc...| "-15.9333"|     "-5.7"|
|         AB

In [27]:
joined_df1 = joined_df1.withColumn("Latitude", regexp_replace(col("Latitude"), '"', ''))

In [28]:
Task1 = joined_df1.withColumn("Longitude", regexp_replace(col("Longitude"), '"', ''))

In [29]:
Task1.show()

+------------+-------------------+-------------------+-------------------------+---+--------------------+------------------------+--------------------+--------------------+--------------------+---------+--------------------+---------+---------+
|iso3_country|         start_time|           end_time|original_inventory_sector|gas|  emissions_quantity|emissions_quantity_units|temporal_granularity|        created_date|       modified_date|   sector|             Country| Latitude|Longitude|
+------------+-------------------+-------------------+-------------------------+---+--------------------+------------------------+--------------------+--------------------+--------------------+---------+--------------------+---------+---------+
|         SHN|2019-01-01 00:00:00|2019-12-31 00:00:00|     residential-and-c...|n2o|             7.41E-5|                  tonnes|                null|2022-09-06 12:39:...|2022-10-05 21:26:...|Buildings|Saint Helena, Asc...| -15.9333|     -5.7|
|         ABW|2018-0

In [30]:
columns_to_drop = ['temporal_granularity', 'created_date', 'modified_date']

Task1 = Task1.drop(*columns_to_drop)

### Saving the output file

<b> Storing the parts of the output file (247 parts) in HDFS </b>

In [None]:
# Writing in Parquet format
Task1.write.format("parquet").mode("ignore").save("hdfs://localhost:9000/user/username/project_data/output_parquet")

# Writing in CSV format
Task1.write.format("csv").mode("ignore").save("hdfs://localhost:9000/user/username/project_data/output_csv")


In [None]:
# Storing as a single csv file in the local drive
num_files = 1  # Number of output files

Task1.coalesce(num_files) \
    .write \
    .mode("append") \
    .option("header", "true") \
    .csv("F:/project/Data/task1_output1_csv")

In [32]:
spark.stop()