In [1]:
# findspark has to run *before* pyspark is imported: init() locates the Spark
# installation and adds its Python bindings to sys.path, so calling it after
# the import cannot influence which pyspark module gets loaded.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() returns the session already active in this kernel, if any,
# instead of starting a second one when the cell is re-run.
spark = SparkSession.builder.appName('Data Processing').getOrCreate()

##### **Loading the dataframe**

In [2]:
# Read the consolidated CSV into a Spark DataFrame.
# NOTE: the path is absolute and machine-specific; adjust it when running elsewhere.
spark_df = spark.read.load(
    path=r'F:\GUVI_DATA_SCIENCE\Project\Energy-Consumption-Analysis\Datasets\Consolidated\merged_df.csv',
    format='csv',
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark infer column types from the data
)

In [3]:
# Collect the Spark DataFrame into a local pandas DataFrame; all following
# cleaning steps operate on this pandas copy.
pandas_df = spark_df.toPandas()

##### *Null values*

In [4]:
# Print per-column non-null counts and dtypes to see where values are missing.
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173873 entries, 0 to 173872
Data columns (total 12 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Sector                                  173873 non-null  object 
 1   Sub Sector                              173866 non-null  object 
 2   Organization                            173866 non-null  object 
 3   Operation                               173866 non-null  object 
 4   Operation Type                          173854 non-null  object 
 5   Address                                 173840 non-null  object 
 6   City                                    173840 non-null  object 
 7   Postal Code                             173814 non-null  object 
 8   2011 GHG Emissions (kg)                 173823 non-null  float64
 9   2011 Energy Intensity (eWh/HDD/sq. ft)  157243 non-null  float64
 10  2011 Energy Intensity (ekWh/ML)         2419

In [5]:
# Remove the ekWh/ML intensity column: the info() summary lists far fewer
# non-null values for it than for the other columns, so keeping it would make
# the later row-wise dropna() discard most of the data.
pandas_df = pandas_df.drop(columns=['2011 Energy Intensity (ekWh/ML)'])

In [6]:
# Show (rows, columns) after dropping the column.
pandas_df.shape

(173873, 11)

In [7]:
# Discard every row that has a missing value in any remaining column.
pandas_df = pandas_df.dropna(axis=0, how='any')

In [8]:
# Per-column count of missing values; every entry should now be 0.
pandas_df.isna().sum()

Sector                                    0
Sub Sector                                0
Organization                              0
Operation                                 0
Operation Type                            0
Address                                   0
City                                      0
Postal Code                               0
2011 GHG Emissions (kg)                   0
2011 Energy Intensity (eWh/HDD/sq. ft)    0
year                                      0
dtype: int64

##### *Duplicate rows*

In [9]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
pandas_df.duplicated(keep='first').sum()

11

In [10]:
# Keep the first occurrence of each fully identical row and drop the repeats.
pandas_df = pandas_df.drop_duplicates(keep='first')

In [11]:
# Re-check after de-duplication; the count should be 0.
pandas_df.duplicated(keep='first').sum()

0

##### *Renaming Columns*

In [17]:
# Strip the "2011" prefix from the two measurement columns; the frame carries a
# separate `year` column, so the prefix in the name is not needed.
pandas_df = pandas_df.rename(
    columns={
        '2011 GHG Emissions (kg)': 'GHG Emissions(kg)',
        '2011 Energy Intensity (eWh/HDD/sq. ft)': 'Energy Intensity(eWh/HDD/sq. ft)',
    }
)

In [20]:
# Display one randomly chosen row as a spot check of the renamed columns.
# No random_state is set, so a different row is shown on each run.
pandas_df.sample(n=1)

Unnamed: 0,Sector,Sub Sector,Organization,Operation,Operation Type,Address,City,Postal Code,GHG Emissions(kg),Energy Intensity(eWh/HDD/sq. ft),year
32528,Municipal,Municipality,Township of Enniskillen,Enniskillen Community Centre,Community centres,3086 Main Street,Oil City,N0N 1N0,10198.949859,9.997987,2013.0


##### *Changing the data type of the `year` column*

In [21]:
# `year` was loaded as a float (e.g. 2013.0 in the sample row); cast it to an
# integer. Only this column is converted, all others keep their dtypes.
pandas_df = pandas_df.astype({'year': int})

In [22]:
# Confirm the dtype of the `year` column after the cast.
pandas_df['year'].info()

<class 'pandas.core.series.Series'>
Index: 157232 entries, 0 to 173872
Series name: year
Non-Null Count   Dtype
--------------   -----
157232 non-null  int32
dtypes: int32(1)
memory usage: 1.8 MB


##### *Saving to a CSV file*

In [23]:
# Persist the cleaned frame; index=False keeps the pandas row index out of the file.
# NOTE: the path is absolute and machine-specific; adjust it when running elsewhere.
pandas_df.to_csv(
    path_or_buf=r'F:\GUVI_DATA_SCIENCE\Project\Energy-Consumption-Analysis\Datasets\Processed\processed_dataset.csv',
    index=False,
)