In [1]:
# Custom code to set path
import os
import sys
import time
from functools import reduce
from pathlib import Path
import pandas as pd

def add_module_path_to_system():
    """Put the parent of the current working directory on ``sys.path``.

    The directory is appended only when it is not already listed, so calling
    this repeatedly (e.g. re-running the cell) does not add duplicates.

    Returns:
        str: absolute path of the parent directory.
    """
    parent_dir = os.path.abspath(os.pardir)
    if parent_dir in sys.path:
        return parent_dir

    sys.path.append(parent_dir)
    return parent_dir


module_path = add_module_path_to_system()

In [2]:
# Two-letter postal codes used as the reference set of "states" in this notebook:
# the 50 US states plus the District of Columbia ("DC"), 51 entries in total.
# (There are likely packages that provide this list; it is hard-coded here for simplicity.)
usa_state_codes = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

### Reading CSV files from URL

#### Here is the first, most used, and most impressive group of functions that pandas offers:
`read_*` - the group of functions whose names start with `read_` (e.g. `read_csv`, `read_excel`, `read_html`, `read_json`, `read_sql`). They let pandas consume data from many popular data stores, including but not limited to SAS, SQL, Stata, Excel, HTML and JSON.

##### Note: When copying the link from GitHub, make sure to copy it after selecting the "Raw" view of the file

In [3]:
# Raw-file URL of the demo data set on GitHub.
# The access token that used to be embedded in the query string was removed:
# credentials must not be hard-coded in a shared notebook. If the repository
# requires a token, export it as the GITHUB_RAW_TOKEN environment variable
# before starting the kernel; otherwise the plain URL is used.
base_url = "https://raw.githubusercontent.com/UGuntupalli/pandas_demo/master/List_of_plants_for_solar_United_States_all_sectors.csv"
raw_token = os.environ.get("GITHUB_RAW_TOKEN")
url = f"{base_url}?token={raw_token}" if raw_token else base_url

# index_col=0 uses the first CSV column as the row index of the frame.
solar_plants_in_usa = pd.read_csv(url, index_col=0)

# head() and tail() are helpful methods that let us preview large data frames with relative ease
print(solar_plants_in_usa.head(3))

Plant Code State                 Sector Name  \
Plant Name                                                                      
100 Brook Hill Drive Solar            63292    NY        Commercial non-cogen   
1025 Traveller Solar, LLC             62660    NC  Electric utility non-cogen   
1047 Little Mountain Solar, LLC       62661    NC  Electric utility non-cogen   

                                Prime Movers Fuel Types  
Plant Name                                               
100 Brook Hill Drive Solar                PV        SUN  
1025 Traveller Solar, LLC                 PV        SUN  
1047 Little Mountain Solar, LLC           PV        SUN  


### Exploring the data once it is in Pandas 

#### Pandas offers multiple handy methods and attributes that make it easy to explore and understand the data. Let us take a look at some of them: 
     1. info - Provides both a summary and detailed column level break down of a data frame. You can toggle 
               the level of detail using the verbose argument 
     2. columns - Provides the names/labels of the columns
     3. dtypes - Provides the data types of the columns
     4. shape - Returns a tuple representing the dimensionality of the DataFrame

##### Note: 
In verbose format, the info method already provides all the information that we extract with the remaining items. The remaining items have been included for informative and demonstrative purposes only.

In [4]:
# Info method
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own: wrapping it in print() adds a stray "None" line after
# each report.
print('**************************************')
print('info method - high level')
print('**************************************')
solar_plants_in_usa.info(verbose=False) # Get a summary of columns count and its data types but no deeper information 
print('**************************************')
print('info method - detailed')
print('**************************************')
solar_plants_in_usa.info(verbose=True) # Per-column break down: non-null counts and dtypes

# columns (an attribute, not a method) - the column labels
print('**************************************')
print('columns method')
print('**************************************')
print(solar_plants_in_usa.columns) 
# print(list(solar_plants_in_usa.columns)) - Alternatively print as a list, easier to visualize when there are a lot of columns

# dtypes (an attribute, not a method) - the data type of each column
print('**************************************')
print('dtypes method')
print('**************************************')
print(solar_plants_in_usa.dtypes) 

# shape (an attribute, not a method) - (number of rows, number of columns)
print('**************************************')
print('shape method')
print('**************************************')
print(solar_plants_in_usa.shape) 

**************************************
info method - high level
**************************************
<class 'pandas.core.frame.DataFrame'>
Index: 3288 entries, 100 Brook Hill Drive Solar to Zumbro Community Solar Garden
Columns: 5 entries, Plant Code to Fuel Types
dtypes: int64(1), object(4)
memory usage: 154.1+ KB
None
**************************************
info method - detailed
**************************************
<class 'pandas.core.frame.DataFrame'>
Index: 3288 entries, 100 Brook Hill Drive Solar to Zumbro Community Solar Garden
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Plant Code    3288 non-null   int64 
 1   State         3288 non-null   object
 2   Sector Name   3288 non-null   object
 3   Prime Movers  3288 non-null   object
 4   Fuel Types    3288 non-null   object
dtypes: int64(1), object(4)
memory usage: 154.1+ KB
None
**************************************
columns method
*******************

#### Inspecting and Exploring Numeric vs Character Data

#####  
Numeric columns can easily be summarized by plotting them or by looking at general statistics, whereas character data is better understood by examining the unique values in the column and similar exploration techniques. Please note that Plant Code is analyzed for demonstrative purposes only; it does not convey any useful information.

In [5]:
# Character data is explored through the 'State' column, numeric data through 'Plant Code'.
separator = '**************************************'

# unique method - the distinct values found in the column
print(separator, 'unique method', separator, sep='\n')
states_with_solar_plants = solar_plants_in_usa['State'].unique()
print(states_with_solar_plants)
n_states = len(states_with_solar_plants)
print(f'''# of states with PV plants: {n_states} ''')

# Reference codes that never appear in the data
states_with_no_installations = set(usa_state_codes).difference(states_with_solar_plants)
print(f''' States with no PV Installations based on this data: {states_with_no_installations} ''')

plant_codes = solar_plants_in_usa['Plant Code']

# min method - smallest value in the column
print(separator, 'min method', separator, sep='\n')
print(plant_codes.min())

# max method - largest value in the column
print(separator, 'max method', separator, sep='\n')
print(plant_codes.max())

**************************************
unique method
**************************************
['NY' 'NC' 'NJ' 'MA' 'CA' 'VT' 'PA' 'FL' 'HI' 'CO' 'CT' 'OH' 'AL' 'MD'
 'AZ' 'SC' 'MN' 'WA' 'OR' 'TX' 'NM' 'ID' 'NV' 'WI' 'GA' 'KS' 'VA' 'IN'
 'UT' 'MT' 'IA' 'RI' 'DE' 'MO' 'TN' 'NE' 'KY' 'OK' 'MI' 'IL' 'AR' 'MS'
 'ME' 'DC' 'LA' 'SD' 'WY']
# of states with PV plants: 47 
 States with no PV Installations based on this data: {'AK', 'WV', 'ND', 'NH'} 
**************************************
min method
**************************************
141
**************************************
max method
**************************************
63525


### Filtering Data & Aggregating 

####  
Filtering data helps address/analyze a lot of questions using the data at hand, while aggregating helps draw meaningful insights by segregating data into groups based on some unique property. 

#####   
Let us explore how many types of PV plants exist and which state has the most - No prizes for guessing - my background says California will be highest, but guessing second highest or third highest state might be harder :) 

In [6]:
# What is the state with the highest # of plants 
# Which state has the highest number of plants?
# value_counts() tallies the rows per state, most frequent first.
# The result is a surprise - presumably it is driven by many smaller plants; let's check.
plants_per_state = solar_plants_in_usa['State'].value_counts()
plants_per_state

NC    594
CA    583
MA    312
MN    262
NJ    240
NY    170
CO     84
MD     80
AZ     79
FL     71
IN     68
NM     66
TX     63
GA     62
SC     59
OR     56
VT     40
NV     38
CT     36
UT     31
OH     30
PA     29
VA     26
WI     22
HI     22
RI     20
TN     17
MO     17
MI     16
DE     10
IL     10
ID      9
IA      7
AR      7
OK      7
MS      7
KY      6
AL      6
NE      6
MT      6
KS      5
ME      2
DC      2
WA      2
WY      1
SD      1
LA      1
Name: State, dtype: int64

In [7]:
# Group the plants by their 'Sector Name' (the type of generation).
# groupby() on its own only builds a grouped object, as the output shows; the
# user still has to say which slice or aggregation of each group is wanted.
group_by_type_of_generation = solar_plants_in_usa.groupby(by='Sector Name')
group_by_type_of_generation

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025CA6D639C8>

In [10]:
# Number of plants for every (State, Sector Name) combination, returned as a
# flat frame with the columns 'State', 'Sector Name' and 'count'.
group_by_state_and_sector = (
    solar_plants_in_usa
    .groupby(['State', 'Sector Name'])
    .size()
    .reset_index(name='count')  # names the size column directly
)
group_by_state_and_sector

Unnamed: 0,State,Sector Name,count
0,AL,Electric utility,2
1,AL,Electric utility non-cogen,4
2,AR,Commercial non-cogen,2
3,AR,Electric utility,1
4,AR,Electric utility non-cogen,4
...,...,...,...
116,WA,Electric utility,1
117,WA,Electric utility non-cogen,1
118,WI,Commercial non-cogen,2
119,WI,Electric utility non-cogen,20


In [11]:
# Keep only the rows for CA and NC, the two states with the most plants.
is_top_2_state = group_by_state_and_sector['State'].isin(['CA','NC'])
retain_top_2 = group_by_state_and_sector.loc[is_top_2_state]
retain_top_2

Unnamed: 0,State,Sector Name,count
8,CA,Commercial cogen,2
9,CA,Commercial non-cogen,38
10,CA,Electric utility,46
11,CA,Electric utility cogen,3
12,CA,Electric utility non-cogen,487
13,CA,Industrial cogen,1
14,CA,Industrial non-cogen,6
68,NC,Commercial non-cogen,4
69,NC,Electric utility,12
70,NC,Electric utility non-cogen,578
