In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #data visulaization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Task Details**
1. Load the data from the CSV files
2. Explore each dataset - columns, counts, basic stats
3. Understand the domain context and explore underlying patterns in the data
4. Explore the data and try to answer questions like -
5. What is the mean value of daily yield?
6. What is the total irradiation per day?
7. What is the max ambient and module temperature?
8. How many inverters are there for each plant?
9. What is the maximum/minimum amount of DC/AC Power generated in a time interval/day?
10. Which inverter (source_key) has produced maximum DC/AC power?
11. Rank the inverters based on the DC/AC power they produce
12. Is there any missing data?
13. You might have to pre-process the data to allow for some of the analysis (hint: date and time)

Expected Submission
Submit your notebook with all your work.
Evaluation
You will be evaluated on how detailed and comprehensive your exploration is.

# COMMON FUNCTIONS

In [None]:
def clear():
    """Visually clear the output area by printing a run of blank lines.

    Writes 50 newline characters (plus the newline ``print`` appends) to
    stdout. Returns nothing.
    """
    blank_lines = "\n" * 50
    print(blank_lines)

    
def create_block():
    """Print a row of 100 '#' characters with blank lines above and below.

    Serves as a visual divider that keeps consecutive outputs readable.
    Returns nothing; everything is written to stdout.
    """
    padding = "\n\n\n"
    for chunk in (padding, "#"*100, padding):
        print(chunk)

# 1.LOADING DATA FROM .csv FILES

In [None]:
# Plant 1 generation readings.
p1_gen = pd.read_csv(
    filepath_or_buffer="../input/solar-power-generation-data/Plant_1_Generation_Data.csv",
)
# Plant 1 weather-sensor readings.
p1_wet = pd.read_csv(
    filepath_or_buffer="../input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv",
)

# MULTIPURPOSE FUNCTIONS

In [None]:
def unique_cols(df, cols):
    """Print how many distinct values a column holds, then list them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : str
        Label of a single column of ``df``.

    The values are listed in the order ``Series.unique`` returns them,
    numbered from 1. Nothing is returned; all output goes to stdout.
    """
    distinct_values = df[cols].unique()

    print("There are total", len(distinct_values), "unique", cols, "\n")
    print("#"*100, "\n")

    for position, value in enumerate(distinct_values, start=1):
        print(cols, "NUMBER", position, ":-  ", value)
        
###############################################################################################################################################################################        
###############################################################################################################################################################################  
###############################################################################################################################################################################  


def column_list_creator(df, DS_name):
    """Print a numbered list of the column names of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are listed.
    DS_name : str
        Dataset name shown in the header line.

    Output is a header, one ``# <n> ... <column>`` line per column (numbered
    from 1), and a run of blank lines to separate it from what follows.
    """
    print("########## Columns In DataSet", DS_name, "##########\n")

    for number, column in enumerate(df.columns, start=1):
        print("#", number, "...", column)

    # Trailing blank lines keep the next output visually apart.
    print("\n"*5)

    
    
###############################################################################################################################################################################  
###############################################################################################################################################################################  
###############################################################################################################################################################################  

def basic_stats(df, col):
    """Print the maximum, minimum and mean of one column, then a divider.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Label of the column to summarise.

    Nothing is returned; output goes to stdout and ends with the divider
    printed by ``create_block``.
    """
    print("#####-----BASIC STATISTICS ON", col, "-----#####\n\n")

    # Each statistic is computed immediately before its own line is printed.
    for label, stat_name in (("MAXIMUM ", "max"), ("MINIMUM ", "min"), ("AVERAGE ", "mean")):
        print(label, col, " IS:- ", getattr(df[col], stat_name)())

    create_block()



###############################################################################################################################################################################  
###############################################################################################################################################################################  
###############################################################################################################################################################################  
        

def unique_data(df, cols):
    """Print how often each distinct value occurs in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : str
        Label of a single column of ``df``.

    Prints a header line followed by the result of ``Series.value_counts``.
    """
    print("ÜNIQUE VALUES OCCURENCE IN COLUMN", cols, ":-")
    occurrences = df[cols].value_counts()
    print(occurrences)


# 2. EXPLORING DATASETS

#GETTING ALL COLUMNS OF BOTH FILES FOR PLANT 1

In [None]:
# List the columns of both Plant 1 files, generation data first.
for frame, dataset_name in (
    (p1_gen, "PLANT_1_GENERATION_DATA"),
    (p1_wet, "PLANT_1_WEATHER_SENSOR_DATA"),
):
    column_list_creator(frame, dataset_name)


# GETTING UNIQUE INVERTERS

In [None]:
# Each distinct SOURCE_KEY in the generation data is one inverter.
unique_cols(df=p1_gen, cols="SOURCE_KEY")

# BASIC STATS.. MEAN, MAX, MIN FOR DIFFERENT COLUMNS

In [None]:
# Max / min / mean for the generation measurements, then the weather measurements.
for frame, column in (
    (p1_gen, 'TOTAL_YIELD'),
    (p1_gen, 'DAILY_YIELD'),
    (p1_gen, 'DC_POWER'),
    (p1_gen, 'AC_POWER'),
    (p1_wet, 'AMBIENT_TEMPERATURE'),
    (p1_wet, 'MODULE_TEMPERATURE'),
    (p1_wet, 'IRRADIATION'),
):
    basic_stats(frame, column)


# GETTING UNIQUE DATA COUNTS

In [None]:
# Number of generation rows recorded for each inverter (SOURCE_KEY).
unique_data(df=p1_gen, cols="SOURCE_KEY")

# 5.What is the mean value of daily yield?

In [None]:
# Mean of the DAILY_YIELD column over every row of the generation data.
mean_daily_yield = p1_gen["DAILY_YIELD"].mean()
print("MEAN VALUE OF DAILY YIELD IS:-  ", mean_daily_yield)

# CREATING DATE, TIME,HOURS & MINUTES COLUMN SEPARATELY

In [None]:
def _add_datetime_parts(df, fmt):
    """Parse ``DATE_TIME`` and add DATE, TIME, HOUR and MINUTES columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DATE_TIME`` column. It is modified in place.
    fmt : str
        ``strftime``-style format of the ``DATE_TIME`` column.

    All four derived columns are read straight from the parsed timestamps
    through the ``.dt`` accessor, so HOUR and MINUTES no longer depend on
    re-parsing the ``datetime.time`` objects stored in TIME.
    """
    df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format=fmt)

    stamps = df['DATE_TIME'].dt
    df['DATE'] = stamps.date        # datetime.date objects
    df['TIME'] = stamps.time        # datetime.time objects
    df['HOUR'] = stamps.hour
    df['MINUTES'] = stamps.minute


# CREATING DATE AND TIME COLUMNS SEPARATELY FOR WEATHER DATA
# (timestamps are parsed as year-month-day with seconds)
_add_datetime_parts(p1_wet, '%Y-%m-%d %H:%M:%S')

# CREATING DATE AND TIME COLUMNS SEPARATELY FOR GENERATION DATA
# (timestamps are parsed as day-month-year without seconds)
_add_datetime_parts(p1_gen, '%d-%m-%Y %H:%M')


# GETTING NEW COLUMN DATA
p1_gen.info()
print("\n"*5)
p1_wet.info()

# 6.What is the total irradiation per day?

In [None]:
# Sum of IRRADIATION over all weather readings of each DATE.
(
    p1_wet
    .groupby('DATE')['IRRADIATION']
    .sum()
)

In [None]:



# PLOTTING GRAPH
# Total irradiation per DATE; groupby returns it indexed by DATE in sorted order.
irddiation_sum = p1_wet.groupby('DATE')['IRRADIATION'].sum()

# Take the x-values from the grouped result itself. `p1_wet["DATE"].unique()`
# lists dates in order of first appearance, which only lines up with the
# sorted groupby output when the file happens to be in chronological order.
unique_date = irddiation_sum.index

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, irddiation_sum)

axes.set_xlabel("DATE")
axes.set_ylabel("TOTAL IRRADIATION")
axes.set_title("TOTAL IRRADIATION PER DAY")
plt.show()





# 7.MAXIMUM AMBIENT & MODULE TEMPERATURE

In [None]:
# Highest ambient temperature over all weather readings.
max_ambient_temperature = p1_wet["AMBIENT_TEMPERATURE"].max()
print("MAXIMUM AMBIENT TEMPERATURE IS\n", max_ambient_temperature)

create_block()

# Highest module temperature over all weather readings.
max_module_temperature = p1_wet["MODULE_TEMPERATURE"].max()
print("MAXIMUM MODULE TEMPERATURE IS\n", max_module_temperature)

# 8.HOW MANY INVERTERS ARE THERE FOR EACH PLANT

In [None]:
# Every distinct plant id present in the generation data.
plant_ids = p1_gen['PLANT_ID'].unique()

for plant_id in plant_ids:
    # Distinct inverters (SOURCE_KEY) among the rows belonging to this plant.
    belongs_to_plant = p1_gen['PLANT_ID'] == plant_id
    inverter_keys = p1_gen.loc[belongs_to_plant, 'SOURCE_KEY'].unique()

    print("NUMBER OF INVERTERS FOR PLANT ", plant_id, "ARE", len(inverter_keys), ":-\n\n")

    for inverter_key in inverter_keys:
        print(inverter_key)

# 9.What is the maximum/minimum amount of DC/AC Power generated in a time interval/day?

MAXIMUM DC POWER vs MINIMUM DC POWER DATA

In [None]:
# Group the DC readings by calendar day once and reuse the grouping for both extremes.
dc_power_by_day = p1_gen.groupby('DATE')['DC_POWER']

print("MAXIMUM DC POWER IN A DAY IS:-\n ", dc_power_by_day.max())
create_block()
print("MINIMUM DC POWER IN A DAY IS:-\n ", dc_power_by_day.min())





MAXIMUM DC POWER vs MINIMUM DC POWER GRAPH

In [None]:
# PLOTTING GRAPH
# Daily maximum and minimum DC power, both indexed by DATE in sorted order.
max_dc_per_day = p1_gen.groupby('DATE')['DC_POWER'].max()
min_dc_per_day = p1_gen.groupby('DATE')['DC_POWER'].min()

# The x-values must come from the same frame (and the same grouping) as the
# y-values. Dates taken from the weather frame `p1_wet` only match by
# position if both files cover exactly the same days in the same order.
unique_date = max_dc_per_day.index

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, max_dc_per_day, color='r', label="MAX DC OUTPUT")
axes.plot(unique_date, min_dc_per_day, color='b', label="MIN DC OUTPUT")

axes.set_xlabel("DATE")
axes.set_ylabel("DC POWER")
axes.set_title("MAX v/s MIN DC POWER PER DAY")

axes.legend(loc="upper left")

plt.show()

MAXIMUM AC POWER VS MINIMUM AC POWER

In [None]:
# Group the AC readings by calendar day once and reuse the grouping for both extremes.
ac_power_by_day = p1_gen.groupby('DATE')['AC_POWER']

print("MAXIMUM AC POWER IN A DAY IS:-\n ", ac_power_by_day.max())
create_block()
print("MINIMUM AC POWER IN A DAY IS:-\n ", ac_power_by_day.min())


GRAPH FOR MAX AC POWER VS MIN AC POWER

In [None]:
# PLOTTING GRAPH
# Daily maximum and minimum AC power, both indexed by DATE in sorted order.
max_ac_per_day = p1_gen.groupby('DATE')['AC_POWER'].max()
min_ac_per_day = p1_gen.groupby('DATE')['AC_POWER'].min()

# The x-values must come from the same frame (and the same grouping) as the
# y-values. Dates taken from the weather frame `p1_wet` only match by
# position if both files cover exactly the same days in the same order.
unique_date = max_ac_per_day.index

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, max_ac_per_day, color='r', label="MAXIMUM AC OUTPUT")
axes.plot(unique_date, min_ac_per_day, color='b', label="MINIMUM AC OUTPUT")

axes.set_xlabel("DATE")
axes.set_ylabel("AC POWER")
axes.set_title("MAX v/s MIN AC POWER PER DAY")

axes.legend(loc="upper left")

plt.show()

MAXIMUM AC v/s DC Output per day

In [None]:
# Daily maximum AC and DC power, both indexed by DATE in sorted order.
max_ac_per_day = p1_gen.groupby('DATE')['AC_POWER'].max()

# This series holds the daily MAXIMUM of DC power, so it is named accordingly.
# Storing it in `min_dc_per_day` would overwrite the real daily-minimum series.
max_dc_per_day = p1_gen.groupby('DATE')['DC_POWER'].max()

# The x-values must come from the same frame (and the same grouping) as the
# y-values. Dates taken from the weather frame `p1_wet` only match by
# position if both files cover exactly the same days in the same order.
unique_date = max_ac_per_day.index

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, max_ac_per_day, color='r', label="MAXIMUM AC OUTPUT")
axes.plot(unique_date, max_dc_per_day, color='b', label="MAXIMUM DC OUTPUT")

axes.set_xlabel("DATE")
axes.set_ylabel("OUTPUT POWER")
axes.set_title("MAX DC v/s AC POWER PER DAY")

axes.legend(loc="upper left")

plt.show()

Average DC v/s AC output per day

In [None]:
# Daily mean AC and DC power, both indexed by DATE in sorted order.
avg_ac_per_day = p1_gen.groupby('DATE')['AC_POWER'].mean()
avg_dc_per_day = p1_gen.groupby('DATE')['DC_POWER'].mean()

# The x-values must come from the same frame (and the same grouping) as the
# y-values. Dates taken from the weather frame `p1_wet` only match by
# position if both files cover exactly the same days in the same order.
unique_date = avg_ac_per_day.index

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, avg_ac_per_day, color='r', label="AVERAGE AC OUTPUT")
axes.plot(unique_date, avg_dc_per_day, color='b', label="AVERAGE DC OUTPUT")

axes.set_xlabel("DATE")
axes.set_ylabel("OUTPUT POWER")
axes.set_title("AVERAGE DC v/s AC POWER PER DAY")

axes.legend(loc="upper left")

plt.show()

# 10. Which inverter (source_key) has produced maximum DC/AC power?

In [None]:
# SOURCE_KEY of every row whose DC_POWER equals the overall maximum DC_POWER.
is_peak_dc = p1_gen['DC_POWER'] == p1_gen["DC_POWER"].max()
max_dc_inverter = p1_gen.loc[is_peak_dc, 'SOURCE_KEY']
print("MAXIMUM DC POWER IS GIVEN BY INVERTER :- ", max_dc_inverter)

print("\n"*4)

# SOURCE_KEY of every row whose AC_POWER equals the overall maximum AC_POWER.
is_peak_ac = p1_gen['AC_POWER'] == p1_gen["AC_POWER"].max()
max_ac_inverter = p1_gen.loc[is_peak_ac, 'SOURCE_KEY']
print("MAXIMUM AC POWER IS GIVEN BY INVERTER :- ", max_ac_inverter)

# 11.Rank the inverters based on the DC/AC power they produce

RANKING ON THE BASIS OF AC OUTPUT

In [None]:
def takeSecond(elem):
    """Return the second item of ``elem``; sort key for [inverter, mean] pairs."""
    return elem[1]


# Mean AC power of each inverter, indexed by SOURCE_KEY.
ac_inv = p1_gen.groupby('SOURCE_KEY')['AC_POWER'].mean()

# Build the [inverter, mean] pairs from the grouped result itself so every
# SOURCE_KEY stays attached to its own mean. Pairing `unique()` (order of
# appearance) with the groupby output (sorted by key) by position attaches
# inverter names to other inverters' averages.
p_list = [[source_key, mean_power] for source_key, mean_power in ac_inv.items()]

# Highest average first.
p_list.sort(reverse=True, key=takeSecond)

print("RANKING INVERTERS BASED ON AC OUTPUT\n\n")

for rank, (source_key, mean_power) in enumerate(p_list, start=1):
    print("RANK", rank, ":- ##", source_key, "## AVERAGE AC OUPUT IS:-", mean_power, "\n")

RANKING ON THE BASIS OF DC OUTPUT

In [None]:
# Mean DC power of each inverter, indexed by SOURCE_KEY.
dc_inv = p1_gen.groupby('SOURCE_KEY')['DC_POWER'].mean()

# Build the [inverter, mean] pairs from the grouped result itself so every
# SOURCE_KEY stays attached to its own mean. Pairing `unique()` (order of
# appearance) with the groupby output (sorted by key) by position attaches
# inverter names to other inverters' averages.
p_list = [[source_key, mean_power] for source_key, mean_power in dc_inv.items()]

# Highest average first. `takeSecond` is the sort-key helper defined in the
# AC ranking cell of this notebook; it is not redefined here.
p_list.sort(reverse=True, key=takeSecond)

print("RANKING INVERTERS BASED ON DC OUTPUT\n\n")

for rank, (source_key, mean_power) in enumerate(p_list, start=1):
    print("RANK", rank, ":- ##", source_key, "## AVERAGE DC OUPUT IS:-", mean_power, "\n")

# 12. Is there any missing data?

In [None]:
print("Ideally 22 Inverters are working for 24 hours(1 day), and we are getting data every 15 min(i.e. 4 times per hour)")

# 22 inverters x 24 hours x 4 readings per hour.
ideal_rows_per_day = 22*24*4

print("Hence we can say ideally there are", ideal_rows_per_day, "number of data everyday")

In [None]:
print("This is the data collected per day:-\n\n")

# Number of generation rows recorded on each DATE, in date order.
(
    p1_gen['DATE']
    .value_counts()
    .sort_index()
)



In [None]:
# Announce the plot in the next cell; both messages are emitted by one print call.
print(
    "Now i will plot the graph for number of data per day\n",
    "And there will be one particular line which corresponds to the ideal data per day",
    sep="\n",
)

In [None]:


# Number of generation rows recorded on each DATE, in date order.
data_per_day = p1_gen['DATE'].value_counts().sort_index()

# Take the x-values from the sorted counts themselves. `p1_gen["DATE"].unique()`
# lists dates in order of first appearance, which only lines up with the
# sorted counts when the file happens to be in chronological order.
unique_date = data_per_day.index

# CREATING IDEAL DATA LIST
# 22 inverters x 24 hours x 4 readings per hour, repeated once per plotted day.
ideal_data = [22*24*4] * len(unique_date)

fig, axes = plt.subplots(figsize=(16, 9))

axes.plot(unique_date, data_per_day, color='r', label="DATA PER DAY")
axes.plot(unique_date, ideal_data, color='b', label="IDEAL DATA PER DAY")

axes.set_xlabel("DATE")
axes.set_ylabel("NUMBER OF DATA")
# The curve shows a count of rows per day, not an average, so the title says so.
axes.set_title("NUMBER OF DATA PER DAY")

axes.legend(loc="lower right")

plt.show()