In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Merging the DataFrames
1. Combine all vegetation_average.csv, temperatures.csv, and wear csv using the <b>name</b> column as the primary key
2. All recorded temperatures will need to be converted to probabilities
    - Round to the nearest hundredth
    - then divide by 100
    - if the value is greater than or equal to 1, then assign the probability of 1.0
    - ex) 80.190096 -> 0.8019 
    - ex) 101.0231 -> 1.0
3. This combined dataframe should contain the following columns: <b>name, length, wear, vegetation, and 6-1 -> 8-31 </b>for a total of 97 columns


In [2]:
temp = pd.read_csv('temperature_data.csv').rename(columns={'Unnamed: 0':'name'})
temp.iloc[:, 1:] = temp.iloc[:, 1:].round(decimals=0).div(100)
temp.iloc[:, 1:] = temp.iloc[:, 1:].mask(temp.iloc[:, 1:]>1, other = 1.0)
temp

Unnamed: 0,name,6-1,6-2,6-3,6-4,6-5,6-6,6-7,6-8,6-9,...,8-22,8-23,8-24,8-25,8-26,8-27,8-28,8-29,8-30,8-31
0,Humberto Perez Line,0.80,0.93,0.71,0.76,0.88,0.91,0.82,0.74,0.78,...,0.92,0.90,0.95,0.98,1.00,0.79,0.78,0.83,0.83,1.00
1,Kyle Bradford Line,0.95,0.78,0.95,0.87,0.74,0.97,0.74,0.92,0.91,...,0.97,1.00,0.92,0.77,0.85,1.00,0.79,0.95,0.86,0.91
2,Daniel Gonzalez Line,0.89,0.87,0.96,0.80,0.72,0.82,0.80,0.73,0.96,...,0.93,0.79,0.74,0.93,0.99,0.77,0.84,0.77,0.93,0.90
3,Cheryl White Line,0.95,0.89,0.73,0.72,0.71,0.91,0.77,0.90,0.83,...,0.77,0.98,0.84,0.80,0.80,0.81,1.00,0.81,0.99,0.87
4,Gordon Atkins Line,0.71,0.77,0.81,0.71,0.93,0.77,0.87,0.83,0.84,...,0.94,0.87,0.83,0.97,0.75,0.86,1.00,0.85,0.91,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,Ian Correa Line,0.87,0.95,0.84,0.74,0.94,0.72,0.73,0.85,0.88,...,0.86,0.82,0.86,0.81,0.84,0.79,0.80,1.00,0.83,0.85
1796,Thomas Guinn Line,0.73,0.73,0.70,0.77,0.93,0.80,0.88,0.75,0.94,...,1.00,0.92,0.97,0.82,0.75,1.00,0.74,0.83,1.00,0.83
1797,Charles Sparks Line,0.96,0.92,0.87,0.80,0.77,0.70,0.85,0.78,0.81,...,0.99,0.85,1.00,0.77,0.75,0.83,0.87,1.00,0.98,0.92
1798,Linda Santos Line,0.77,0.74,0.85,0.90,0.84,0.77,0.75,0.89,0.71,...,0.95,0.94,0.84,0.88,0.86,0.79,1.00,0.79,0.88,0.93


In [3]:
wear = pd.read_csv('wear_data.csv').iloc[:,1:]
wear

Unnamed: 0,name,length,wear
0,Humberto Perez Line,short,0.325903
1,Kyle Bradford Line,short,0.076689
2,Daniel Gonzalez Line,short,0.572537
3,Cheryl White Line,long,0.958884
4,Gordon Atkins Line,medium,0.324468
...,...,...,...
1795,Ian Correa Line,medium,0.359471
1796,Thomas Guinn Line,medium,0.444667
1797,Charles Sparks Line,medium,0.748239
1798,Linda Santos Line,long,0.652229


In [4]:
veg = pd.read_csv('vegetation_average.csv').iloc[:, 1:]
veg.columns = veg.columns.str.strip()
veg = veg.transpose().reset_index()
veg.rename(columns={'index':'name', veg.columns[1]:'vegetation'}, inplace=True)
veg

Unnamed: 0,name,vegetation
0,Humberto Perez Line,0.373407
1,Kyle Bradford Line,0.051829
2,Daniel Gonzalez Line,0.954363
3,Cheryl White Line,0.249980
4,Gordon Atkins Line,0.583971
...,...,...
1795,Ian Correa Line,0.796892
1796,Thomas Guinn Line,0.383517
1797,Charles Sparks Line,0.400369
1798,Linda Santos Line,0.233560


In [5]:
merged_df = wear.merge(veg, how='outer', on='name')
merged_df = merged_df.merge(temp, how='outer', on='name')
merged_df

Unnamed: 0,name,length,wear,vegetation,6-1,6-2,6-3,6-4,6-5,6-6,...,8-22,8-23,8-24,8-25,8-26,8-27,8-28,8-29,8-30,8-31
0,Humberto Perez Line,short,0.325903,0.373407,0.80,0.93,0.71,0.76,0.88,0.91,...,0.92,0.90,0.95,0.98,1.00,0.79,0.78,0.83,0.83,1.00
1,Kyle Bradford Line,short,0.076689,0.051829,0.95,0.78,0.95,0.87,0.74,0.97,...,0.97,1.00,0.92,0.77,0.85,1.00,0.79,0.95,0.86,0.91
2,Daniel Gonzalez Line,short,0.572537,0.954363,0.89,0.87,0.96,0.80,0.72,0.82,...,0.93,0.79,0.74,0.93,0.99,0.77,0.84,0.77,0.93,0.90
3,Cheryl White Line,long,0.958884,0.249980,0.95,0.89,0.73,0.72,0.71,0.91,...,0.77,0.98,0.84,0.80,0.80,0.81,1.00,0.81,0.99,0.87
4,Gordon Atkins Line,medium,0.324468,0.583971,0.71,0.77,0.81,0.71,0.93,0.77,...,0.94,0.87,0.83,0.97,0.75,0.86,1.00,0.85,0.91,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,Ian Correa Line,medium,0.359471,0.796892,0.87,0.95,0.84,0.74,0.94,0.72,...,0.86,0.82,0.86,0.81,0.84,0.79,0.80,1.00,0.83,0.85
1796,Thomas Guinn Line,medium,0.444667,0.383517,0.73,0.73,0.70,0.77,0.93,0.80,...,1.00,0.92,0.97,0.82,0.75,1.00,0.74,0.83,1.00,0.83
1797,Charles Sparks Line,medium,0.748239,0.400369,0.96,0.92,0.87,0.80,0.77,0.70,...,0.99,0.85,1.00,0.77,0.75,0.83,0.87,1.00,0.98,0.92
1798,Linda Santos Line,long,0.652229,0.233560,0.77,0.74,0.85,0.90,0.84,0.77,...,0.95,0.94,0.84,0.88,0.86,0.79,1.00,0.79,0.88,0.93


## Creating FireLineRisk class
This class should accept one argument called df that is the dataframe created above. You will refer to this dataframe as self.df throughout the class

1. Create a method called <b>calculate_risk_by_day</b>
    - it must accept vegetation, wear, weather and return the product of the 3
2. Create method called <b>create_risk_df</b>
    - it will generate a dataframe that will be stored as a class attribute called <b>risk_df</b>, it will return nothing.
    - A probability must be calculated for each day using using the formula from step 1
    - this new dataframe should have a column for <b>name, length, and  6-1 -> 8-31</b>
3. Create a method called <b>show_probability_histogram</b>
    - Using matplotlib create a histogram based on the average probabilities of each line
        - remember each line will contain 93 probabilities, it is the average of these values
    - add a parameter called <b>with_average</b> and set it equal to False by default
    - when this parameter is True draw a vertical line where the average of all averages is located
    - Even if with_average is False, generate the average value is it will be stored in your title
    - add this title "Fire Probability by Frequency (Average of {<b>the average you have calculated</b>})"
    - the label of y the axis "Frequency"
    - the label of the x axis "Fire Probability"
4. Create a method called <b>plot_by_length</b>, a plot with three lines, that will contain a list of average probabilities based on <b>length</b> ( short/medium/long )
    - sort the data from least to greatest and plot
    - all lines should be in a different color
    - include a legend that will distinguish which lines belong to what length ( short/medium/long )
    - title: "Probabilities By Line", x axis title: "Lines", y axis title: "Fire Probability"
5. There is a probability threshold of 0.35, when probabilities are greater than this value, the company is forced to shut off power to avoid any fires. This costs <b>150 dollars</b> on a per line basis
    - create a method called <b>total_cost</b> that takes a parameter <b>threshold</b> that is set to 0.35 by default
    - return the total cost for all lines and days
6. Determine the total cost by month and create a bar chart
    - create a method called <b>total_cost_by_month</b> that shows a bar chart for each month and returns dictionary where month ("june", "july", and "august") are the keys and the values are the sum
    - add labels to the chart that best describe each axis and title
    - make sure each are a different color
   
--- 

7. Vegetation Management plays a huge role in mitigating fires. Trees that collapse on powerlines may ignite massive fires and cause catastrophic damages.
    - Performing this mitigation action reduces the <b>vegetation probability in half</b> at a large cost of <b>50,000 dollars per line</b>
    - Determine which lines need vegetation management to save the most money
8. The company has set the threshold to 0.2, but now vegetation management cost will vary based on length. The users have the ability to choose mitigation of the current lines <b>size or smaller</b>. Meaning if I have a large line I can decide to manage a smaller portion of the line for a smaller fee and for a smaller impact on the vegetation probability
    - small = 15,000 dollars, mitigation impact (small = 0.5, medium = 0.35, large=0.2)
    - medium = 30,000 dollars, mitigation impact (medium = 0.5, large=0.25)
    - large = 50,000 dollars, mitigation impact (large = 0.5)
    
    ex) large line, vegetation probability of 0.8
        - at large vegetation management for 50,000 dollars we reduce the probability of vegetation to 0.4 ((1-0.5) * 0.8)
        - at medium vegetation management for 30,000 dollars we reduce the probability of vegetation to 0.52 ((1-0.35) * 0.8)
        - at small vegetation management for 15,000 dollars we reduce the probability of vegetation to 0.64 ((1-0.2) * 0.8)
        
    - using all of this information, determine the overall cost before vegetation management where the threshold is 0.2
    - determine the lowest cost based on optimal use of the vegetation management system
    - create a dictionary for each line with the following keys and them to list, then create a dataframe with this information
       - original cost
       - reduced cost
       - length
       - mitigation size
    - Fill original cost and reduced cost with the same value if they are identical and set mitigation size to null if no mitigation was performed <b>Mitigation = Vegetation Management</b>


In [6]:
class FireLineRisk:
    
    def __init__(self, df):
        self.df = df
        
    def calculate_risk_by_day(vegetation, wear, weather):
        
        return vegetation*wear*weather
    
    def create_risk_df():
        pd.df = 
    
    def show_probability_histogram(with_average = False):
        
        return
    
    def plot_by_length():
        
        return
    
    def total_cost(threshold=0.35):
        
        return
    
    def total_cost_by_month():
        
        return
        