In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
csv_filesPATH = glob.glob(pathname='../../pandas-workout-data/data/*,*')
csv_filesPATH

['../../pandas-workout-data/data\\albany,ny.csv',
 '../../pandas-workout-data/data\\boston,ma.csv',
 '../../pandas-workout-data/data\\chicago,il.csv',
 '../../pandas-workout-data/data\\los+angeles,ca.csv',
 '../../pandas-workout-data/data\\new+york,ny.csv',
 '../../pandas-workout-data/data\\san+francisco,ca.csv',
 '../../pandas-workout-data/data\\springfield,il.csv',
 '../../pandas-workout-data/data\\springfield,ma.csv']

In [3]:
list_df = []
for path in csv_filesPATH:
    print(f'Reading csv from ==> {path}')
    city_name = path.split('/')[-1].replace('data\\', '').replace('.csv', '') # getting just the city name
    temp_max_column = city_name + '_maxtempC'
    temp_min_column = city_name + '_mintempC'
    precip_column = city_name +'_precipMM'
    city_name = city_name.split(',') 
    df = (pd.read_csv(filepath_or_buffer=path, usecols=[temp_max_column, temp_min_column, precip_column])
          .assign(city = city_name[0].replace('+', ' ').title(), state=city_name[1].upper()))
    # changing the names os the column to get uniform columns names to concat
    names = {                           
        temp_max_column: 'max_temp',
        temp_min_column: 'min_temp',
        precip_column: 'precipMM'
    }
    df = df.rename(columns=names)
    list_df.append(df)

Reading csv from ==> ../../pandas-workout-data/data\albany,ny.csv
Reading csv from ==> ../../pandas-workout-data/data\boston,ma.csv
Reading csv from ==> ../../pandas-workout-data/data\chicago,il.csv
Reading csv from ==> ../../pandas-workout-data/data\los+angeles,ca.csv
Reading csv from ==> ../../pandas-workout-data/data\new+york,ny.csv
Reading csv from ==> ../../pandas-workout-data/data\san+francisco,ca.csv
Reading csv from ==> ../../pandas-workout-data/data\springfield,il.csv
Reading csv from ==> ../../pandas-workout-data/data\springfield,ma.csv


In [4]:
df = pd.concat(list_df)
df

Unnamed: 0,max_temp,min_temp,precipMM,city,state
0,-2,-8,0.0,Albany,NY
1,-2,-8,0.0,Albany,NY
2,-2,-8,0.0,Albany,NY
3,-2,-8,0.0,Albany,NY
4,-2,-8,0.0,Albany,NY
...,...,...,...,...,...
723,5,-2,0.0,Springfield,MA
724,5,-2,0.0,Springfield,MA
725,5,-2,0.0,Springfield,MA
726,5,-2,0.0,Springfield,MA


In [5]:
df['city'].unique()

array(['Albany', 'Boston', 'Chicago', 'Los Angeles', 'New York',
       'San Francisco', 'Springfield'], dtype=object)

Determine which cities had, on at least three occasions, precipitation of 15 mm or more.

In [35]:
# This functions is a little hard-coding
# def at_least_three_occasion_precepitation_15mm_or_more(df):
#     return (df['precipMM'] >= 15).sum() >= 3

In [36]:
def at_least_three_occasion_precepitation_15mm_or_more(mini_df, min_mm, times):
    return (mini_df['precipMM'] >= min_mm).sum() >= times

In [49]:
# df.groupby('city') for this case we can't do this because there are two cities called Springfield one in Illinois and the other in Massachussets
df.groupby(['city', 'state'])
df.groupby(['city', 'state']).filter(lambda x: (x['precipMM'] >= 15).sum() >= 3)

Unnamed: 0,max_temp,min_temp,precipMM,city,state
0,1,-4,0.0,Boston,MA
1,1,-4,0.0,Boston,MA
2,1,-4,0.0,Boston,MA
3,1,-4,0.0,Boston,MA
4,1,-4,0.0,Boston,MA
...,...,...,...,...,...
723,6,2,0.0,New York,NY
724,6,2,0.0,New York,NY
725,6,2,0.0,New York,NY
726,6,2,0.0,New York,NY


In [50]:
df.groupby(['city', 'state']).filter(at_least_three_occasion_precepitation_15mm_or_more, min_mm =15, times=3)[['city', 'state']].drop_duplicates()

Unnamed: 0,city,state
0,Boston,MA
0,Los Angeles,CA
0,New York,NY


In [51]:
df.groupby(['city', 'state']).filter(lambda x: (x['precipMM'] >= 15).sum() >= 3)['city'].unique()

array(['Boston', 'Los Angeles', 'New York'], dtype=object)

Find cities that had at least three measurements of 10 mm of precipitation or more when the temperature was at or below 0° Celsius.

In [70]:
def has_multiple_readings_at_least(mini_df, min_precipMM, times):
    return mini_df.loc[
        (mini_df['min_temp'] <= 0) & (mini_df['precipMM'] >= min_precipMM), 'precipMM'
        ].count() >= times

In [71]:
(
    df.groupby(['city', 'state'])
    .filter(has_multiple_readings_at_least, min_precipMM=10, times=3)['city']
    .unique())

array(['Albany', 'Boston', 'New York'], dtype=object)

For each precipitation measurement, calculate the proportion of that city’s total precipitation.

In [75]:
def proportion_of_city_precip(s):
    return s/s.sum()

In [76]:
df.groupby('city')['precipMM'].transform(proportion_of_city_precip)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
723    0.0
724    0.0
725    0.0
726    0.0
727    0.0
Name: precipMM, Length: 5824, dtype: float64

In [79]:
df['precip_proportion'] = df.groupby('city')['precipMM'].transform(proportion_of_city_precip)
df

Unnamed: 0,max_temp,min_temp,precipMM,city,state,precip_proportion
0,-2,-8,0.0,Albany,NY,0.0
1,-2,-8,0.0,Albany,NY,0.0
2,-2,-8,0.0,Albany,NY,0.0
3,-2,-8,0.0,Albany,NY,0.0
4,-2,-8,0.0,Albany,NY,0.0
...,...,...,...,...,...,...
723,5,-2,0.0,Springfield,MA,0.0
724,5,-2,0.0,Springfield,MA,0.0
725,5,-2,0.0,Springfield,MA,0.0
726,5,-2,0.0,Springfield,MA,0.0


For each city, determine the greatest proportion of that city’s total precipitation to fall in a given period.

In [81]:
df.groupby(['city', 'state'])['precip_proportion'].max()


city           state
Albany         NY       0.029228
Boston         MA       0.048302
Chicago        IL       0.057257
Los Angeles    CA       0.059242
New York       NY       0.055149
San Francisco  CA       0.056509
Springfield    IL       0.030977
               MA       0.023459
Name: precip_proportion, dtype: float64