# Advanced Data Analysis

## Group by function 

In [1]:
import pandas as pd 

In [2]:
# Load the insurance dataset from the CSV file next to this notebook.
insurance_data = pd.read_csv(filepath_or_buffer="insurance.csv")

In [3]:
# Preview the loaded frame; pandas abbreviates a long frame to its first and last rows.
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### Perform Initial Analysis

In [4]:
# Count the missing values in every column (isnull is an alias of isna).
insurance_data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
# Inspect each column's dtype to tell the numeric columns from the text (object) ones.
insurance_data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
# Summary statistics; by default only the numeric columns
# (age, bmi, children, charges) are included.
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
# List the distinct region labels in order of first appearance.
pd.unique(insurance_data["region"])

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

### Pull Up average Insurance Charges by region

#### Discrete and Continuous Data

##### Discrete - Countable / Can't be segmented
##### Continuous - Measurable / Have units / Can be segmented

In [19]:
# Column kinds in the insurance data:
# Discrete   - sex, children, smoker, region
# Continuous - age, bmi, charges

In [22]:
# In groupby, `by` names the discrete column(s) to group on; the continuous column to
# aggregate (e.g. "charges") is selected afterwards, followed by the aggregation itself.

In [14]:
# Mean charges per region, rounded to 2 decimals and sorted from lowest to highest.
(
    insurance_data
    .groupby("region")["charges"]
    .mean()
    .round(2)
    .sort_values()
)

region
southwest    12346.94
northwest    12417.58
northeast    13406.38
southeast    14735.41
Name: charges, dtype: float64

In [None]:
### Pull Up average Insurance Charges by region and sex

In [21]:
# Mean charges for every (region, sex) pair, rounded to 2 decimals and sorted
# from lowest to highest; the result is indexed by both grouping columns.
(
    insurance_data
    .groupby(["region", "sex"])["charges"]
    .mean()
    .round(2)
    .sort_values()
)

region     sex   
southwest  female    11274.41
northwest  male      12354.12
           female    12479.87
northeast  female    12953.20
southwest  male      13412.88
southeast  female    13499.67
northeast  male      13854.01
southeast  male      15879.62
Name: charges, dtype: float64

### Pivot Table

In [25]:
# Same region/sex averages as the groupby above, laid out as a grid:
# one row per region, one column per sex. "mean" is pivot_table's default aggregation.
insurance_data.pivot_table(
    values="charges",
    index="region",
    columns="sex",
    aggfunc="mean",
).round(2)

sex,female,male
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,12953.2,13854.01
northwest,12479.87,12354.12
southeast,13499.67,15879.62
southwest,11274.41,13412.88


In [27]:
# More than one column key: every (sex, children, smoker) combination gets its own
# column of mean charges; combinations with no rows in a region are left as NaN.
insurance_data.pivot_table(
    values="charges",
    index="region",
    columns=["sex", "children", "smoker"],
    aggfunc="mean",
).round(2)

sex,female,female,female,female,female,female,female,female,female,female,...,male,male,male,male,male,male,male,male,male,male
children,0,0,1,1,2,2,3,3,4,5,...,0,1,1,2,2,3,3,4,4,5
smoker,no,yes,no,yes,no,yes,no,yes,no,no,...,yes,no,yes,no,yes,no,yes,no,yes,no
region,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
northeast,8169.64,24834.34,10920.69,31083.68,11937.8,24864.9,8962.8,29223.84,14542.42,,...,29394.86,8912.3,33847.86,11311.29,18357.64,6589.45,37190.63,14442.28,,6978.97
northwest,7560.08,28837.31,8196.13,32308.59,10659.31,29788.51,10187.45,30457.81,11024.42,8965.8,...,27109.42,7808.83,21799.78,9267.69,31741.99,11178.72,38694.56,8186.93,21472.48,
southeast,7746.72,36207.88,7893.36,23376.32,6922.9,34046.01,13683.15,31802.65,18267.1,9923.89,...,34889.05,8174.22,35925.52,8068.38,38993.29,10290.07,33450.01,8726.91,,10306.99
southwest,7221.16,27428.38,8575.39,38036.63,9951.18,36861.55,8438.74,,10945.68,7023.78,...,33134.03,6664.47,32457.95,7588.34,38064.9,8889.83,21616.88,6785.72,29062.18,6864.67
