Descriptive Statistics - Measures of Central Tendency and variability
Perform the following operations on any open source dataset (e.g., data.csv)
1. Provide summary statistics (mean, median, minimum, maximum, standard 
deviation) for a dataset (age, income, etc.) with numeric variables grouped by one of 
the qualitative (categorical) variables. For example, if your categorical variable is age 
group and your quantitative variable is income, then provide summary statistics of 
income grouped by the age groups. Create a list that contains a numeric value for 
each response to the categorical variable.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [16]:
# Load the employee dataset from a CSV file given by a relative path.
# NOTE(review): the loaded frame contains an `Unnamed: 0` column, which
# suggests the CSV was written with its index; consider `index_col=0` — confirm.
df = pd.read_csv(filepath_or_buffer="employee_data.csv")

In [25]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,groups,age,healthy_eating,active_lifestyle,salary
0,0,0,A,36,5,5,2297
1,1,1,A,55,3,5,1134
2,2,2,A,61,8,1,4969
3,3,3,O,29,3,6,902
4,4,4,O,34,6,2,3574


# 1.Mean

In [37]:
# Column-wise mean of every float64/int64 column (non-numeric columns are dropped first).
df.select_dtypes(include=["float64", "int64"]).mean(axis="index")

Unnamed: 0           499.500
id                   499.500
age                   41.155
healthy_eating         4.944
active_lifestyle       5.683
salary              2227.461
dtype: float64

In [26]:
# Mean of the `age` column only.
df["age"].mean()

41.155

In [36]:
# Row-wise mean across the float64/int64 columns, shown for the first four rows.
# NOTE(review): this averages unrelated quantities (ids, age, salary, ...) per row.
df.select_dtypes(include=["float64", "int64"]).mean(axis="columns").head(4)

0    390.500000
1    199.833333
2    840.500000
3    157.666667
dtype: float64

# 2.Median

In [35]:
# Column-wise median of every float64/int64 column.
df.select_dtypes(include=["float64", "int64"]).median(axis="index")


Unnamed: 0           499.5
id                   499.5
age                   41.0
healthy_eating         5.0
active_lifestyle       6.0
salary              2174.0
dtype: float64

In [29]:
# Median of the `age` column only.
df["age"].median()

41.0

In [38]:
# Row-wise median across the float64/int64 columns, shown for the first four rows.
df.select_dtypes(include=["float64", "int64"]).median(axis="columns").head(4)

0    5.0
1    4.0
2    5.0
3    4.5
dtype: float64

# 3.Mode

In [31]:
# Mode of every column; a column with several equally frequent values
# occupies several rows, and shorter columns are padded with NaN.
df.mode(axis="index")

Unnamed: 0.1,Unnamed: 0,id,groups,age,healthy_eating,active_lifestyle,salary
0,0,0,A,62.0,5.0,6.0,2646.0
1,1,1,O,,,,
2,2,2,,,,,
3,3,3,,,,,
4,4,4,,,,,
...,...,...,...,...,...,...,...
995,995,995,,,,,
996,996,996,,,,,
997,997,997,,,,,
998,998,998,,,,,


In [33]:
# Mode of the `salary` column only (returned as a Series, since ties are possible).
df["salary"].mode()

0    2646
Name: salary, dtype: int64


# 1.Minimum

In [39]:
# Minimum of every column, including the string-valued `groups` column.
df.min(axis="index")

Unnamed: 0            0
id                    0
groups                A
age                  18
healthy_eating        0
active_lifestyle      0
salary              553
dtype: object

In [40]:
# Minimum of `healthy_eating`; skipna=False means missing values are not ignored.
df["healthy_eating"].min(skipna=False)

0

# 2.Maximum

In [41]:
# Maximum of every column, including the string-valued `groups` column.
df.max(axis="index")

Unnamed: 0           999
id                   999
groups                 O
age                   64
healthy_eating        10
active_lifestyle      10
salary              5550
dtype: object

In [42]:
# Maximum of `healthy_eating`; skipna=False means missing values are not ignored.
df["healthy_eating"].max(skipna=False)

10

# 3. Standard Deviation

In [43]:
# Column-wise sample standard deviation (ddof=1) of every float64/int64 column.
df.select_dtypes(include=["float64", "int64"]).std(axis="index", ddof=1)

Unnamed: 0           288.819436
id                   288.819436
age                   13.462995
healthy_eating         2.013186
active_lifestyle       2.048587
salary              1080.209760
dtype: float64

In [44]:
# Sample standard deviation (ddof=1) of the `age` column only.
df["age"].std(ddof=1)

13.46299473472478

In [45]:
# Row-wise sample standard deviation across the float64/int64 columns,
# shown for the first four rows.
df.select_dtypes(include=["float64", "int64"]).std(axis="columns", ddof=1).head(4)

0     934.089236
1     458.130294
2    2022.677112
3     364.789071
dtype: float64

# Grouped by

In [47]:
# Mean age within each category of `groups`.
df.groupby("groups")["age"].agg("mean")

groups
A     41.029333
AB    42.976000
B     42.048000
O     40.376000
Name: age, dtype: float64

In [48]:
# Work on a renamed copy (`salary` -> `Income`); `df` itself is left unchanged.
df_u = df.rename(columns={"salary": "Income"})
# Mean income within each category of `groups`.
df_u.groupby("groups")["Income"].mean()

groups
A     2176.221333
AB    2261.688000
B     2232.776000
O     2265.520000
Name: Income, dtype: float64

In [50]:
# One-hot encode `active_lifestyle`: one indicator column per distinct value.
enc = preprocessing.OneHotEncoder()
onehot = enc.fit_transform(df[["active_lifestyle"]])
# Convert the encoder output to a dense array before wrapping it in a DataFrame;
# the resulting columns get default integer labels.
enc_df = pd.DataFrame(onehot.toarray())
enc_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Label-encode the categorical `groups` column into integer codes.
#
# `preprocessing` comes from the notebook's import cell, so it is not
# re-imported here.
#
# The encoded values go into a new frame, `dfl`, instead of being written back
# into `df`. `df` therefore keeps its original string labels, and cells that
# read `df['groups']` (such as the grouped summaries) give the same result when
# re-run. `dfl` is an independent frame rather than a second name for `df`.
enc = preprocessing.LabelEncoder()
dfl = df.assign(groups=enc.fit_transform(df['groups']))
dfl

Unnamed: 0.1,Unnamed: 0,id,groups,age,healthy_eating,active_lifestyle,salary
0,0,0,0,36,5,5,2297
1,1,1,0,55,3,5,1134
2,2,2,0,61,8,1,4969
3,3,3,3,29,3,6,902
4,4,4,3,34,6,2,3574
...,...,...,...,...,...,...,...
995,995,995,3,33,7,7,2996
996,996,996,3,21,1,2,667
997,997,997,3,49,9,7,4158
998,998,998,1,56,6,7,2414


In [55]:
# Append the one-hot indicator columns to the renamed frame, aligning on the row index.
df_encode = df_u.join(enc_df, how="left")
df_encode.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,groups,age,healthy_eating,active_lifestyle,Income,0,1,2,3,4,5,6,7,8,9,10
0,0,0,A,36,5,5,2297,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,A,55,3,5,1134,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,2,A,61,8,1,4969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,O,29,3,6,902,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,4,O,34,6,2,3574,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
