In [1]:
#Descriptive Statistics is the building block of data science. Advanced analytics is often incomplete without analyzing descriptive statistics of the key metrics. In simple terms, descriptive statistics can be defined as the measures that summarize a given data, and these measures can be broken down further into the measures of central tendency and the measures of dispersion.

#Measures of central tendency include mean, median, and the mode, while the measures of variability include standard deviation, variance, and the interquartile range. In this guide, you will learn how to compute these measures of descriptive statistics and use them to interpret the data.

#We will cover the topics given below:

#Mean
#Median
#Mode
#Standard Deviation
#Variance
#Interquartile Range
#Skewness

In [1]:
import pandas as pd
import numpy as np
import statistics as st 

# Load the data
df = pd.read_csv("loan_data.csv")
print(df.shape)
print(df.info())

(614, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Age                513 non-null    float64
 3   Married            611 non-null    object 
 4   Dependents         599 non-null    object 
 5   Education          614 non-null    object 
 6   Self_Employed      582 non-null    object 
 7   ApplicantIncome    614 non-null    int64  
 8   CoapplicantIncome  614 non-null    float64
 9   LoanAmount         592 non-null    float64
 10  Loan_Amount_Term   600 non-null    float64
 11  Credit_History     564 non-null    float64
 12  Property_Area      614 non-null    object 
 13  Loan_Status        614 non-null    object 
dtypes: float64(5), int64(1), object(8)
memory usage: 67.3+ KB
None


In [3]:
df.mean()

Age                    32.101365
ApplicantIncome      5403.459283
CoapplicantIncome    1621.245798
LoanAmount            146.412162
Loan_Amount_Term      342.000000
Credit_History          0.842199
dtype: float64

In [4]:
print(df.loc[:,'Age'].mean())
print(df.loc[:,'ApplicantIncome'].mean()) 

32.10136452241716
5403.459283387622


In [5]:
df.mean(axis = 1)[0:10]

0    1249.000000
1    1316.000000
2     575.333333
3     908.000000
4    1088.666667
5    1713.500000
6     722.500000
7    1017.166667
8    1017.833333
9    4092.833333
dtype: float64

In [6]:
df.median(axis=1)

0       35.0
1      360.0
2       45.5
3      240.0
4       85.5
       ...  
609     48.0
610     45.0
611    246.5
612    116.0
613     79.5
Length: 614, dtype: float64

In [7]:
df.median()

Age                    30.0
ApplicantIncome      3812.5
CoapplicantIncome    1188.5
LoanAmount            128.0
Loan_Amount_Term      360.0
Credit_History          1.0
dtype: float64

In [8]:
#to compute a median of a some column
print(df.loc[:,'Age'].median())
print(df.loc[:,'ApplicantIncome'].median())

df.median(axis = 1)[0:10]

30.0
3812.5


0     35.0
1    360.0
2     45.5
3    240.0
4     85.5
5    313.5
6    227.5
7    259.0
8    264.0
9    354.5
dtype: float64

In [9]:
df.mode()

Unnamed: 0,Loan_ID,Gender,Age,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,25.0,Yes,0,Graduate,No,2500.0,0.0,120.0,360.0,1.0,Semiurban,Y
1,LP001003,,,,,,,,,,,,,
2,LP001005,,,,,,,,,,,,,
3,LP001006,,,,,,,,,,,,,
4,LP001008,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,,,,,,,,,,,,,
610,LP002979,,,,,,,,,,,,,
611,LP002983,,,,,,,,,,,,,
612,LP002984,,,,,,,,,,,,,


# Measures the Dispersion

In [10]:
#Measure the Standard deviation
df.std()

Age                     7.732178
ApplicantIncome      6109.041673
CoapplicantIncome    2926.248369
LoanAmount             85.587325
Loan_Amount_Term       65.120410
Credit_History          0.364878
dtype: float64

In [12]:
print(df.loc[:,'Age'].std())
print(df.loc[:,'ApplicantIncome'].std())

#calculate the standard deviation of the first five rows 
df.std(axis = 1)[0:10]

7.732178229043358
6109.041673387174


0    2575.928085
1    1921.240355
2    1195.703252
3    1219.011567
4    2409.946528
5    2430.485610
6     974.539224
7    1373.763650
8    1569.760799
9    6081.668239
dtype: float64

In [13]:
#easure the Variance
df.var()

Age                  5.978658e+01
ApplicantIncome      3.732039e+07
CoapplicantIncome    8.562930e+06
LoanAmount           7.325190e+03
Loan_Amount_Term     4.240668e+03
Credit_History       1.331362e-01
dtype: float64

In [14]:
#Measures the Interquartile Range (IQR)
from scipy.stats import iqr
iqr(df['Age'])

nan

In [3]:
print(df.skew())

Age                  0.712146
ApplicantIncome      6.539513
CoapplicantIncome    7.491531
LoanAmount           2.677552
Loan_Amount_Term    -2.362414
Credit_History      -1.882361
dtype: float64


In [4]:
#The skewness values can be interpreted in the following manner:

#Highly skewed distribution: If the skewness value is less than −1 or greater than +1.

#Moderately skewed distribution: If the skewness value is between −1 and −½ or between +½ and +1.

#Approximately symmetric distribution: If the skewness value is between −½ and +½.

In [5]:
df.describe()

Unnamed: 0,Age,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,513.0,614.0,614.0,592.0,600.0,564.0
mean,32.101365,5403.459283,1621.245798,146.412162,342.0,0.842199
std,7.732178,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,24.0,150.0,0.0,9.0,12.0,0.0
25%,25.0,2877.5,0.0,100.0,360.0,1.0
50%,30.0,3812.5,1188.5,128.0,360.0,1.0
75%,38.0,5795.0,2297.25,168.0,360.0,1.0
max,56.0,81000.0,41667.0,700.0,480.0,1.0


In [6]:
df.describe(include='all')

Unnamed: 0,Loan_ID,Gender,Age,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,614,601,513.0,611,599.0,614,582,614.0,614.0,592.0,600.0,564.0,614,614
unique,614,2,,2,4.0,2,2,,,,,,3,2
top,LP001157,Male,,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,1,489,,398,345.0,480,500,,,,,,233,422
mean,,,32.101365,,,,,5403.459283,1621.245798,146.412162,342.0,0.842199,,
std,,,7.732178,,,,,6109.041673,2926.248369,85.587325,65.12041,0.364878,,
min,,,24.0,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,25.0,,,,,2877.5,0.0,100.0,360.0,1.0,,
50%,,,30.0,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,38.0,,,,,5795.0,2297.25,168.0,360.0,1.0,,


In [7]:
df.groupby('Age').count()

Unnamed: 0_level_0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
24.0,22,21,21,19,22,20,22,22,21,21,20,22,22
25.0,107,105,107,105,107,102,107,107,102,104,100,107,107
26.0,87,86,87,87,87,81,87,87,86,86,81,87,87
27.0,23,23,22,22,23,22,23,23,22,21,23,23,23
28.0,4,4,4,4,4,2,4,4,3,3,4,4,4
30.0,53,53,53,52,53,52,53,53,51,52,47,53,53
31.0,23,22,23,20,23,23,23,23,22,23,20,23,23
32.0,18,18,18,18,18,18,18,18,17,17,15,18,18
35.0,7,7,7,6,7,7,7,7,6,6,7,7,7
37.0,18,17,18,18,18,16,18,18,17,18,16,18,18


In [8]:
df.groupby('Age')['ApplicantIncome']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CE88961AF0>

In [9]:
df.groupby('Age')['ApplicantIncome'].sum()

Age
24.0    109185
25.0    538078
26.0    423267
27.0    178232
28.0     12637
30.0    314638
31.0    174155
32.0     73848
35.0     64332
37.0     99041
38.0    170038
40.0     87414
42.0     28751
43.0    205937
45.0    142359
46.0     38018
47.0    138701
50.0      4106
56.0      6540
Name: ApplicantIncome, dtype: int64

In [12]:
data = {'Gender':['m','f','f','m','f','m','m'],'Age':[24,25,26,27,28,30,32]}
df_sample = pd.DataFrame(data)
df_sample

Unnamed: 0,Gender,Age
0,m,24
1,f,25
2,f,26
3,m,27
4,f,28
5,m,30
6,m,32


In [13]:
f_filter = df_sample['Gender']=='f'
print(df_sample[f_filter])

m_filter = df_sample['Gender']=='m'
print(df_sample[m_filter])

  Gender  Age
1      f   25
2      f   26
4      f   28
  Gender  Age
0      m   24
3      m   27
5      m   30
6      m   32
