# Descriptive Statistics
### Author: Prof. Sandro Camargo <github.com/sandrocamargo>
### Data Mining Course <https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213>
#### This script uses the basic concepts of descriptive statistics.
##### In this script, we use the Iris dataset: <https://archive.ics.uci.edu/dataset/53/iris>

To open this notebook in your Google Colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/data-mining/blob/main/Python/md03_Descriptive_Statistics.ipynb).

In [None]:
# Download and unzip the dataset
# The leading "!" hands the line to the system shell (IPython/Colab syntax).
# wget -c resumes a partially downloaded file instead of starting over.
!wget -c https://archive.ics.uci.edu/static/public/53/iris.zip
# unzip -u only extracts files that are new or newer than those already on disk.
!unzip -u iris.zip

In [None]:
# Load the raw Iris file into a DataFrame and preview it
import pandas as pd

# iris.data ships without a header row, so the column names are supplied here.
column_names = [
    'Sepal Length',
    'Sepal Width',
    'Petal Length',
    'Petal Width',
    'Species',
]
data = pd.read_csv('iris.data', header=None, names=column_names)
data.head()  # first 5 samples

# Computing Central Tendency Measures

In [None]:
# Central tendency of the sepal attributes: mean, median and mode.
sepal_length = data['Sepal Length']
sepal_width = data['Sepal Width']

# --- Mean ---
print("Sepal Length Mean:", sepal_length.mean(), "\n")  # one attribute
print(data.iloc[:, :4].mean())      # every row, columns 0 through 3
print(data.iloc[:, [0, 2]].mean())  # every row, columns 0 and 2 only

# --- Median ---
print("\nSepal Length Median:", sepal_length.median())
print("Sepal Width Median:", sepal_width.median())

# --- Mode ---
# mode() returns a Series because several values may tie as most frequent.
print("\nSepal Length Mode:", sepal_length.mode())
print("Sepal Width Mode:", sepal_width.mode())

# Computing Dispersion Measures

In [None]:
# Dispersion measures for the 'Sepal Length' attribute.

# Minimum (smallest observed value)
print("Sepal Length Minimum Value:", data['Sepal Length'].min())

# Maximum (largest observed value)
print("Sepal Length Maximum Value:", data['Sepal Length'].max())

# Range = maximum - minimum
print("Sepal Length Range:", data['Sepal Length'].max()-data['Sepal Length'].min())

# Variance (pandas default ddof=1, i.e. the sample variance)
print("Sepal Length Variance:", data['Sepal Length'].var())

# Standard Deviation (pandas default ddof=1, i.e. the sample standard deviation)
print("Sepal Length Standard Deviation:", data['Sepal Length'].std())

# First Quartile: 25% of the samples are at or below this value
q1 = data['Sepal Length'].quantile(0.25)
print("Q1:", q1)

# Third Quartile: 75% of the samples are at or below this value
q3 = data['Sepal Length'].quantile(0.75)
print("Q3:", q3)

# Inter Quartile Range = Q3 - Q1, the spread of the middle half of the data
print("IQR:", q3 - q1)

In [None]:
# Summary table (count, mean, std, min, quartiles, max) of the numeric attributes
summary = data.describe()
print(summary)

In [None]:
# Boxplot of one attribute, with its numeric summary printed for comparison
from matplotlib import pyplot as plt

sepal_width = data['Sepal Width']
plt.boxplot(sepal_width, labels=['Sepal Width'])
plt.xlabel('(cm)')  # unit shown beneath the tick label
print(sepal_width.describe())

In [None]:
# Side-by-side boxplots of the four numeric attributes (columns 0 to 3)
measurements = data.iloc[:, :4]
plt.boxplot(measurements, labels=measurements.columns)
plt.ylabel('Dimension (cm)')

# Measuring the Shape

In [None]:
# Skewness of each numeric attribute
#   skew() > 0  -> positively skewed (longer right tail)
#   skew() ~ 0  -> approximately symmetrical
#   skew() < 0  -> negatively skewed (longer left tail)
numeric_attributes = data.iloc[:, :4]
numeric_attributes.skew()

In [None]:
# Kurtosis of each numeric attribute
# pandas reports excess kurtosis (Fisher's definition: a normal distribution scores 0)
#   kurtosis > 0  -> leptokurtic
#   kurtosis ~ 0  -> mesokurtic
#   kurtosis < 0  -> platykurtic
numeric_attributes = data.iloc[:, :4]
numeric_attributes.kurtosis()

In [None]:
# Density curve of Sepal Length with its mean, median and mode marked
import seaborn as sns

sepal_length = data['Sepal Length']
sns.displot(sepal_length, kind="kde")  # Single attribute

# One full-height vertical line per statistic (ymin=0 is the bottom, ymax=1 the top).
# The drawing order must match the legend order below.
reference_lines = (
    (sepal_length.mean(), 'r'),
    (sepal_length.median(), 'g'),
    (sepal_length.mode()[0], 'y'),  # first mode if several values tie
)
for position, line_color in reference_lines:
    plt.axvline(x=position, ymin=0, ymax=1, color=line_color)

plt.legend(labels=['Distribution', 'Mean', 'Median', 'Mode'])

In [None]:
# Density curve of Sepal Width with its mean, median and mode marked
sepal_width = data['Sepal Width']
sns.displot(sepal_width, kind="kde")  # Single attribute

# Colour of the full-height vertical line drawn for each statistic.
# The drawing order must match the legend order below.
statistic_colors = {
    'r': sepal_width.mean(),
    'g': sepal_width.median(),
    'y': sepal_width.mode()[0],  # first mode if several values tie
}
for line_color, position in statistic_colors.items():
    plt.axvline(x=position, ymin=0, ymax=1, color=line_color)

plt.legend(labels=['Distribution', 'Mean', 'Median', 'Mode'])

In [None]:
# Density curves of the four numeric attributes drawn on one set of axes
numeric_attributes = data.iloc[:, :4]
sns.displot(numeric_attributes, kind="kde")
plt.xlabel('Values (cm)')

In [None]:
import scipy.stats # To compute correlation coefficient

# Scatterplot of sepal length against sepal width, titled with Pearson's r
sepal_length = data['Sepal Length']
sepal_width = data['Sepal Width']

# pearsonr returns (correlation coefficient, p-value); only the first is shown.
sepal_correlation = scipy.stats.pearsonr(sepal_length, sepal_width)[0]

plt.plot(sepal_length, sepal_width, 'bo')  # 'bo' = blue circle markers, no line
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Correlation: %1.4f' % sepal_correlation)

In [None]:
# Scatterplot of petal length against petal width, titled with Pearson's r
petal_length = data['Petal Length']
petal_width = data['Petal Width']

# pearsonr returns (correlation coefficient, p-value); only the first is shown.
petal_correlation = scipy.stats.pearsonr(petal_length, petal_width)[0]

plt.plot(petal_length, petal_width, 'bo')  # 'bo' = blue circle markers, no line
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.title('Correlation: %1.4f' % petal_correlation)

In [None]:
# Pairwise plots of every numeric attribute, coloured by species
# (three markers are supplied, one per hue level)
species_markers = ["o", "s", "D"]
sns.pairplot(data, hue='Species', markers=species_markers)

In [None]:
# Petal Width distribution, one box per species
sns.boxplot(data=data, x='Species', y='Petal Width')

In [None]:
# Petal Length distribution, one box per species
sns.boxplot(data=data, x='Species', y='Petal Length')

In [None]:
# Sepal Width distribution, one box per species
sns.boxplot(data=data, x='Species', y='Sepal Width')