In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Data source (UCI Machine Learning Repository, housing data set):
# https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
# The file is read without a header row, so the column names are supplied
# explicitly. Fields are split on runs of one or more spaces; a regular
# expression separator like this requires the Python parsing engine.
df = pd.read_csv(
    r"housing.data",
    sep=' +',
    engine='python',
    header=None,
    names=[
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
    ],
)
df

In [None]:
# Preview the first ten rows of the data set.
df.iloc[:10]

In [None]:
# Preview the last ten rows of the data set.
df.iloc[-10:]

In [None]:
# Box plot of MEDV grouped by CHAS.
# How to read a box plot:
#   * The box spans the first quartile (Q1, bottom edge) to the third
#     quartile (Q3, top edge), i.e. the middle 50 % of the observations.
#   * The horizontal line inside the box is the median (the middle value of
#     the sample, not the mean): the 25th-50th percentiles lie below it and
#     the 50th-75th percentiles above it.
#   * The whiskers extend to the most extreme observations that still lie
#     within 1.5 x IQR of the box (seaborn's default), so they coincide with
#     the minimum / maximum only when there are no outliers.
#   * Points drawn beyond the whiskers are the outliers: values that stick
#     out from the rest of the sample.
# References:
#   https://mfiles.pl/pl/index.php/Wykres_pude%C5%82kowy
#   https://de.wikipedia.org/wiki/Box-Plot
sns.boxplot(data=df, x='CHAS', y='MEDV')

In [None]:
# Bar plot of MEDV per CHAS group; compare it with the box plot of the same
# two columns in the cell above.
# The height of each bar is the mean of MEDV within the group (seaborn's
# default estimator). The thin vertical line on top of each bar is an error
# bar: by default it is a 95 % bootstrap confidence interval for that mean,
# not the standard deviation.
# Bootstrapping resamples at random, so the seed is fixed to make the error
# bars identical on every run of the notebook.
sns.barplot(x=df['CHAS'], y=df['MEDV'], seed=0)

In [None]:
# Axis labels for the correlation matrix, in the same order as the columns
# that were assigned to df when it was loaded.
columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Correlation coefficient between every pair of columns of df.
# rowvar=False tells NumPy that each column (rather than each row) of the
# array holds one variable, so no explicit transpose is needed.
corr_matrix = np.corrcoef(df.values, rowvar=False)

In [None]:
# Heat map of the correlation matrix, annotated with two-decimal values.
# https://corporatefinanceinstitute.com/resources/excel/correlation-matrix/#:~:text=A%20correlation%20matrix%20is%20simply,patterns%20in%20the%20given%20data.
# A correlation matrix is a square, symmetric matrix whose entries are the
# correlation coefficients between pairs of attributes; every entry lies in
# the closed interval [-1, 1] and tells how strongly two attributes are
# linearly related.
#   * Positive correlation: both values tend to rise together.
#   * Negative correlation: one value tends to rise while the other falls.
#   *  1.00 -> maximal positive correlation between two attributes
#   * -1.00 -> maximal negative correlation between two attributes
#   *  0.00 -> no linear correlation between the two attributes
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    square=True,
    xticklabels=columns,
    yticklabels=columns,
    ax=ax,  # draw explicitly on the axes created above
)

In [None]:
# Pair plot of the selected columns.
# https://www.geeksforgeeks.org/python-seaborn-pairplot-method/
# It is laid out like the correlation matrix (one panel per pair of
# columns), but instead of a single number each off-diagonal panel is a
# scatter plot of one column against another, which makes the shape of the
# relationship visible. The diagonal panels show the distribution of each
# individual column.
sns.pairplot(data=df[columns])

In [None]:
# Pair plot restricted to a hand-picked subset of the columns.
# NOTE: this rebinds `columns`; any earlier cell that uses `columns` will see
# this shorter list if it is re-run after this cell.
columns = ['LSTAT', 'RM', 'PTRATIO', 'INDUS', 'TAX', 'NOX', 'MEDV']
# sns.pairplot is a figure-level function: it always creates its own figure
# and cannot draw into axes made beforehand. Calling plt.subplots(...) first
# therefore only produces an additional, empty figure in the output, so the
# size is set through `height` (inches per panel) instead.
# 12 / len(columns) yields a grid of roughly 12 x 12 inches in total.
sns.pairplot(df[columns], height=12 / len(columns))

In [None]:
# Progress marker: printed once execution reaches this cell, i.e. after the
# pair-plot cells above have run.
print("pairplot finished")