#Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as sp

#Data

In [2]:
data = pd.read_csv("./sample_data/california_housing_train.csv")
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [3]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


#Linear Algebra
Scalars, vectors, and matrices

##Matrices
* Shape
* Identity matrix

In [4]:
data.shape

(17000, 9)

In [8]:
i = np.eye(5)
print(i)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


##Vectors
* Vector addition / subtraction
* Scalar * vector multiplication
* Dot product
* Sum of squares / magnitude
* Inverse matrices

In [5]:
data["total_rooms"] - data["total_bedrooms"]

0        4329.0
1        5749.0
2         546.0
3        1164.0
4        1128.0
          ...  
16995    1823.0
16996    1821.0
16997    2146.0
16998    2120.0
16999    1520.0
Length: 17000, dtype: float64

In [7]:
data["total_rooms"] // 4

0        1403.0
1        1912.0
2         180.0
3         375.0
4         363.0
          ...  
16995     554.0
16996     587.0
16997     669.0
16998     668.0
16999     455.0
Name: total_rooms, Length: 17000, dtype: float64

In [9]:
np.dot(data["total_rooms"], data["households"])

35621301399.0

In [10]:
sum_of_squares = np.dot(data["median_house_value"], data["median_house_value"])
magnitude = np.sqrt(sum_of_squares)
print(sum_of_squares, magnitude)

959226879450818.0 30971388.077559877


In [22]:
data_sample = np.array(data[:9])
data_sample_inv = np.linalg.inv(data_sample)
print(data_sample)
print(data_sample_inv)
print(np.around(data_sample.dot(data_sample_inv), decimals=2))

[[-1.1431e+02  3.4190e+01  1.5000e+01  5.6120e+03  1.2830e+03  1.0150e+03
   4.7200e+02  1.4936e+00  6.6900e+04]
 [-1.1447e+02  3.4400e+01  1.9000e+01  7.6500e+03  1.9010e+03  1.1290e+03
   4.6300e+02  1.8200e+00  8.0100e+04]
 [-1.1456e+02  3.3690e+01  1.7000e+01  7.2000e+02  1.7400e+02  3.3300e+02
   1.1700e+02  1.6509e+00  8.5700e+04]
 [-1.1457e+02  3.3640e+01  1.4000e+01  1.5010e+03  3.3700e+02  5.1500e+02
   2.2600e+02  3.1917e+00  7.3400e+04]
 [-1.1457e+02  3.3570e+01  2.0000e+01  1.4540e+03  3.2600e+02  6.2400e+02
   2.6200e+02  1.9250e+00  6.5500e+04]
 [-1.1458e+02  3.3630e+01  2.9000e+01  1.3870e+03  2.3600e+02  6.7100e+02
   2.3900e+02  3.3438e+00  7.4000e+04]
 [-1.1458e+02  3.3610e+01  2.5000e+01  2.9070e+03  6.8000e+02  1.8410e+03
   6.3300e+02  2.6768e+00  8.2400e+04]
 [-1.1459e+02  3.4830e+01  4.1000e+01  8.1200e+02  1.6800e+02  3.7500e+02
   1.5800e+02  1.7083e+00  4.8500e+04]
 [-1.1459e+02  3.3610e+01  3.4000e+01  4.7890e+03  1.1750e+03  3.1340e+03
   1.0560e+03  2.1782e

#Statistics
* Mean, median, mode
* Dispersion
* Correlation

In [None]:
np.ptp(data["median_house_value"])

485002.0

In [None]:
np.mean(data["median_house_value"])

207300.91235294117

In [None]:
np.median(data["median_house_value"])

180400.0

In [None]:
np.std(data["median_house_value"])

115980.35304985354

In [None]:
np.var(data["median_house_value"])

13451442293.56867

In [None]:
np.correlate(data["total_rooms"], data["median_house_value"])

array([9.87958022e+12])

#Probability
* Dependent and independent probability
* Conditional probability
* Bayes' Theorem
* Randomness
* Continuous / discrete distributions
* Probability density functions
* Cumulative distrbution functions
* Central Limit Theorem and Confidence Intervals
