![](https://puu.sh/HLHgF/324c7fedc3.png)


Most important basis :
- [x] Python
	- [x] Data manipulation using Pandas & Numpy
	- [x] Data visualization using Matplotlib
- [x] SQL
	- [x] SQL & Pandas & SQLite (Good course : https://medium.com/analytics-vidhya/programming-with-databases-in-python-using-sqlite-4cecbef51ab9)
	- platforms like Mode Analytics and Databricks to easily work with Python and SQL.
- Statistics basis 
	- Learn : Sampling, frequency distributions, Mean, Median, Mode, Measure of variability, Probability basics, significant testing, standard deviation, z-scores, confidence intervals, and hypothesis testing (including A/B testing)
	- Good Handbook (especially first four chapters) : https://www.amazon.com/Practical-Statistics-Data-Scientists-Essential/dp/9352135652
	- Learn *StatsModels* Python Library + good video course : https://www.youtube.com/watch?v=yaSgoGLXKOg
- Machine Learning using Scikit-Learn (Good video course by Andrew Ng : https://www.coursera.org/learn/machine-learning + Good Handbook and exercise : https://www.amazon.com/Hands-Machine-Learning-Scikit-Learn-TensorFlow/dp/1491962291 )

Next :
- big data technologies like Spark and Hadoop
- Google Analytics 
- Python data viz using **Seaborn**, Bokeh, Pygal

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [54]:
# Generate artificial data
size = 100  # Output shape
loc = 50    # Mean (“centre”) of the distribution
scale = 5   # Standard deviation (spread or “width”) of the distribution. Must be non-negative.


X = np.random.normal(loc, scale, size)

# Y = np.random.randint(10, size=100)
# Y = np.random.binomial(n=100, p=.5, size=100)
Y = stats.norm.ppf(np.random.random(size), loc=loc, scale=scale).astype(int)

X, Y

(array([55.75608173, 52.81238965, 55.62111732, 49.89528542, 56.05333946,
        41.22825435, 55.48902971, 48.39043528, 51.42119142, 49.98761722,
        50.37643254, 59.28903873, 48.06409926, 51.71440901, 47.47399008,
        44.98892566, 50.89196949, 54.29815175, 52.66891552, 48.14687677,
        48.85774544, 44.7906864 , 57.52047777, 48.30737622, 49.52319116,
        45.86263123, 46.38149512, 38.33135014, 48.22734687, 53.02997041,
        46.82664881, 56.74863347, 36.50340832, 57.77302102, 59.83695815,
        58.66702393, 40.17525716, 46.75294991, 45.40265986, 60.06444145,
        44.76302813, 52.85506209, 58.24064082, 54.23672568, 48.372648  ,
        50.39410706, 55.4292295 , 49.05461773, 55.17942781, 53.98496247,
        45.27885551, 55.84814236, 56.49557576, 52.77333886, 48.73193272,
        48.8257024 , 49.0600586 , 39.64540498, 58.37394186, 53.60776   ,
        44.99844929, 51.42698252, 49.73851062, 41.21045468, 43.46278467,
        47.10338127, 46.59610411, 51.55468165, 49.7

In [3]:
# Mean
print('X mean : %.3f' % X.mean())
print('Y mean : %.3f' % Y.mean())

X mean : 0.975
Y mean : 50.000


In [4]:
# Median
print('X median : %f' % np.median(X))
print('Y median : %d' % np.median(Y))

X median : 0.984901
Y median : 49


In [5]:
# Sampling
# We can use numpy random.choice() or take a value from the array each n values
np.random.choice(X, 10), np.random.choice(Y, 10)

(array([1.04442572, 1.06060007, 0.81992574, 1.25394092, 0.91576459,
        0.8708394 , 0.68022311, 0.51832   , 0.9841502 , 0.65597501]),
 array([64, 44, 55, 55, 37, 45, 45, 44, 40, 43]))

In [41]:
# frequency distributions
unique, counts = np.unique(Y, return_counts=True)
pd.DataFrame({'counts': counts}, index=unique).head()

Unnamed: 0,counts
22,1
30,2
31,1
32,1
33,4


In [24]:
# frequency distributions on continous
def continous_frequency_distributions(array, parts):
    min_val = array.min()
    max_val = array.max()
    steps = np.arange(start=min_val, stop=max_val, step=(max_val-min_val)/parts)
    ranges = []
    counts = []
    
    for i in range(parts-1):
        ranges.append(str('%.3f' % steps[i]) + ' to ' + str('%.3f' % steps[i+1]))
        counts.append(((steps[i] <= array) & (steps[i+1] > array)).sum())
        
    ranges.append(str('%.3f' % steps[i+1]) + ' and more')
    counts.append(((steps[i+1] <= array) & (max_val >= array)).sum())
    
    return pd.DataFrame({'counts': counts}, index=ranges)


continous_frequency_distributions(X, 10)

Unnamed: 0,counts
0.518 to 0.609,2
0.609 to 0.699,8
0.699 to 0.790,7
0.790 to 0.880,16
0.880 to 0.970,13
0.970 to 1.061,20
1.061 to 1.151,15
1.151 to 1.241,12
1.241 to 1.332,3
1.332 and more,4


In [37]:
# Mode
print('For discrete :')
print(stats.mode(Y))

print('\nFor continous :')
distrib = continous_frequency_distributions(X, 10)
print(distrib.loc[distrib.counts.idxmax()])

For discrete :
ModeResult(mode=array([47]), count=array([7]))

For continous :
counts    20
Name: 0.970 to 1.061, dtype: int64


In [45]:
# Measure of variability
print('X variance : %.3f' % np.var(X))
print('Y variance : %.3f' % np.var(Y))

X variance : 0.037
Y variance : 109.340


In [48]:
# standard deviation
print('X deviation : %.3f' % np.std(X))  # or print('X deviation : %.3f' % np.var(X)**.5)
print('Y deviation : %.3f' % np.std(Y))  # or print('Y deviation : %.3f' % np.var(Y)**.5)

X deviation : 0.191
Y deviation : 10.457


In [52]:
# z-scores
stats.zscore(X), stats.zscore(Y)

(array([ 0.23280414,  1.33243607,  0.49577916, -0.92418258, -1.17824276,
         1.10513   , -1.69266842, -0.30751556,  0.0461295 , -0.94610462,
        -0.99115016, -1.77078796,  1.57808688, -0.66137017,  0.63121064,
         0.9188453 , -1.83580299, -0.74877729, -0.45030274, -1.35585434,
        -0.09005113, -1.53596331,  0.95344844,  0.62995115,  0.84093276,
         0.87426684,  0.69098409, -1.50902886, -2.38856301, -0.55683417,
        -0.39340673,  1.11984537,  0.91250666,  0.82242233,  0.09994879,
         1.05416861, -0.542974  ,  0.31988675,  0.74630856,  0.37828739,
        -0.09417247,  0.3923616 , -0.31129244,  0.30131909,  2.33593489,
         1.8784122 , -1.66909992,  0.0081877 , -0.49785344,  0.4303887 ,
         0.65002867,  1.17968167, -0.0197411 , -0.05186291, -0.81220052,
        -1.12518691,  1.67598644, -0.98618553,  0.81623257, -0.54609691,
         0.37065767, -0.49465347, -1.2761802 , -0.96392332, -1.54236561,
         1.94918211,  1.95749146,  1.45620866,  1.1

In [57]:
# confidence intervals

#create 95% confidence interval for population mean weight
print(stats.t.interval(alpha=0.95, df=size, loc=loc, scale=scale))

#create 99% confidence interval for same sample
print(stats.t.interval(alpha=0.99, df=size, loc=loc, scale=scale))

(40.080142407751836, 59.919857592248164)
(36.870547397068336, 63.129452602931664)


In [None]:
# Probability basics

# significant testing

# hypothesis testing (including A/B testing)