Maximum likelihood estimation

In [1]:
from os import listdir
import numpy as np
import pandas as pd
from scipy.stats import norm

In [2]:
pathR='data/ChunkedData_R/'
pathN='data/ChunkedData_NR/'
filesR = listdir(pathR)
filesN = listdir(pathN)
filesR[0]

'Daily_2060_S3.csv'

In [3]:
# Read one CSV file. Drop the date column.
def file_mean (filepath):
    mydata = pd.read_csv(filepath)
    # Drop the date column.
    mydata = mydata.drop('Date',axis=1) 
    # Transpose column of mean values into a row.
    mymean = mydata.mean(axis=0).to_frame().T
    return mymean

In [4]:
# Read directory of CSV files. Retain only one row per file = column averages.
def mean_per_file (directory):
    files = listdir(directory)
    means = pd.DataFrame()
    for fp in files:
        dfp = directory+fp
        m = file_mean(dfp)
        # Let Pandas number the rows sequentially.
        means = means.append(m,ignore_index=True)
    return means

In [5]:
meansR = mean_per_file(pathR)
meansN = mean_per_file(pathN)
meansAll = pd.concat((meansR,meansN),ignore_index=True)

In [6]:
def make_question_features(X):
    c = X.columns[:13] # Excel columns B-N
    z = X[c]
    return np.asarray(z)
def make_physiol_features(X):
    c = X.columns[249:] # Excel columns IQ-IZ
    z = X[c]
    return np.asarray(z)
X_questionFeatures_R = make_question_features(meansR)
X_questionFeatures_N = make_question_features(meansN)
X_physiolFeatures_R = make_physiol_features(meansR)
X_physiolFeatures_N = make_physiol_features(meansN)

In [7]:
# Mean vector should have 13 features
np.mean(X_questionFeatures_R,axis=0)

array([2.77162962, 2.70476552, 3.68423888, 3.42797521, 2.39332778,
       4.36899756, 3.6862858 , 3.51347216, 2.73680122, 2.42942592,
       3.47397681, 2.99122293, 2.72413645])

In [8]:
# Confirm the mean for the first feature
s = 0
num = len(X_questionFeatures_R)
for vec in X_questionFeatures_R:
    v = vec[0]
    s = s + v
print ("mean=",s/num)

mean= 2.771629620151788


In [9]:
# Get four mean vectors
mean_Q_R = np.mean(X_questionFeatures_R,axis=0)
mean_Q_N = np.mean(X_questionFeatures_N,axis=0)
mean_P_R = np.mean(X_physiolFeatures_R,axis=0)
mean_P_N = np.mean(X_physiolFeatures_N,axis=0)
print("Features=BN, Class=R")
print(mean_Q_R)
print("Features=BN, Class=N")
print(mean_Q_N)
print("Features=IQIZ, Class=R")
print(mean_P_R)
print("Features=IQIZ, Class=N")
print(mean_P_N)

Features=BN, Class=R
[2.77162962 2.70476552 3.68423888 3.42797521 2.39332778 4.36899756
 3.6862858  3.51347216 2.73680122 2.42942592 3.47397681 2.99122293
 2.72413645]
Features=BN, Class=N
[2.98212268 2.88884591 4.23550324 3.53718177 2.1989232  4.30800458
 4.27954876 4.20733162 3.08629637 2.49876146 4.21626721 3.31846817
 3.56183687]
Features=IQIZ, Class=R
[ 79.32724741 168.02242085  12.35867154   0.78748283   0.6629929
  30.62998668 449.66925538  20.4576429    0.91754963   0.61243098]
Features=IQIZ, Class=N
[ 76.61158267 159.97559128  11.96003637   0.68312697   0.56079015
  30.68924129 401.67052684  18.86156023   0.78119817   1.11272882]


In [10]:
# Get four covariance matrices
# Get four mean vectors
cov_Q_R = np.cov(X_questionFeatures_R,rowvar=False)
cov_Q_N = np.cov(X_questionFeatures_N,rowvar=False)
cov_P_R = np.cov(X_physiolFeatures_R,rowvar=False)
cov_P_N = np.cov(X_physiolFeatures_N,rowvar=False)
print("Features=BN, Class=R")
print(cov_Q_R)
print("Features=BN, Class=N")
print(cov_Q_N)
print("Features=IQIZ, Class=R")
print(cov_P_R)
print("Features=IQIZ, Class=N")
print(cov_P_N)

Features=BN, Class=R
[[ 1.79355001 -0.98040806 -1.26485279  0.13415504  0.83840107 -0.21111466
  -1.12799672 -1.32056846 -0.79506226 -0.1642217  -1.52590787  0.23307538
   0.86425508]
 [-0.98040806  1.24438612  1.12579471 -0.4226461  -0.66674278  0.06464912
   1.0041908   1.15076669  0.10971841 -0.4987404   0.44255833 -0.43092002
  -0.21135111]
 [-1.26485279  1.12579471  1.4550986  -0.36456781 -0.9125852   0.19354764
   1.42248298  1.51454896  0.7523408   0.15892811  1.17408764 -0.54382239
  -0.62314824]
 [ 0.13415504 -0.4226461  -0.36456781  1.23959089  0.4785619   0.44330704
  -0.2108463  -0.23043295  0.20986097  0.22338362  0.70024025  1.20345417
   0.46990031]
 [ 0.83840107 -0.66674278 -0.9125852   0.4785619   0.79873281  0.10859196
  -0.85721632 -0.90691378 -0.5850454  -0.28303296 -0.77855001  0.65667014
   0.7353295 ]
 [-0.21111466  0.06464912  0.19354764  0.44330704  0.10859196  0.49307767
   0.30349457  0.30135852  0.21972785  0.0815126   0.50192056  0.52816827
   0.34035503]
 

In [11]:
print(mean_Q_R.shape)
print(cov_Q_R.shape)
print(X_questionFeatures_R.shape)
print(X_questionFeatures_R.shape[0])
norm.pdf(X_questionFeatures_R[0],mean_Q_R,cov_Q_R)
D_Q_R = norm(mean_Q_R,cov_Q_R)
print(D_Q_R.pdf(X_questionFeatures_R.shape[0]))

(13,)
(13, 13)
(14, 13)
14
[[6.86411527e-010             nan             nan 0.00000000e+000
  1.15054789e-042             nan             nan             nan
              nan             nan             nan 0.00000000e+000
  5.02373099e-038]
 [            nan 4.12059224e-019 2.07623603e-019             nan
              nan 0.00000000e+000 4.93114649e-024 3.22071810e-019
  0.00000000e+000             nan 1.30118695e-123             nan
              nan]
 [            nan 4.90508124e-023 3.34439353e-012             nan
              nan 0.00000000e+000 1.07766505e-012 1.02474945e-011
  1.13743118e-049 0.00000000e+000 1.19600684e-018             nan
              nan]
 [0.00000000e+000             nan             nan 5.16225813e-017
  1.55164705e-128 2.90175308e-103             nan             nan
  0.00000000e+000 0.00000000e+000 4.88359741e-050 2.23722405e-019
  7.77625679e-126]
 [5.36476402e-040             nan             nan 8.87498895e-107
  7.00840893e-047 0.00000000e+000      

In [21]:
def pdf_multivariate_gauss(x, mu, cov):
    part1 = 1 / ( ((2* np.pi)**(len(mu)/2)) * (np.linalg.det(cov)**(1/2)) )
    part2 = (-1/2) * ((x-mu).T.dot(np.linalg.inv(cov))).dot((x-mu))
    return float(part1 * np.exp(part2))
print(pdf_multivariate_gauss(X_questionFeatures_R.shape[0],mean_Q_R,cov_Q_R))

nan


  part1 = 1 / ( ((2* np.pi)**(len(mu)/2)) * (np.linalg.det(cov)**(1/2)) )
  return float(part1 * np.exp(part2))
