In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os.path as osp
%matplotlib inline

In [2]:
# Locate the input file: the 'data' directory sits two levels above the
# current working directory, and holds the phenotypes CSV.
cwd = osp.abspath('.')
two_levels_up = osp.dirname(osp.dirname(cwd))
data_dir = osp.join(two_levels_up, 'data')
data_file = osp.join(data_dir, 'phenotypes.aug.csv')

In [3]:
# Short, convenient column names (keys) for the original CSV headers (values).
new2old_names = {
    'X': 'X',
    'subj_id': 'SubjID',
    'age': 'Age_At_IMGExam',
    'site': 'ManufacturersModelName',
    'gender': 'Gender',
    'hand': 'FDH_23_Handedness_Prtcpnt',
    'l_hippo': 'MRI_cort_vol.ctx.lh.parahippocampal',
    'anxiety': 'PHX_ANX_TOTAL',
    'mystery': 'MYSTERY',
}

# DataFrame.rename expects {old: new}, so flip the mapping around.
old2new_names = dict((old, new) for (new, old) in new2old_names.items())

# Load the table and switch to the short column names in one go.
df = pd.read_csv(data_file).rename(columns=old2new_names)


In [4]:
# For each column, is at least one value missing?
has_null = df.isnull().any()
print('df.isnull().any():', has_null)

df.isnull().any(): X          False
subj_id    False
site       False
l_hippo    False
gender     False
age         True
hand        True
anxiety     True
mystery    False
dtype: bool


In [5]:
# Build 'new_variable': a noisy, sign-flipped copy of 'age', so that the two
# columns are strongly (negatively) correlated.
#
# NOTE: np.random is not seeded here, so the numbers printed by this cell
# change from one run to the next.

# The noise standard deviation is std(age) / reduce_noise_by.
reduce_noise_by = 4

to_regress_name = 'l_hippo'
regressors_names = ['age','site','gender','mystery']
# Keep only the columns of interest. The explicit .copy() makes `df` an
# independent frame; without it, the column assignment further down is done
# on a selection of the previous frame, which is the pattern pandas flags
# with SettingWithCopyWarning.
df = df[regressors_names + [to_regress_name]].copy()

# initialize a new variable from age, remembering which rows have an age
new_variable = df['age']
mask = ~np.isnan( np.asarray(new_variable) )

# make it a bit different: flip the sign and add zero-mean gaussian noise
noise_sd = new_variable.std() / reduce_noise_by
new_variable = -new_variable + np.random.normal(0, noise_sd,
                                                size=new_variable.shape)
# add it to the dataframe (rows with a missing age stay missing)
df['new_variable'] = new_variable

# some simple descriptive statistics, restricted to rows where age is known
print("std of age, std of new variable", np.std(df['age'][mask]), 
                                          np.std(new_variable[mask]))
print("mean and std of the new variable", np.mean(new_variable), np.std(new_variable))
print("we have correlated variables:", 
          np.corrcoef(np.asarray(new_variable[mask]), np.asarray(df['age'][mask]))[0,1])

std of age, std of new variable 4.93992842336 5.178580209
mean and std of the new variable -12.9107853065 5.178580209
we have correlated variables: -0.967808388824


In [6]:
# Discard every row that contains at least one missing value.
# see https://chrisalbon.com/python/data_wrangling/pandas_missing_data/

df = df.dropna(axis=0, how='any')
print("dataset size and columns", df.shape, '\n', df.keys())

dataset size and columns (997, 6) 
 Index(['age', 'site', 'gender', 'mystery', 'l_hippo', 'new_variable'], dtype='object')


In [7]:
import patsy as pat
import statsmodels.formula.api as smf

In [8]:
# Outcome variable and the regressors of the full model, which now also
# includes the 'new_variable' column.
to_regress_name = 'l_hippo'
regressors_names = ['age', 'site', 'gender', 'new_variable', 'mystery']

# Outcome first, then every regressor.
all_variables = [to_regress_name]
all_variables.extend(regressors_names)

In [9]:
# Draw a random subsample of the subjects to get a small-sample regression.
rows_len = df.shape[0]
sample_n = 50
# Sampling without replacement needs at least sample_n rows to choose from.
assert sample_n <= rows_len

# Sample 50 *distinct* subjects. np.random.randint draws with replacement
# and could pick the same row several times, which would duplicate
# observations in the regression; choice(..., replace=False) cannot.
# NOTE: np.random is not seeded, so the selected rows differ between runs.
rows = np.random.choice(rows_len, size=sample_n, replace=False)
rows.sort()  # keep the subsample in the original row order
df_reduced = df.iloc[rows]
df_reduced.shape

(50, 6)

In [10]:
# Assemble the model formula "<outcome> ~ <regressor> + <regressor> + ...",
# with one regressor per line so that the printout is easy to read.
right_hand_side = '+ \n\t'.join(regressors_names)
formula = '{0}{1}{2}'.format(to_regress_name, ' ~ \n\t', right_hand_side)
print(formula)

l_hippo ~ 
	age+ 
	site+ 
	gender+ 
	new_variable+ 
	mystery


In [11]:
#pat.dmatrices??

In [12]:
# Specify an ordinary-least-squares model from the formula, using the
# subsampled frame df_reduced as data (the model is not fitted in this cell).
mod = smf.ols(formula=formula, data=df_reduced)
# Alternative kept for reference: build the design matrices directly with patsy.
#y, X = pat.dmatrices(formula, data=df, return_type='matrix')

In [13]:
# Fit the model specified above; `res` holds the fitted results.
res = mod.fit()

In [14]:
# print(res.summary());

In [15]:
# p-value of the 'age' term in the model that also contains 'new_variable'.
res.pvalues['age']

0.3751123446060034

In [16]:
# Refit on the same subsample with a smaller model: 'new_variable' and
# 'mystery' are left out of the regressors, then the p-value of 'age' is
# displayed again.
to_regress_name = 'l_hippo'
regressors_names = ['age', 'site', 'gender']
all_variables = [to_regress_name]
all_variables.extend(regressors_names)

# Same formula layout as before: one regressor per line.
right_hand_side = '+ \n\t'.join(regressors_names)
formula = '{0}{1}{2}'.format(to_regress_name, ' ~ \n\t', right_hand_side)
print(formula)

# Restrict the subsample to the columns the smaller model uses.
df_reduced = df_reduced[all_variables]
mod = smf.ols(data=df_reduced, formula=formula)
res = mod.fit()
res.pvalues['age']

l_hippo ~ 
	age+ 
	site+ 
	gender


0.091036539845115394