In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn import linear_model
from sklearn import decomposition

The file explain is a rtf file explaining which variables are numeric and which are categorical. There are 1765 variables, the first being ID and it's part in the explain file does not contain the target string 'the SPSS measurement level is'. The others' part contains either NOMINAL, ORDINAL or SCALE. Only SCALE is numeric.

column 'n503930' is epilepsy status

In [2]:
# file path of explain file
explain_path = '/Users/hung-yiwu/Documents/AC209a_project/data/ncds_sweep5/mrdoc/allissue/ncds5cmi_ukda_data_dictionary.rtf'

# open explain file
explain = open(explain_path, 'r')

# read the whole file as a single giant string
explain_text = explain.read()

# target string
target_string = 'the SPSS measurement level is'

# locate target string in the whole file
target_loc = [m.start() for m in re.finditer(target_string, explain_text)]

In [35]:
target_string_2 = 'Variable label ='
target_loc_2 = [m.end() for m in re.finditer(target_string_2, explain_text)]

var_name_list = []

for loc in target_loc_2:
    start_point = loc+8
    end_point = explain_text.find('\par', loc)
    var_name_list.append(explain_text[start_point:end_point])

print var_name_list[0:10]

['ncdsid serial number', 'PERSON NUMBER', 'Sex of Cohort Member', 'Standard region at NCDS5', 'NCDS5 Government Office Region', 'Responded to Cohort Member Interview', 'Responded to What Do You Think Questionnaire', 'Responded to Your Life Since 1974 Questionnaire', 'Time of Interview (mins)', 'CMI:2,A1a) Current main economic activity']


In [3]:
# file path of data file
data_path = './ncds_sweep5/tab/ncds5cmi.tab'

# read data file into Pandas DataFrame
# delimiter is tab
# use column 'ncdsid' as index
data = pd.read_csv(data_path, delimiter='\t').set_index('ncdsid')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# identify categorical variables by target strings in explain file
cat_col = []

for index, loc in enumerate(target_loc):
    # get feature character
    char = explain_text[loc+len(target_string)+10]
    
    if data.columns[index] == 'n503930':
        # response variable
        continue
    elif char == 'O' or char == 'o' or char == 'N' or char == 'n'\
        or 'region' in data.columns[index]:
        # it is SPSS data type ORDINAL or NOMINAL
        # it is categorical data
        cat_col.append(index)

print len(cat_col), 'categorical predictor variables'
print len(data.columns)-1-len(cat_col), 'numerical predictor variables'
print len(data.columns)-1, 'total predictor variables'

# calculate number of variables after dummy expansion
col_len = np.zeros(data.shape[1])

for index, col in enumerate( data.columns ):
    if index in cat_col:
        # categorical variable
        # require expansion
        col_len[index] = len( data[col].unique() )
    else:
        # numerical variable
        # does not require expansion
        col_len[index] = 1

print int( sum(col_len) )-1, 'predictor variables after dummy expansion'
print data.shape[0], 'observations'

2745 categorical predictor variables
442 numerical predictor variables
3187 total predictor variables
79478 predictor variables after dummy expansion
11469 observations


In [7]:
new_columns = []
sub_df_list = []

for index, col in enumerate(data.columns):
    if index in cat_col:
        # categorical variable
        # dummify
        dummies = pd.get_dummies(data[col])
        # append value
        sub_df_list.append( dummies )
        # append column name
        new_columns.extend([str(col)+'='+str(value) for value in data[col].unique()])
    else:
        # numeric variable
        # append value
        try:
            sub_df_list.append( data[col].apply(lambda x: 0 if x == ' ' else float(x)) )
        except ValueError:
            print 'value error'
        # append column name
        new_columns.append(col)
        
data_expanded = pd.concat( sub_df_list, axis=1 )
data_expanded.columns = new_columns

value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error
value error


ValueError: Length mismatch: Expected axis has 79462 elements, new values have 79479 elements

Pos. = 772	Variable = n5765	Variable label = 4I Fits,convulsions-ever called epileptic

This variable is  numeric, the SPSS measurement level is ordinal.

	Value label information for n5765
	Value = 1	Label = YES
	Value = 2	Label = NO
	Value = 8	Label = DONT   KNOW

In [None]:
# separate predictor and response variables
y = data_expanded['n503930'].values
x = data_expanded.drop('n503930', axis=1).values

In [None]:
# binary question: epilepsy or not
y_t = np.array( [1 if value == 1 else 0 for value in y] )

In [None]:
# reduce dimension by pca
# already did a full decomposition and decided to use first 4 principal components
# decomposition takes time so did not do it twice to show everything
pca_model = decomposition.PCA( n_components = 4)
x_t = pca_model.fit_transform(x)

In [None]:
# visualize principal components contribution
plt.plot(np.cumsum(pca_model.explained_variance_ratio_))
plt.xlabel('number of principal components used')
plt.ylabel('cumulative explained variance ratio')
plt.title('first 4 principal components should suffice')
plt.show()

In [None]:
# classify seizure status by multi-class logistic regression with CV
model = linear_model.LogisticRegressionCV()
model.fit(x_t, y_t)
rsq = model.score(x_t, y_t)

In [None]:
coef = model.coef_
pc = pca_model.components_
importance = np.sum( coef.reshape(4,-1)*pc, axis = 0 )
plt.plot( importance )
plt.xlabel('predictor variables')
plt.ylabel('coefficient')
plt.title('some predictors are more important than others')
plt.show()

In [None]:
# get variable list
var_list = data_expanded.drop('n503930', axis=1).columns

# get top three positive coef
top_pos_ind = np.argsort(importance)[-5:][:-1]
top_pos = list( var_list[top_pos_ind] )
print 'top positive variables'
print top_pos
print ''

# get top three negative coef
top_neg_ind = np.argsort(importance)[0:2]
top_neg = list( var_list[top_neg_ind] )
print 'top negative variables'
print top_neg