### Prudhviraj Tirumanisetti
EE 258
ID:011489881

In [None]:
# Preliminaries

# Show plots in the notebook
%matplotlib inline

# To start we import some prerequisites
from sklearn import datasets, preprocessing, feature_selection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib2

# Let's load the data
# The URL is handed straight to pandas, which opens and closes the connection
# itself; the previous urllib2.urlopen() handle was never closed.
heart_data = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
heart = pd.read_csv(
    heart_data,
    quotechar='"',
    skipinitialspace=True,
    # Column labels are supplied here instead of being read from the file
    names=['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBloodSugar', 'RestingECG', 'MaxHeartRate', 'ExerciseInducedAngina', 'STExerciseDepression', 'STExercisePeakSlope', 'FlouroscopyVessels', 'Thalassemia', 'HeartDisease'],
    # "?" entries are parsed as NaN
    na_values="?",
)

# To make things a bit more difficult, let's break the dataset even more
# We'll replace a nominal attribute with an unknown value (every 10th row gets Sex = 2)
heart.loc[::10, 'Sex']=2
# And put some inconsistent values for blood pressure (every 7th row gets -100)
heart.loc[::7, 'RestingBP']=-100

In [None]:
# Report the (rows, columns) dimensions of the loaded frame
print(heart.shape)

In [None]:
# Count the missing (NaN) entries in every column
missing_per_column = heart.isnull().sum()
print(missing_per_column)

In [None]:
# Count the rows whose Sex code lies outside the 0..1 range
sex_out_of_range = (heart['Sex'] > 1) | (heart['Sex'] < 0)
print(len(heart[sex_out_of_range]))


In [None]:
# Flag every Sex code outside the 0..1 range as missing
invalid_sex_rows = (heart['Sex'] > 1) | (heart['Sex'] < 0)
heart.loc[invalid_sex_rows, 'Sex'] = np.nan

# As a sanity check, let's make sure those values were updated
# (the condition is re-evaluated on the modified column)
remaining_invalid = heart[(heart['Sex'] > 1) | (heart['Sex'] < 0)]
print(len(remaining_invalid))

In [None]:
# Now let's remove these and any other missing values from the dataset using dropna() function
# An explicit copy makes heart_cleaned an independent frame, so the .loc
# assignments performed on it later do not raise SettingWithCopyWarning.
heart_cleaned = heart.dropna().copy()
print(heart_cleaned.shape)

In [None]:
# Question 1
# Part 1 Replace negative RestingBP values with NaN
# The mask is built once and reused for both the count and the replacement
negative_bp_rows = heart_cleaned['RestingBP'] < 0
print(len(heart_cleaned[negative_bp_rows]))
heart_cleaned.loc[negative_bp_rows, 'RestingBP'] = np.nan
# Re-evaluate the condition on the updated column to confirm the replacement
print(len(heart_cleaned[heart_cleaned['RestingBP'] < 0]))

# Part 2 a) Replace the missing values with the mean
# print heart_cleaned.mean() # 131.34
# print heart_cleaned.describe()
# heart_cleaned.fillna(heart_cleaned.mean(), inplace=True)
# print heart_cleaned.describe()
# #              Age         Sex  ChestPainType   RestingBP  Cholesterol  \
# count  266.000000  266.000000     266.000000  266.000000   266.000000   
# mean    54.571429    0.680451       3.169173  131.340611   248.304511   
# std      9.033093    0.467181       0.954408   16.064426    52.845318   
# min     29.000000    0.000000       1.000000   94.000000   126.000000   
# 25%     48.000000    0.000000       3.000000  120.000000   212.000000   
# 50%     56.000000    1.000000       3.000000  130.670306   243.000000   
# 75%     61.000000    1.000000       4.000000  140.000000   277.750000   
# max     77.000000    1.000000       4.000000  192.000000   564.000000   

#        FastingBloodSugar  RestingECG  MaxHeartRate  ExerciseInducedAngina  \
# count         266.000000  266.000000    266.000000             266.000000   
# mean            0.146617    1.000000    149.458647               0.319549   
# std             0.354390    0.994324     23.517159               0.467181   
# min             0.000000    0.000000     71.000000               0.000000   
# 25%             0.000000    0.000000    132.000000               0.000000   
# 50%             0.000000    1.000000    154.000000               0.000000   
# 75%             0.000000    2.000000    166.000000               1.000000   
# max             1.000000    2.000000    202.000000               1.000000   

#        STExerciseDepression  STExercisePeakSlope  FlouroscopyVessels  \
# count            266.000000           266.000000          266.000000   
# mean               1.075188             1.601504            0.706767   
# std                1.187126             0.625735            0.953895   
# min                0.000000             1.000000            0.000000   
# 25%                0.000000             1.000000            0.000000   
# 50%                0.800000             2.000000            0.000000   
# 75%                1.750000             2.000000            1.000000   
# max                6.200000             3.000000            3.000000   

#        Thalassemia  HeartDisease  
# count   266.000000    266.000000  
# mean      4.703008      0.969925  
# std       1.938233      1.234350  
# min       3.000000      0.000000  
# 25%       3.000000      0.000000  
# 50%       3.000000      0.000000  
# 75%       7.000000      2.000000  
# max       7.000000      4.000000  
# The effect is: 
# mean - stayed the same at 131.34
# median - increased from 130 -> 130.670306
# std - decreased from 17.318917 -> 16.064426



# Part 2 b) Replace the missing values with the median
# # print heart_cleaned.median() # 130
# print heart_cleaned.describe()
# heart_cleaned.fillna(heart_cleaned.median(), inplace=True)
# print heart_cleaned.describe()
#              Age         Sex  ChestPainType   RestingBP  Cholesterol  \
# count  266.000000  266.000000     266.000000  266.000000   266.000000   
# mean    54.571429    0.680451       3.169173  131.154135   248.304511   
# std      9.033093    0.467181       0.954408   16.071148    52.845318   
# min     29.000000    0.000000       1.000000   94.000000   126.000000   
# 25%     48.000000    0.000000       3.000000  120.000000   212.000000   
# 50%     56.000000    1.000000       3.000000  130.000000   243.000000   
# 75%     61.000000    1.000000       4.000000  140.000000   277.750000   
# max     77.000000    1.000000       4.000000  192.000000   564.000000   

#        FastingBloodSugar  RestingECG  MaxHeartRate  ExerciseInducedAngina  \
# count         266.000000  266.000000    266.000000             266.000000   
# mean            0.146617    1.000000    149.458647               0.319549   
# std             0.354390    0.994324     23.517159               0.467181   
# min             0.000000    0.000000     71.000000               0.000000   
# 25%             0.000000    0.000000    132.000000               0.000000   
# 50%             0.000000    1.000000    154.000000               0.000000   
# 75%             0.000000    2.000000    166.000000               1.000000   
# max             1.000000    2.000000    202.000000               1.000000   

#        STExerciseDepression  STExercisePeakSlope  FlouroscopyVessels  \
# count            266.000000           266.000000          266.000000   
# mean               1.075188             1.601504            0.706767   
# std                1.187126             0.625735            0.953895   
# min                0.000000             1.000000            0.000000   
# 25%                0.000000             1.000000            0.000000   
# 50%                0.800000             2.000000            0.000000   
# 75%                1.750000             2.000000            1.000000   
# max                6.200000             3.000000            3.000000   

#        Thalassemia  HeartDisease  
# count   266.000000    266.000000  
# mean      4.703008      0.969925  
# std       1.938233      1.234350  
# min       3.000000      0.000000  
# 25%       3.000000      0.000000  
# 50%       3.000000      0.000000  
# 75%       7.000000      2.000000  
# max       7.000000      4.000000  
# The effect is: 
# mean - decreased from 131.340611 -> 131.154135
# median - stayed the same at 130 
# std - decreased from 17.318917 -> 16.071148



# Part 3
# a) Replace the missing values with means computed for each sex
# Split the cleaned frame by Sex code: 0 goes to the "female" frame, 1 to the "male" frame
sex_is_zero = heart_cleaned['Sex'] == 0
sex_is_one = heart_cleaned['Sex'] == 1
heart_cleaned_female = heart_cleaned[sex_is_zero]
heart_cleaned_male = heart_cleaned[sex_is_one]

# # print heart_cleaned_female.mean() # 132.2
# # print heart_cleaned_male.mean() # 130.96 ~ 131
# print heart_cleaned.describe()
# heart_cleaned.loc[(heart_cleaned['Sex'] == 0) & (heart_cleaned['RestingBP'].isnull()), 'RestingBP'] = 132.2
# heart_cleaned.loc[(heart_cleaned['Sex'] == 1) & (heart_cleaned['RestingBP'].isnull()), 'RestingBP'] = 131
# print heart_cleaned.describe()
# Descriptive stats: 
#          RestingBP  
# count  266.000000     
# mean   131.360902     
# std    16.066014      
# min    94.000000      
# 25%    120.000000      
# 50%    130.500000    
# 75%    140.000000    
# max    192.000000 

# # The effect is: 
# # mean - increased 131.34 -> 131.36
# # median - increased from 130 -> 130.5
# # std - decreased from 17.32 -> 16.1


# b) Replace the missing values with medians computed for each sex
# # print heart_cleaned_female.median() # 130
# # print heart_cleaned_male.median() # 130
# print heart_cleaned.describe()
# The per-sex medians are computed from the data instead of being typed in by
# hand, so the imputation stays correct if the input data changes.
# transform('median') yields, for every row, the median RestingBP of that row's
# Sex group; missing readings are ignored when the median is taken.
resting_bp_missing = heart_cleaned['RestingBP'].isnull()
resting_bp_median_by_sex = heart_cleaned.groupby('Sex')['RestingBP'].transform('median')
# Only the missing readings are overwritten; observed values are left untouched
heart_cleaned.loc[resting_bp_missing, 'RestingBP'] = resting_bp_median_by_sex[resting_bp_missing]
print(heart_cleaned.describe())
# Descriptive stats:
#         RestingBP
# count   266.000000      
# mean    131.154135      
# std     16.071148   
# min     94.000000 
# 25%     120.000000   
# 50%     130.000000   
# 75%     140.000000
# max     192.000000

# # The effect is: 
# # mean - decreased from 131.34 -> 131.15
# # median - stayed the same 130 -> 130 
# # std - decreased from 17.32 -> 16.07



In [None]:
# Question 1 code
#     part 1 Replace negative RestingBP values with NaN
#         print len(heart_cleaned[heart_cleaned['RestingBP'] < 0])
#         heart_cleaned.loc[heart_cleaned['RestingBP'] < 0, 'RestingBP'] = np.nan
#         print len(heart_cleaned[heart_cleaned['RestingBP'] < 0])

    # part 2 
#         a) Replace the missing values with the mean
#             print heart_cleaned.describe()
#             heart_cleaned.fillna(heart_cleaned.mean(), inplace=True)
#             print heart_cleaned.describe()

#         b) Replace the missing values with the median  
#             print heart_cleaned.median() # 130
#             print heart_cleaned.describe()
#             heart_cleaned.fillna(heart_cleaned.median(), inplace=True)
#             print heart_cleaned.describe()

    # part 3 
            # a) Replace the missing values with means computed for each sex
    
#             heart_cleaned_female = heart_cleaned[heart_cleaned.Sex == 0]
#             heart_cleaned_male = heart_cleaned[heart_cleaned.Sex == 1]
            # # print heart_cleaned_female.mean() # 132.2
            # # print heart_cleaned_male.mean() # 130.96 ~ 131
            # print heart_cleaned.describe()
            # heart_cleaned.loc[(heart_cleaned['Sex'] == 0) & (heart_cleaned['RestingBP'].isnull()), 'RestingBP'] = 132.2
            # heart_cleaned.loc[(heart_cleaned['Sex'] == 1) & (heart_cleaned['RestingBP'].isnull()), 'RestingBP'] = 131
            # print heart_cleaned.describe()
            
            # b) Replace the missing values with medians computed for each sex 
    

In [None]:
# Let's re-load a clean dataset, so our results don't depend on your answer to Q1
# The URL is handed straight to pandas, which opens and closes the connection
# itself; the previous urllib2.urlopen() handle was never closed.
heart_data = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
heart = pd.read_csv(
    heart_data,
    quotechar='"',
    skipinitialspace=True,
    # Column labels are supplied here instead of being read from the file
    names=['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBloodSugar', 'RestingECG', 'MaxHeartRate', 'ExerciseInducedAngina', 'STExerciseDepression', 'STExercisePeakSlope', 'FlouroscopyVessels', 'Thalassemia', 'HeartDisease'],
    # "?" entries are parsed as NaN
    na_values="?",
)
# Discard every row that still contains a missing value
heart=heart.dropna()

# Now let's look at a snippet of the original data
heart_ratio = heart.loc[:,['Age','RestingBP','Cholesterol','MaxHeartRate','STExercisePeakSlope','FlouroscopyVessels']]
print(heart_ratio.head())

# The MinMaxScaler scales each value by subtracting the minimum and then dividing by the range
# or scaled_value = (value - min) / (max-min)
scaler = preprocessing.MinMaxScaler()

# Scale the heart data
heart_ratio_scaled_values = scaler.fit_transform(heart_ratio.values)

# Put the results back into a dataframe
# The original row index is carried over: dropna() left gaps in it, and a fresh
# 0..n-1 index would no longer line up with the rows of `heart`.
heart_ratio_scaled = pd.DataFrame(
    heart_ratio_scaled_values,
    columns=heart_ratio.columns,
    index=heart_ratio.index,
)
print(heart_ratio_scaled.head())

# heart_ratio_scaled = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(heart_ratio.values), columns = heart_ratio.columns)


In [None]:
# Remove the label from the features
# Every column except the 'HeartDisease' label, in the original order
feature_columns = [
    'Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol',
    'FastingBloodSugar', 'RestingECG', 'MaxHeartRate', 'ExerciseInducedAngina',
    'STExerciseDepression', 'STExercisePeakSlope', 'FlouroscopyVessels',
    'Thalassemia',
]
heart_unlabeled = heart[feature_columns]

# Scale the values, then generate polynomial features
# Step 1: min-max scale the raw feature matrix
unlabeled_scaled_values = preprocessing.MinMaxScaler().fit_transform(heart_unlabeled.values)
# Step 2: expand the scaled matrix with PolynomialFeatures (default settings)
unlabeled_expanded_values = preprocessing.PolynomialFeatures().fit_transform(unlabeled_scaled_values)
heart_polynomial = pd.DataFrame(unlabeled_expanded_values)
heart_polynomial_values = heart_polynomial.values
# Compare the expanded shape with the original feature shape
print(heart_polynomial_values.shape)
print(heart_unlabeled.shape)


In [None]:
# Question 2
# 1) Apply the StandardScaler
better_scaler = preprocessing.StandardScaler()
heart_ratio_better_scaler_values = better_scaler.fit_transform(heart_ratio.values)
# Rebuild a labelled frame from the scaled array. The original row index is
# carried over so the scaled rows stay aligned with heart_ratio instead of
# being renumbered from 0.
heart_ratio_better_scaled = pd.DataFrame(
    heart_ratio_better_scaler_values,
    columns=heart_ratio.columns,
    index=heart_ratio.index,
)
print(heart_ratio_better_scaled.describe())
#                 Age     RestingBP   Cholesterol  MaxHeartRate  \
# count  2.970000e+02  2.970000e+02  2.970000e+02  2.970000e+02   
# mean  -1.237319e-16  4.810966e-16 -1.911116e-16  5.143660e-16   
# std    1.001688e+00  1.001688e+00  1.001688e+00  1.001688e+00   
# min   -2.827176e+00 -2.125634e+00 -2.337704e+00 -3.431849e+00   
# 25%   -7.241238e-01 -6.594306e-01 -7.002541e-01 -7.247694e-01   
# 50%    1.613719e-01 -9.550637e-02 -8.380217e-02  1.484822e-01   
# 75%    7.148067e-01  4.684179e-01  5.519138e-01  7.160957e-01   
# max    2.485798e+00  3.851964e+00  6.099981e+00  2.287949e+00   

#        STExercisePeakSlope  FlouroscopyVessels  
# count         2.970000e+02        2.970000e+02  
# mean         -1.278439e-16        6.653862e-17  
# std           1.001688e+00        1.001688e+00  
# min          -9.765832e-01       -7.219761e-01  
# 25%          -9.765832e-01       -7.219761e-01  
# 50%           6.437811e-01       -7.219761e-01  
# 75%           6.437811e-01        3.448244e-01  
# max           2.264145e+00        2.478425e+00  

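# Mean and Std for each column
# (the means are zero up to floating-point error; std shows 1.001688 rather than exactly 1
#  because describe() reports the sample std (n-1 denominator) while StandardScaler
#  normalises with the population std: sqrt(297/296) = 1.001688)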
#                Age     RestingBP   Cholesterol  MaxHeartRate   STExercisePeakSlope  FlouroscopyVessels
# mean  -1.237319e-16  4.810966e-16 -1.911116e-16  5.143660e-16   -1.278439e-16       6.653862e-17  
# std    1.001688e+00  1.001688e+00  1.001688e+00  1.001688e+00   1.001688e+00        1.001688e+00  

# 2) How many polynomial features are generated?
# There are 105 polynomial features created.
# The number follows from the formula
#    f(n) = 1 + 2n + nChoose2
# where n is the number of input features (here n = 13, so 1 + 26 + 78 = 105).
# The 1 is the constant column that is included by default.
# The 2n term counts each feature twice: once to the first power and once squared,
# e.g. for two features a and b the output is [1, a, b, a^2, ab, b^2].
# The last term counts all pairwise products of distinct features (ab in the example).

In [None]:
# Question 2 code
    # Part 1
#     better_scaler = preprocessing.StandardScaler()
#     heart_ratio_better_scaler_values = better_scaler.fit_transform(heart_ratio.values)
#     heart_ratio_better_scaled = pd.DataFrame(heart_ratio_better_scaler_values, columns = heart_ratio.columns)
#     print heart_ratio_better_scaled.describe()

In [None]:
# Test code to see the patterns for PolynomialFeature generation

# nums = [0,1,2,3,4]
# df2 = pd.DataFrame( 1, index=nums, columns=list('ABC'))
# print df2
# df_polynomial = pd.DataFrame(preprocessing.PolynomialFeatures().fit_transform(preprocessing.MinMaxScaler().fit_transform(df2.values)))
# df_polynomial_vals = df_polynomial.values
# print df_polynomial_vals

In [1]:
# Part 3
# Question 1
# Apply the VarianceThreshold
from sklearn.feature_selection import VarianceThreshold
# Cut-off below which a feature is dropped
# NOTE(review): .2 * (1 - .2) looks like the p * (1 - p) form from the scikit-learn example - confirm
variance_cutoff = .2 * (1 - .2)
low_variance_filter = VarianceThreshold(threshold=variance_cutoff)
heart_polynomial_prunned = low_variance_filter.fit_transform(heart_polynomial)
print(heart_polynomial_prunned.shape)
# There remain 17 features

NameError: name 'heart_polynomial' is not defined

In [None]:
# load some modules to help
# Axes3D is kept imported: older Matplotlib releases need this import to make
# the '3d' projection available.
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# extract labels
# Work on a copy: .values can share memory with the DataFrame, so binarising it
# in place would silently overwrite heart['HeartDisease'] as well.
heart_labels = heart['HeartDisease'].values.copy()
# Collapse every positive label into 1, leaving a 0/1 label per row
heart_labels[heart_labels > 0] = 1

# perform PCA
heart_polynomial_pca = PCA(n_components=3).fit_transform(heart_polynomial_values)

# Plot
# The 3D axes are created through the figure so they are attached to it;
# constructing Axes3D(fig) directly no longer adds the axes on current Matplotlib.
ax = plt.figure().add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=200)
ax.scatter(heart_polynomial_pca[:,0], heart_polynomial_pca[:,1], heart_polynomial_pca[:,2], c=heart_labels)


In [None]:
# Part 4
# Question 1
# a) Does the projection do a better job of separating the points?
    # A: No, the points are packed much more closely together.

# b) Does the PCA projection produce a clearer delineation of the patients with normal heart function
# and those with heart disease?
    # A: No, the PCA projection of the heart_polynomial data produced a much clearer delineation of the patients
    # with normal heart function and those with the disease.

# perform PCA
# Here PCA runs on the raw (unscaled, non-polynomial) feature values
heart_unlabeled_pca = PCA(n_components=3).fit_transform(heart_unlabeled.values)

# Plot
# The 3D axes are created through the figure so they are attached to it;
# constructing Axes3D(fig) directly no longer adds the axes on current Matplotlib.
ax = plt.figure().add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=200)
ax.scatter(heart_unlabeled_pca[:,0], heart_unlabeled_pca[:,1], heart_unlabeled_pca[:,2], c=heart_labels)
