## Data Cleaning - framingham.csv

### Import libraries and load the data

In [10]:
import pandas as pd
# Read the raw dataset from disk.
df = pd.read_csv('GrpAssignment/framingham.csv')

# A record counts as incomplete as soon as any one of its columns is NaN.
total_rows = len(df)
num_nan_rows = df.isna().any(axis=1).sum()
print(f'Number of records in total: {total_rows}\nNumber of records that has NaN values: {num_nan_rows}')

Number of records in total: 4238
Number of records that has NaN values: 582


### Drop rows with missing values

In [11]:
# Keep only complete records: a row is discarded if any of its columns is NaN.
df = df.dropna(axis=0, how='any')
print(f'Number of records after removing NaN values: {len(df)}')

Number of records after removing NaN values: 3656


### Dataset overview

In [12]:
# Preview the first rows and the column names. A notebook only renders a bare
# expression when it is the last statement of the cell, so these are printed
# explicitly; as bare expressions they were evaluated and silently discarded.
print(df.head())
print(df.columns)

# List the distinct values held by every column.
for col in df.columns:
    print("Unique values in column '{}' are:".format(col))
    print(df[col].unique())

Unique values in column 'id' are:
[   0    1    2 ... 4233 4234 4237]
Unique values in column 'male' are:
[1 0]
Unique values in column 'age' are:
[39 46 48 61 43 63 45 52 50 41 38 42 44 47 35 60 36 59 54 37 53 49 65 51
 62 40 56 67 57 66 64 55 58 68 34 33 32 70 69]
Unique values in column 'education' are:
[4. 2. 1. 3.]
Unique values in column 'currentSmoker' are:
[0 1]
Unique values in column 'cigsPerDay' are:
[ 0. 20. 30. 23. 15. 10.  5. 35. 43.  1. 40.  3.  9.  2. 12.  4. 18. 60.
 25. 45.  8. 13. 11.  7.  6. 38. 50. 29. 17. 16. 19. 70. 14.]
Unique values in column 'BPMeds' are:
[0. 1.]
Unique values in column 'prevalentStroke' are:
[0 1]
Unique values in column 'prevalentHyp' are:
[0 1]
Unique values in column 'diabetes' are:
[0 1]
Unique values in column 'totChol' are:
[195. 250. 245. 225. 285. 228. 205. 313. 260. 254. 247. 294. 332. 221.
 232. 291. 190. 234. 215. 270. 272. 295. 226. 209. 214. 178. 233. 180.
 243. 237. 311. 208. 252. 261. 179. 267. 216. 240. 266. 255. 185. 220.
 23

### Normalize data

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column except 'id' to the [0, 1] range. 'id' is kept out
# of the scaler and re-attached unscaled as the first column.
#
# df itself is left untouched (drop() returns a new frame instead of popping
# the column in place), so this cell can be re-run without a KeyError on 'id',
# and the builtin id() is no longer shadowed by a variable of the same name.
features = df.drop(columns='id')
record_ids = df['id'].to_numpy()

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# The frame built from the scaler output has a fresh 0..n-1 index, so the ids
# are inserted positionally (as a plain array) rather than aligned by label.
df_normalized.insert(0, 'id', record_ids)

df_normalized.head(20)

Unnamed: 0,id,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,1.0,0.184211,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168378,0.106383,0.232804,0.277024,0.363636,0.10452,0.0
1,1,0.0,0.368421,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.281314,0.177305,0.349206,0.31968,0.515152,0.101695,0.0
2,2,1.0,0.421053,0.0,1.0,0.285714,0.0,0.0,0.0,0.0,0.271047,0.208038,0.338624,0.237518,0.313131,0.084746,0.0
3,3,0.0,0.763158,0.666667,1.0,0.428571,0.0,0.0,1.0,0.0,0.229979,0.314421,0.497354,0.316045,0.212121,0.177966,1.0
4,4,0.0,0.368421,0.666667,1.0,0.328571,0.0,0.0,0.0,0.0,0.353183,0.219858,0.380952,0.183228,0.414141,0.127119,0.0
5,5,0.0,0.289474,0.333333,0.0,0.0,0.0,0.0,1.0,0.0,0.23614,0.456265,0.656085,0.357731,0.333333,0.166667,0.0
6,6,0.0,0.815789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188912,0.257683,0.243386,0.425836,0.161616,0.127119,1.0
7,7,0.0,0.342105,0.333333,1.0,0.285714,0.0,0.0,0.0,0.0,0.410678,0.078014,0.243386,0.148812,0.353535,0.107345,0.0
8,8,1.0,0.526316,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.301848,0.274232,0.433862,0.262239,0.323232,0.110169,0.0
9,9,1.0,0.289474,0.0,1.0,0.428571,0.0,0.0,1.0,0.0,0.229979,0.371158,0.624339,0.195589,0.494949,0.135593,0.0


### Write to new CSV

In [18]:
# Persist the normalized frame; index=False keeps the row index out of the CSV.
df_normalized.to_csv(path_or_buf='GrpAssignment/cleaned_data_fram.csv', index=False)