In [53]:
import json
import requests
from nltk.corpus import stopwords # stopword examples, 'its', 'on', 'the', etc <---- will be helpful later
# most pythonistas will rename pandas as pd, numpy as np, and datetime as dt for short (you don't have to)
import pandas as pd
import numpy as np
import datetime as dt

In [54]:
path = 'stroke_data.csv'
stroke = pd.read_csv(path, converters={'age': pd.eval})

In [55]:
stroke

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
40905,1.0,38.0,0,0,0,4,1,120.94,29.7,1,0
40906,0.0,53.0,0,0,1,4,0,77.66,40.8,0,0
40907,1.0,32.0,0,0,1,2,0,231.95,33.2,0,0
40908,1.0,42.0,0,0,1,3,0,216.38,34.5,0,0


In [56]:
stroke.age

0        63.0
1        42.0
2        61.0
3        41.0
4        85.0
         ... 
40905    38.0
40906    53.0
40907    32.0
40908    42.0
40909    35.0
Name: age, Length: 40910, dtype: float64

In [57]:
stroke.smoking_status

0        1
1        0
2        1
3        0
4        1
        ..
40905    1
40906    0
40907    0
40908    0
40909    0
Name: smoking_status, Length: 40910, dtype: int64

In [58]:
stroke.age.unique()

array([ 63.,  42.,  61.,  41.,  85.,  55.,  82.,  17.,  31.,  39.,  26.,
        19.,  78.,  45.,  50.,  53.,  71.,  64.,  75.,  67.,  40.,  23.,
        33.,  65.,  83.,  79.,  62.,  58.,  34.,  27.,  68.,  52.,  49.,
        13.,  60.,  24.,  11.,  47.,  70.,  38.,  59.,  36.,  77.,  54.,
        93.,  29.,  51.,  21.,  56.,  20.,  46.,  72.,  37.,  18.,  66.,
        25.,  57.,  16.,  80.,  28.,  84.,  44.,  43.,  76.,  48.,   8.,
        14.,  88.,  69.,  22.,  15.,  32.,  35.,  73.,  81.,  74.,  30.,
        86.,  92.,   5.,  91.,   6.,  87.,   7.,   9.,  12.,  10.,  89.,
        90.,   4.,   1.,   3.,  -2.,  -5.,  -4.,  -3.,  -1.,   2.,   0.,
        -6.,  -9.,  95., 103.,  94.,  98.,  96., 102., 101.,  97.,  99.,
       100.])

In [59]:
len(stroke.age.unique())

111

In [60]:
1/111

0.009009009009009009

In [61]:
stroke.avg_glucose_level

0        228.69
1        105.92
2        171.23
3        174.12
4        186.21
          ...  
40905    120.94
40906     77.66
40907    231.95
40908    216.38
40909     95.01
Name: avg_glucose_level, Length: 40910, dtype: float64

In [62]:
stroke.avg_glucose_level.unique()

array([228.69, 105.92, 171.23, ..., 125.2 ,  82.99, 166.29])

In [63]:
len(stroke.avg_glucose_level.unique())

2903

In [64]:
1/2903

0.0003444712366517396

In [69]:
def create_features(age, smoker, married):
    
    return{'smoker_status': smoker, 'age': age, 'ever_married': married}

In [70]:
features_and_labels = zip(stroke.age, stroke.smoking_status, stroke.stroke, stroke.ever_married)
features_and_labels

<zip at 0x1e03e5ab740>

In [71]:
feature_sets = [ (create_features(age,smoker,married), stroke) for age, smoker,married, stroke in features_and_labels]
feature_sets

[({'smoker_status': 1, 'age': 63.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 42.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 61.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 41.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 85.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 55.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 82.0, 'ever_married': 1}, 0),
 ({'smoker_status': 0, 'age': 17.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 31.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 55.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 39.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 26.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 19.0, 'ever_married': 1}, 1),
 ({'smoker_status': 1, 'age': 78.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 45.0, 'ever_married': 1}, 0),
 ({'smoker_status': 1, 'age': 50.0, 'ever_married': 1}, 1),
 ({'smoker_status': 0, 'age': 53.0, 'eve

In [72]:
import math
import nltk

print(len(feature_sets))
split_num = math.floor(len(feature_sets)*.8)
print(split_num)

import random
random.shuffle(feature_sets)

40910
32728


In [73]:
training_set = feature_sets[:split_num]
testing_set = feature_sets[split_num:]

In [74]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [82]:
print(nltk.classify.accuracy(classifier, testing_set))

0.7894801253181907


In [83]:
len(feature_sets)

40910

In [84]:
68* .8

54.400000000000006

In [85]:
training_set = feature_sets[:54]
testing_set = feature_sets[54:]

In [86]:
import nltk
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [88]:
classifier.classify(create_features(63,1,1))

1

In [89]:
print(nltk.classify.accuracy(classifier, testing_set))

0.7894801253181907
