In [1]:
from __future__ import print_function
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import csv

In [2]:
# Bag-of-words vectorizer; it is fitted later, on the training split only.
vect = CountVectorizer()

# Parallel lists: data_feature[i] is the free-text description and
# data_class[i] is the category label taken from the same CSV row.
data_feature = []
data_class = []
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the parser unmodified.
with open('train.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data_feature.append(row['Descript'])
        data_class.append(row['Category'])
# No explicit close(): leaving the with-block already closes the file.

In [4]:
# Number of category labels collected from train.csv (one per row read).
len(data_class)

90490

In [5]:
# Peek at the first ten description strings.
data_feature[:10]

['.8 MM DIA VERT  .5 MM DIA HORZ Choroidal nevus',
 '() Chronic Central Serous Chorioretinopathy',
 '() Neovascular AMD',
 '(2) Atrophic Retinal Holes',
 '(2) Horseshoe Tears  s/p laser 2002',
 '(2) Treated Retinal Tears  s/p Barrier Laser on 12/23/2014',
 '(2) Treated Retinal Tears  s/p laser demarcation #2 on 2/2/2018',
 '(2nd) Retinal Detachment',
 '(3) Treated horseshoe retinal tears  s/p Laser demarcation 10/5/2017',
 '(3) Treated Retinal Tears  each associated with a localized retinal detachment  s/p laser #2 on 01/28/2016']

In [6]:
# Peek at the first ten category labels.
data_class[:10]

['D31.32',
 'H35.712',
 'H35.32',
 'H33.323',
 'H33.311',
 'H33.301',
 'H33.301',
 'H33.001',
 'H33.311',
 'H33.301']

In [7]:
# Combine the two parallel lists into one frame: 'label' holds the category
# string and 'message' holds the description text of the same row.
df = pd.DataFrame(dict(label=data_class, message=data_feature))

In [8]:
# Show the first ten rows of the assembled frame.
df.head(10)

Unnamed: 0,label,message
0,D31.32,.8 MM DIA VERT .5 MM DIA HORZ Choroidal nevus
1,H35.712,() Chronic Central Serous Chorioretinopathy
2,H35.32,() Neovascular AMD
3,H33.323,(2) Atrophic Retinal Holes
4,H33.311,(2) Horseshoe Tears s/p laser 2002
5,H33.301,(2) Treated Retinal Tears s/p Barrier Laser o...
6,H33.301,(2) Treated Retinal Tears s/p laser demarcati...
7,H33.001,(2nd) Retinal Detachment
8,H33.311,(3) Treated horseshoe retinal tears s/p Laser...
9,H33.301,(3) Treated Retinal Tears each associated wit...


In [9]:
# Number of rows per distinct label, most frequent first.
df.label.value_counts()

H35.32                                 5468
H33.001                                2178
H35.3211                               2155
H33.002                                2098
H35.3221                               1961
E11.351                                1375
H33.301                                1235
H33.302                                1185
H35.81                                 1160
H35.3231                               1102
H35.371                                1090
H35.372                                1009
E11.3513  Z79.4                         961
H35.3212                                956
H35.3222                                918
E11.351  Z79.4                          886
H43.12                                  882
H35.31                                  877
H43.11                                  846
H35.3232                                804
H33.321                                 801
H35.351                                 792
H35.352                         

In [10]:
import collections
from collections import Counter

In [11]:
# (label, count) pairs for every distinct label, ordered from most to least frequent.
vocab = Counter(data_class).most_common()

In [12]:
import numpy as np

# Distinct labels ordered from most to least frequent; a label's position in
# this array serves as its integer class code.
# The array is built straight from data_class rather than from the previous
# value of `vocab`, so re-running this cell gives the same result instead of
# failing while unpacking (label, count) pairs from an array of strings.
vocab = np.array([word for word, _ in Counter(data_class).most_common()])

In [13]:
# Encode each label as the position it occupies in `vocab`.
df['label_num'] = df['label'].map(
    dict(zip(vocab, range(len(vocab))))
)

In [14]:
# Show the first ten rows again, now including the integer label_num column.
df.head(10)

Unnamed: 0,label,message,label_num
0,D31.32,.8 MM DIA VERT .5 MM DIA HORZ Choroidal nevus,48
1,H35.712,() Chronic Central Serous Chorioretinopathy,59
2,H35.32,() Neovascular AMD,0
3,H33.323,(2) Atrophic Retinal Holes,115
4,H33.311,(2) Horseshoe Tears s/p laser 2002,29
5,H33.301,(2) Treated Retinal Tears s/p Barrier Laser o...,6
6,H33.301,(2) Treated Retinal Tears s/p laser demarcati...,6
7,H33.001,(2nd) Retinal Detachment,1
8,H33.311,(3) Treated horseshoe retinal tears s/p Laser...,29
9,H33.301,(3) Treated Retinal Tears each associated wit...,6


In [15]:
# Scikit Learn
X = df.message
y = df.label_num
print(X.shape)
print(y.shape)

(90490,)
(90490,)


In [16]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# No test_size is given, so the library default split ratio applies.
# random_state=1 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(67867,)
(22623,)
(67867,)
(22623,)




In [17]:
# Learn the token vocabulary from the training descriptions only and build
# the training document-term matrix in the same call.
X_train_dtm = vect.fit_transform(X_train)

# Display the matrix summary.
X_train_dtm

<67867x7298 sparse matrix of type '<class 'numpy.int64'>'
	with 355568 stored elements in Compressed Sparse Row format>

In [18]:
# Encode the test descriptions with the already-fitted vectorizer
# (transform only, no re-fitting on the test split).
X_test_dtm = vect.transform(X_test)

# Display the matrix summary.
X_test_dtm

<22623x7298 sparse matrix of type '<class 'numpy.int64'>'
	with 117516 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier, constructed with its default hyperparameters.
nb = MultinomialNB()

In [20]:
# Fit the classifier on the training document-term matrix and time the call.
%time nb.fit(X_train_dtm, y_train)

CPU times: user 1.8 s, sys: 844 ms, total: 2.64 s
Wall time: 2.74 s


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Predicted integer label code for every test document.
y_pred_class = nb.predict(X_test_dtm)

In [22]:
from sklearn import metrics
# Accuracy on the held-out test split: predicted label codes vs. true ones.
metrics.accuracy_score(y_test, y_pred_class)

0.2903240065420148