In [1]:
import pandas as pd

In [2]:
# import regular expression module 're'
import re

# Import NLTK. NLTK is a standard python library that provides a set of diverse algorithms for NLP.
import nltk
from nltk.corpus import stopwords

# Lematization 
from nltk.stem import WordNetLemmatizer

# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# Feature extraction using TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

import statsmodels.api as smf

from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [17]:
data=pd.read_csv("mtsamples.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [4]:
data.shape

(4999, 6)

# There are 4999 observations & 6 features

In [5]:
data.columns

Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')

In [6]:
data.medical_specialty.value_counts().shape

(40,)

In [7]:
data.medical_specialty.value_counts()

 Surgery                          1103
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        372
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  230
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Obstetrics / Gynecology           160
 Urology                           158
 Discharge Summary                 108
 ENT - Otolaryngology               98
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    62
 Psychiatry / Psychology            53
 Office Notes                       51
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentistry               

# There are 40 different classes of medical speciality

In [18]:
data.isnull().sum()

Unnamed: 0              0
description             0
medical_specialty       0
sample_name             0
transcription          33
keywords             1068
dtype: int64

# Drop the null values

In [19]:
data.dropna(axis=0,inplace=True)
data.shape

(3898, 6)

In [20]:
data1.isnull().sum()

Unnamed: 0           0
description          0
medical_specialty    0
sample_name          0
transcription        0
keywords             0
dtype: int64

# Remove classes having value counts < 50

In [21]:
data1=data.groupby('medical_specialty').filter(lambda x : len(x)>50)

In [22]:
data1

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
7,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,..."
11,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,"2-D STUDY,1. Mild aortic stenosis, widely calc...","cardiovascular / pulmonary, 2-d study, doppler..."
...,...,...,...,...,...,...
4967,4967,"Left heart cath, selective coronary angiogram...",Cardiovascular / Pulmonary,Angiogram & StarClose Closure,"EXAM: , Left heart cath, selective coronary an...","cardiovascular / pulmonary, heart cath, select..."
4968,4968,Selective coronary angiography of the right c...,Cardiovascular / Pulmonary,Angiography & Catheterization - 1,"INDICATION:, Acute coronary syndrome.,CONSENT...","cardiovascular / pulmonary, acute coronary syn..."
4971,4971,A sample note on Angina.,Cardiovascular / Pulmonary,Angina,"ANGINA, is chest pain due to a lack of oxygen ...","cardiovascular / pulmonary, lack of oxygen, he..."
4972,4972,Adenosine with nuclear scan as the patient un...,Cardiovascular / Pulmonary,Adenosine Nuclear Scan,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden...","cardiovascular / pulmonary, adenosine nuclear ..."


In [23]:
data1.medical_specialty.value_counts()

 Surgery                          1021
 Orthopedic                        303
 Cardiovascular / Pulmonary        280
 Radiology                         251
 Consult - History and Phy.        234
 Gastroenterology                  195
 Neurology                         168
 General Medicine                  146
 SOAP / Chart / Progress Notes     142
 Urology                           140
 Obstetrics / Gynecology           130
 ENT - Otolaryngology               84
 Neurosurgery                       81
 Ophthalmology                      79
 Discharge Summary                  77
 Nephrology                         63
 Hematology - Oncology              62
 Pain Management                    58
Name: medical_specialty, dtype: int64

In [24]:
data1.shape

(3514, 6)

# After filtering the 'medical_specialty' class values > 50, there are total 18 classes & 3514 observations in the database

In [27]:
data1.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
7,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,..."
11,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,"2-D STUDY,1. Mild aortic stenosis, widely calc...","cardiovascular / pulmonary, 2-d study, doppler..."


In [28]:
print('Sample transcription 1:'+data1.iloc[4]['transcription']+'\n')
print('Sample transcription 2:'+data1.iloc[14]['transcription']+'\n')

Sample transcription 1:2-D STUDY,1. Mild aortic stenosis, widely calcified, minimally restricted.,2. Mild left ventricular hypertrophy but normal systolic function.,3. Moderate biatrial enlargement.,4. Normal right ventricle.,5. Normal appearance of the tricuspid and mitral valves.,6. Normal left ventricle and left ventricular systolic function.,DOPPLER,1. There is 1 to 2+ aortic regurgitation easily seen, but no aortic stenosis.,2. Mild tricuspid regurgitation with only mild increase in right heart pressures, 30-35 mmHg maximum.,SUMMARY,1. Normal left ventricle.,2. Moderate biatrial enlargement.,3. Mild tricuspid regurgitation, but only mild increase in right heart pressures.

Sample transcription 2:Sample Address,RE:  Sample Patient,Wife's name:  Sample Name,Dear Sample Doctor:,Mr. Sample Patient was seen on Month DD, YYYY, describing a vasectomy 10 years ago and a failed vasectomy reversal done almost two years ago at the University of Michigan.  He has remained azoospermic postoper

# Data Cleaning

In [30]:
# Stopwords

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Clean the Text: Normalize the text, remove special characters & stopwords

In [39]:
#compile a regular expression pattern

special_character_remover = re.compile('[/(){}\[\]\|@,;]')
extra_symbol_remover = re.compile('[^a-z.]')
extra_space_remover=re.compile(' +')
STOPWORDS = set(stopwords.words('english'))

In [40]:
# Function to clean the text

def clean_text(text):
    text = text.lower()
    text = special_character_remover.sub(' ',text)
    text = extra_symbol_remover.sub(' ',text)
    text = extra_space_remover.sub(' ',text)
    text = text.replace('. .', '.')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [41]:
data1['transcription'] = data1['transcription'].apply(clean_text)

In [42]:
print('Sample transcription 1:'+data1.iloc[4]['transcription']+'\n')
print('Sample transcription 2:'+data1.iloc[14]['transcription']+'\n')

Sample transcription 1:study . mild aortic stenosis widely calcified minimally restricted . mild left ventricular hypertrophy normal systolic function . moderate biatrial enlargement . normal right ventricle . normal appearance tricuspid mitral valve . normal left ventricle left ventricular systolic function . doppler . aortic regurgitation easily seen aortic stenosis . mild tricuspid regurgitation mild increase right heart pressure mmhg maximum . summary . normal left ventricle . moderate biatrial enlargement . mild tricuspid regurgitation mild increase right heart pressure .

Sample transcription 2:sample address sample patient wife name sample name dear sample doctor mr. sample patient seen month dd yyyy describing vasectomy year ago failed vasectomy reversal done almost two year ago university michigan . remained azoospermic postoperatively . operative note suggests presence sperm sperm head right side time vasectomy reversal . state interested sperm harvesting cryopreservation pri

# Lemmatize the text

In [43]:
# Lemmatization function 

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)      # sent_tokenize() method to split a document or paragraph into sentences.
    
    for sentence in sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))    
    return ' '.join(wordlist) 

In [37]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
# Lemmatization

data1['transcription'] = data1['transcription'].apply(lemmatize_text)

In [45]:
print('Sample transcription 1:'+data1.iloc[4]['transcription']+'\n')
print('Sample transcription 2:'+data1.iloc[14]['transcription']+'\n')

Sample transcription 1:study . mild aortic stenosis widely calcified minimally restricted . mild left ventricular hypertrophy normal systolic function . moderate biatrial enlargement . normal right ventricle . normal appearance tricuspid mitral valve . normal left ventricle left ventricular systolic function . doppler . aortic regurgitation easily seen aortic stenosis . mild tricuspid regurgitation mild increase right heart pressure mmhg maximum . summary . normal left ventricle . moderate biatrial enlargement . mild tricuspid regurgitation mild increase right heart pressure .

Sample transcription 2:sample address sample patient wife name sample name dear sample doctor mr. sample patient seen month dd yyyy describing vasectomy year ago failed vasectomy reversal done almost two year ago university michigan . remained azoospermic postoperatively . operative note suggests presence sperm sperm head right side time vasectomy reversal . state interested sperm harvesting cryopreservation pri

# Feature extraction using TfidfVectorizer 

In [60]:
vectorizer=TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75,min_df=5, use_idf=True, smooth_idf=True,sublinear_tf=True, max_features=1000)

In [61]:
tfIdfMat  = vectorizer.fit_transform(data1['transcription'].tolist() )
feature_names = sorted(vectorizer.get_feature_names())
feature_names



['abc',
 'abdomen',
 'abdomen soft',
 'abdominal',
 'abdominal pain',
 'abdominal wall',
 'able',
 'abnormal',
 'abnormality',
 'abscess',
 'access',
 'achieved',
 'active',
 'activity',
 'acute',
 'addition',
 'additional',
 'adequate',
 'adhesion',
 'administered',
 'admission',
 'admitted',
 'advanced',
 'age',
 'ago',
 'air',
 'alcohol',
 'alert',
 'allergy',
 'allowed',
 'alternative',
 'anastomosis',
 'anesthesia',
 'anesthesia administered',
 'anesthesia general',
 'anesthesia general endotracheal',
 'anesthetic',
 'angle',
 'ankle',
 'anterior',
 'anterior chamber',
 'anteriorly',
 'antibiotic',
 'aorta',
 'aortic',
 'ap',
 'apparent',
 'appear',
 'appearance',
 'appeared',
 'appearing',
 'appears',
 'appendix',
 'applied',
 'applied patient',
 'appropriate',
 'appropriately',
 'approximated',
 'approximately',
 'approximately cm',
 'area',
 'arm',
 'artery',
 'aspect',
 'aspiration',
 'assessment',
 'associated',
 'atrial',
 'attachment',
 'attempt',
 'attention',
 'attention 

In [62]:
dense=tfIdfMat.todense()
denselist = dense.tolist()

In [63]:
TFIDF_Vec=pd.DataFrame(denselist,columns=feature_names)

In [64]:
TFIDF_Vec.head()

Unnamed: 0,abc,abdomen,abdomen soft,abdominal,abdominal pain,abdominal wall,able,abnormal,abnormality,abscess,...,withdrawn,work,wound,wrist,xylocaine,xyz,year,year old,year old female,year old male
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093279,0.090893,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
TFIDF_Vec.shape

(3514, 1000)

# Apply PCA to reduce the no. of features

In [66]:
pca = PCA(n_components=0.95)
tfIdfMat_reduced = pca.fit_transform(tfIdfMat.toarray())

In [67]:
TFIDF_Reduced=pd.DataFrame(tfIdfMat_reduced)
TFIDF_Reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,598,599,600,601,602,603,604,605,606,607
0,0.117237,0.358186,-0.047838,0.175494,-0.055999,0.068378,0.105614,-0.056121,0.005843,0.13525,...,0.008237,0.011681,0.006124,0.013785,0.001879,0.011706,0.008091,-0.002625,0.014928,-0.025785
1,0.125226,0.384963,-0.013548,0.177744,-0.102406,0.061155,0.075413,-0.025485,-0.015631,0.109294,...,-0.00271,0.003435,0.011461,0.0161,0.017639,-0.001435,0.012895,-0.006814,-0.002729,0.000771
2,0.128541,0.355461,-0.054982,0.211206,-0.075382,0.077673,0.062262,-0.029506,8e-06,0.028834,...,-0.00879,-0.020444,0.009991,-0.006723,-0.002493,-0.011524,-8.2e-05,0.01428,-0.021199,0.005582
3,0.114096,0.306139,-0.036838,0.107297,-0.010754,0.125131,0.119144,-0.06531,-0.04779,0.12276,...,-0.002824,-0.012592,0.000235,0.001403,-0.006497,0.031626,-0.003231,-0.020758,0.011329,0.007847
4,0.140755,0.386922,-0.018073,0.221401,-0.058888,0.064055,0.066675,-0.03041,0.004317,0.13664,...,0.000501,-0.014547,-0.027126,-0.02662,-0.031514,-0.010197,0.001939,0.041212,-0.018484,-0.022258


# After applying PCA the no. of features reduced to 608 from 1000

In [68]:
labels = data1['medical_specialty'].tolist()
#labels


In [69]:
# Label encoder for class labels

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
print(encoded_labels)

[0 0 0 ... 0 0 0]


In [70]:
# Dataframe for class & class labels

classes=pd.DataFrame(encoded_labels,labels)
classes.head(20).T

Unnamed: 0,Cardiovascular / Pulmonary,Cardiovascular / Pulmonary.1,Cardiovascular / Pulmonary.2,Cardiovascular / Pulmonary.3,Cardiovascular / Pulmonary.4,Cardiovascular / Pulmonary.5,Urology,General Medicine,Urology.1,Urology.2,Urology.3,Urology.4,Urology.5,Urology.6,Urology.7,Urology.8,Urology.9,Urology.10,Urology.11,Urology.12
0,0,0,0,0,0,0,17,5,17,17,17,17,17,17,17,17,17,17,17,17


In [71]:
# Unique values from class labels

y=[]
for i in labels:
    if i not in y:
        y.append(i)
print(y)

[' Cardiovascular / Pulmonary', ' Urology', ' General Medicine', ' Surgery', ' SOAP / Chart / Progress Notes', ' Radiology', ' Pain Management', ' Orthopedic', ' Ophthalmology', ' Obstetrics / Gynecology', ' Neurosurgery', ' Neurology', ' Nephrology', ' Hematology - Oncology', ' Gastroenterology', ' ENT - Otolaryngology', ' Discharge Summary', ' Consult - History and Phy.']


In [72]:
# Unique values from class codes

x=[]
for i in encoded_labels:
    if i not in x:
        x.append(i)
print(x)

[0, 17, 5, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1]


In [73]:
# Dataframe to display class & class labels

df=pd.DataFrame(x,y)
df.T

Unnamed: 0,Cardiovascular / Pulmonary,Urology,General Medicine,Surgery,SOAP / Chart / Progress Notes,Radiology,Pain Management,Orthopedic,Ophthalmology,Obstetrics / Gynecology,Neurosurgery,Neurology,Nephrology,Hematology - Oncology,Gastroenterology,ENT - Otolaryngology,Discharge Summary,Consult - History and Phy.
0,0,17,5,16,15,14,13,12,11,10,9,8,7,6,4,3,2,1


In [74]:
X_train, X_test, y_train, y_test = train_test_split(tfIdfMat_reduced, encoded_labels, stratify=labels,random_state=1)

In [75]:
print("Train Data size:", X_train.shape)
print("Test Data size:", X_test.shape)

Train Data size: (2635, 608)
Test Data size: (879, 608)


# Build SVM Model

In [76]:
svm=SVC()
svm.fit(X_train,y_train)
y_pred=svm.predict(X_test)
#y_pred

In [77]:
print("Accuracy: \t",accuracy_score(y_test, y_pred))
print("Classification Report: \n",classification_report(y_test, y_pred))
#print("Confusion Matrix: \n",confusion_matrix(y_test, y_pred))

Accuracy: 	 0.27189988623435724
Classification Report: 
               precision    recall  f1-score   support

           0       0.26      0.20      0.23        70
           1       0.26      0.36      0.30        59
           2       0.26      0.32      0.29        19
           3       0.25      0.10      0.14        21
           4       0.00      0.00      0.00        49
           5       0.24      0.19      0.21        37
           6       0.00      0.00      0.00        15
           7       0.00      0.00      0.00        16
           8       0.22      0.24      0.23        42
           9       0.00      0.00      0.00        20
          10       0.00      0.00      0.00        32
          11       0.22      0.20      0.21        20
          12       0.08      0.05      0.06        76
          13       0.90      0.64      0.75        14
          14       0.16      0.14      0.15        63
          15       0.18      0.17      0.17        36
          16       0.36 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(svm, param_grid)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=100, gamma=0.001)


In [78]:
svm=SVC(C=100,kernel='rbf',gamma=0.001)
svm.fit(X_train,y_train)
y_pred=svm.predict(X_test)

In [79]:
print("Accuracy: \t",accuracy_score(y_test, y_pred))
print("Classification Report: \n",classification_report(y_test, y_pred))

Accuracy: 	 0.42775881683731515
Classification Report: 
               precision    recall  f1-score   support

           0       0.51      0.33      0.40        70
           1       0.31      0.85      0.45        59
           2       0.48      0.74      0.58        19
           3       0.00      0.00      0.00        21
           4       0.00      0.00      0.00        49
           5       0.00      0.00      0.00        37
           6       0.00      0.00      0.00        15
           7       0.00      0.00      0.00        16
           8       0.44      0.36      0.39        42
           9       0.00      0.00      0.00        20
          10       0.00      0.00      0.00        32
          11       0.00      0.00      0.00        20
          12       0.21      0.04      0.07        76
          13       0.00      0.00      0.00        14
          14       0.38      0.43      0.40        63
          15       0.00      0.00      0.00        36
          16       0.47 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest Classifier

In [80]:
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 100) 
 
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
 
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

In [81]:
print("Accuracy: \t",accuracy_score(y_test, y_pred))
print("Classification Report: \n",classification_report(y_test, y_pred))

Accuracy: 	 0.15358361774744028
Classification Report: 
               precision    recall  f1-score   support

           0       0.14      0.11      0.13        70
           1       0.21      0.27      0.24        59
           2       0.12      0.11      0.11        19
           3       0.00      0.00      0.00        21
           4       0.00      0.00      0.00        49
           5       0.11      0.05      0.07        37
           6       0.00      0.00      0.00        15
           7       0.00      0.00      0.00        16
           8       0.14      0.17      0.15        42
           9       0.00      0.00      0.00        20
          10       0.03      0.03      0.03        32
          11       0.06      0.05      0.05        20
          12       0.02      0.01      0.02        76
          13       1.00      0.29      0.44        14
          14       0.11      0.11      0.11        63
          15       0.08      0.06      0.07        36
          16       0.23 

In [108]:
param_grid = {'max_depth': [5,6,7,8,9],
              'n_estimators': [50,100,200,300,500],
              'max_features': [20,30,40,60,100]}
 
grid = GridSearchCV(clf, param_grid)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'max_depth': 5, 'max_features': 100, 'n_estimators': 200}
RandomForestClassifier(max_depth=5, max_features=100, n_estimators=200)
