In [1]:
import pandas as pd

In [2]:
# Load the raw transcription samples; the path is relative to the working directory.
df = pd.read_csv('transcription_samples.csv')

In [3]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [4]:
# Keep only the rows that actually carry a transcription text.
df = df[df['transcription'].notna()]

In [5]:
# Number of transcriptions per medical specialty (shows the class imbalance).
df.medical_specialty.value_counts()

 Surgery                          1088
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        371
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  224
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Urology                           156
 Obstetrics / Gynecology           155
 Discharge Summary                 108
 ENT - Otolaryngology               96
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    61
 Psychiatry / Psychology            53
 Office Notes                       50
 Podiatry                           47
 Dermatology                        29
 Dentistry                          27
 Cosmetic / Plastic Surge

In [6]:
# Sanity check: count null vs. non-null transcriptions after the dropna step.
df.transcription.isnull().value_counts()

False    4966
Name: transcription, dtype: int64

In [7]:
def clean_medical_specialty(df, taken=(' Surgery', ' Consult - History and Phy.',
                                       ' Cardiovascular / Pulmonary', ' Orthopedic')):
    """Return only the rows whose ``medical_specialty`` is one of ``taken``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``medical_specialty`` column. It is not modified.
    taken : iterable of str, optional
        Specialty labels to keep. The labels carry a leading space, exactly as
        they appear in the data. Defaults to the four specialties used by this
        notebook.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and order.

    Notes
    -----
    A single ``isin`` mask is used instead of filtering once per unwanted
    label. Besides being one pass, it also drops rows whose specialty is
    missing: comparing with ``!=`` against NaN/None is True for every row, so
    the per-label loop silently kept those rows.
    """
    return df[df.medical_specialty.isin(list(taken))]

# df = clean_medical_specialty(df)

In [8]:
######### balancing dataset
def balance_dataset(df, n_per_class=355,
                    specialties=(' Consult - History and Phy.',
                                 ' Cardiovascular / Pulmonary',
                                 ' Orthopedic',
                                 ' Surgery')):
    """Build a class-balanced frame by truncating each specialty to a common size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``medical_specialty`` column. It is not modified.
    n_per_class : int, optional
        Maximum number of rows kept per specialty. The first ``n_per_class``
        rows in ``df`` order are taken; a specialty with fewer rows contributes
        all of them. Defaults to 355.
    specialties : iterable of str, optional
        Specialty labels to keep, in the order their rows should appear in the
        result. Defaults to the four specialties used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The per-specialty slices concatenated in ``specialties`` order, keeping
        the original index labels.
    """
    frames = [
        df[df.medical_specialty == specialty].iloc[:n_per_class]
        for specialty in specialties
    ]
    if not frames:
        # pd.concat raises on an empty list; an empty selection yields an empty frame.
        return df.iloc[0:0]
    return pd.concat(frames)

# Replace df with the balanced subset; from here on df holds only the selected specialties.
df = balance_dataset(df)

In [9]:
# Row/column count after balancing.
df.shape

(1420, 6)

In [10]:
# Column dtypes before the label column is converted to categorical.
df.dtypes

Unnamed: 0            int64
description          object
medical_specialty    object
sample_name          object
transcription        object
keywords             object
dtype: object

In [11]:
# Convert the label column to pandas' categorical dtype so integer codes can be read from it.
df['medical_specialty'] = df['medical_specialty'].astype('category')

In [12]:
# Store each row's integer category code next to its text label.
df["medical_specialty_cat"] = df["medical_specialty"].cat.codes

In [13]:
# Transcription texts as a plain Python list, in df row order.
messages = df.transcription.tolist()

In [14]:
########## map each integer class code to its specialty label #############
# For every code 0..3, take the label of the first row carrying that code.
classes = {
    code: df.loc[df.medical_specialty_cat == code, 'medical_specialty'].iloc[0]
    for code in range(4)
}
# Individual names kept for any later cell that refers to them.
class0, class1, class2, class3 = (classes[code] for code in range(4))

In [15]:
# Show the code -> specialty mapping used to interpret predicted class codes.
classes

{0: ' Cardiovascular / Pulmonary',
 1: ' Consult - History and Phy.',
 2: ' Orthopedic',
 3: ' Surgery'}

In [16]:
#Data cleaning and preprocessing
import re
import nltk
# One-time corpus downloads needed by the calls below (uncomment on a fresh environment):
# nltk.download('stopwords')
# nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()  # not used by the loop below; kept in case other cells reference it
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension re-created the list for every single token and searched it linearly.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

# corpus[i] is the cleaned text of messages[i]: letters only, lower-cased,
# stop words removed, remaining tokens lemmatized and re-joined with spaces.
corpus = []
for raw_message in messages:
    tokens = non_letters.sub(' ', str(raw_message)).lower().split()
    kept_tokens = [lemmatizer.lemmatize(token) for token in tokens
                   if token not in english_stopwords]
    corpus.append(' '.join(kept_tokens))

In [17]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary / IDF weights from the cleaned corpus and vectorise it in one
# call, then densify so the estimators below receive a plain NumPy array.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split, so test documents contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()

In [18]:
# (number of documents, vocabulary size) of the dense TF-IDF matrix.
X.shape

(1420, 14747)

In [19]:
# Transforming a single document yields one row with the same number of columns as X.
vectorizer.transform([corpus[0]]).shape

(1, 14747)

In [20]:
# Dense view of that single-document TF-IDF row.
vectorizer.transform([corpus[0]]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Integer class labels; rows follow df order, the same order X was built in via messages/corpus.
y = df["medical_specialty"].cat.codes

In [22]:
# Train Test Split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [23]:
# Oversample the training split only, so the test split keeps its original samples.
# fit_resample replaces the old fit_sample alias, which newer imbalanced-learn
# releases no longer provide.
# NOTE: this cell rebinds X_train / y_train, so re-running it resamples already-resampled data.
sm = SMOTE(random_state=2)
X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

In [24]:
# Feature-vector length of a single test sample.
X_test[0].shape

(14747,)

In [25]:
# Classifier implementations that are trained and compared in the cells below

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [27]:
################# NB Classifier
# Multinomial Naive Bayes with default settings, fitted on the resampled training data.
NB_model = MultinomialNB().fit(X_train, y_train)

In [28]:
# Predicted class codes for the held-out test features.
naive_bayes_prediction = NB_model.predict(X_test)

In [30]:
# Display the predicted class codes (the `classes` dict maps codes to specialty names).
naive_bayes_prediction

array([3, 3, 3, 3, 1, 2, 3, 1, 0, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 2, 0,
       3, 2, 3, 0, 2, 1, 3, 3, 0, 2, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 3, 2,
       3, 2, 2, 3, 2, 0, 1, 1, 3, 0, 1, 1, 1, 1, 0, 3, 1, 3, 3, 1, 1, 1,
       3, 0, 0, 3, 1, 1, 1, 0, 2, 1, 1, 2, 0, 0, 3, 2, 3, 2, 1, 2, 1, 1,
       2, 2, 1, 2, 1, 2, 1, 3, 1, 1, 1, 2, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3,
       1, 0, 1, 1, 1, 2, 3, 2, 3, 1, 1, 3, 3, 0, 2, 1, 3, 1, 2, 1, 3, 0,
       2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 1, 0, 3, 3, 1, 3, 2, 1, 1, 2, 3, 0,
       3, 3, 2, 1, 3, 3, 1, 1, 1, 0, 0, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1,
       1, 1, 1, 1, 1, 0, 0, 3, 2, 1, 0, 1, 0, 3, 1, 3, 2, 1, 0, 3, 3, 1,
       3, 1, 0, 3, 1, 0, 3, 1, 1, 1, 0, 1, 2, 3, 1, 1, 1, 1, 2, 1, 1, 2,
       3, 0, 1, 2, 3, 1, 3, 2, 3, 0, 2, 3, 1, 2, 0, 2, 3, 2, 0, 1, 3, 1,
       1, 3, 1, 3, 0, 3, 1, 3, 2, 2, 1, 2, 1, 1, 3, 1, 1, 0, 1, 0, 1, 1,
       1, 2, 3, 3, 2, 2, 2, 2, 1, 3, 1, 3, 0, 2, 1, 1, 0, 2, 0, 1],
      dtype=int8)

In [31]:
################# SVC Classifier
# Support vector classifier with a polynomial kernel; other hyper-parameters are left at defaults.
SVC_model = SVC(kernel='poly').fit(X_train, y_train)

In [32]:
# Predicted class codes for the held-out test features.
SVC_prediction = SVC_model.predict(X_test)

In [33]:
# Display the SVC predictions as class codes.
SVC_prediction

array([3, 3, 3, 3, 1, 3, 3, 1, 0, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 2, 0,
       3, 2, 3, 0, 2, 1, 3, 3, 0, 2, 3, 0, 2, 2, 3, 1, 1, 2, 0, 1, 3, 2,
       3, 3, 2, 3, 2, 0, 1, 1, 3, 0, 1, 1, 1, 1, 0, 3, 1, 3, 3, 1, 1, 1,
       3, 3, 0, 3, 1, 1, 1, 3, 2, 1, 1, 2, 3, 0, 3, 2, 0, 2, 1, 2, 1, 1,
       2, 2, 1, 2, 2, 2, 1, 0, 1, 1, 1, 2, 1, 3, 3, 1, 1, 2, 3, 2, 3, 3,
       0, 0, 1, 1, 1, 2, 3, 2, 3, 1, 0, 3, 3, 0, 2, 1, 3, 1, 3, 1, 3, 0,
       2, 1, 1, 1, 1, 1, 3, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 3, 3, 0,
       3, 3, 2, 1, 3, 3, 1, 1, 1, 0, 0, 1, 3, 3, 3, 3, 3, 1, 1, 0, 2, 0,
       1, 1, 2, 1, 3, 0, 3, 3, 3, 2, 0, 1, 0, 0, 1, 3, 2, 1, 0, 3, 3, 1,
       3, 0, 3, 3, 1, 0, 3, 1, 2, 2, 0, 1, 2, 3, 1, 1, 0, 3, 3, 1, 1, 2,
       3, 0, 1, 3, 3, 1, 0, 2, 3, 0, 2, 3, 1, 2, 0, 2, 3, 2, 0, 1, 0, 1,
       1, 3, 1, 3, 0, 3, 1, 3, 2, 2, 3, 2, 1, 1, 3, 1, 1, 0, 1, 0, 0, 1,
       2, 3, 3, 3, 2, 3, 2, 2, 1, 3, 1, 3, 3, 3, 1, 1, 0, 2, 0, 1],
      dtype=int8)

In [34]:
################# KNN Classifier
# k-nearest-neighbours classifier voting over the 4 closest training samples.
KNN_model = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

In [35]:
# Predicted class codes for the held-out test features.
KNN_prediction = KNN_model.predict(X_test)

In [36]:
# Display the KNN predictions as class codes.
KNN_prediction

array([3, 0, 3, 3, 1, 3, 3, 1, 0, 1, 0, 2, 1, 1, 1, 3, 0, 1, 1, 3, 2, 0,
       3, 1, 3, 0, 2, 1, 3, 3, 0, 2, 3, 0, 2, 2, 3, 1, 1, 0, 0, 1, 3, 2,
       3, 2, 2, 3, 2, 0, 1, 1, 3, 0, 1, 1, 1, 1, 0, 0, 1, 3, 3, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 3, 2, 1, 0, 2, 0, 0, 3, 3, 0, 2, 1, 2, 1, 1,
       2, 3, 1, 2, 2, 2, 2, 0, 1, 1, 1, 2, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3,
       0, 0, 1, 1, 1, 2, 3, 2, 3, 0, 0, 3, 3, 0, 2, 1, 3, 1, 3, 1, 0, 0,
       2, 1, 0, 1, 1, 1, 0, 2, 2, 1, 1, 1, 3, 3, 2, 3, 2, 1, 1, 2, 3, 0,
       3, 3, 2, 1, 3, 3, 1, 1, 1, 0, 0, 1, 3, 3, 3, 3, 3, 1, 1, 0, 2, 1,
       1, 0, 2, 1, 3, 0, 3, 3, 2, 1, 0, 0, 0, 0, 1, 3, 2, 1, 0, 3, 3, 1,
       3, 0, 3, 3, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3, 0, 1, 0, 1, 2, 1, 1, 2,
       3, 0, 1, 2, 3, 1, 3, 2, 3, 0, 2, 0, 1, 2, 0, 2, 3, 2, 0, 1, 3, 1,
       1, 3, 1, 3, 0, 3, 1, 3, 2, 2, 1, 2, 1, 0, 3, 0, 1, 0, 0, 0, 1, 1,
       2, 2, 3, 3, 2, 2, 2, 2, 0, 3, 1, 3, 3, 2, 1, 1, 0, 2, 0, 1],
      dtype=int8)

In [37]:
################# DecisionTree Classifier
# Single decision tree; random_state is fixed so repeated fits give the same tree.
DecisionTree_model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [38]:
# Predicted class codes for the held-out test features.
decisionTree_prediction = DecisionTree_model.predict(X_test)

In [39]:
# Display the decision-tree predictions as class codes.
decisionTree_prediction

array([3, 3, 2, 3, 1, 3, 3, 1, 0, 0, 3, 2, 1, 1, 3, 0, 1, 1, 1, 3, 2, 0,
       3, 2, 3, 2, 2, 1, 3, 3, 0, 2, 0, 0, 2, 2, 3, 1, 1, 2, 0, 0, 3, 2,
       0, 3, 1, 3, 2, 2, 1, 2, 3, 1, 1, 1, 1, 3, 0, 2, 0, 0, 3, 0, 0, 1,
       3, 0, 0, 2, 1, 1, 1, 3, 2, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 2, 1, 0,
       2, 2, 1, 1, 1, 3, 1, 0, 2, 1, 2, 2, 1, 2, 0, 3, 0, 2, 3, 2, 1, 3,
       0, 0, 1, 1, 1, 2, 3, 2, 0, 1, 0, 3, 3, 0, 2, 0, 3, 2, 1, 1, 3, 1,
       2, 1, 0, 1, 1, 1, 3, 2, 2, 0, 0, 0, 3, 3, 2, 3, 2, 1, 1, 3, 3, 0,
       3, 3, 2, 1, 0, 3, 1, 1, 1, 3, 0, 1, 3, 0, 3, 3, 3, 0, 1, 0, 2, 0,
       1, 2, 2, 1, 1, 0, 3, 3, 2, 2, 0, 1, 0, 0, 1, 2, 2, 0, 0, 3, 3, 1,
       3, 0, 3, 0, 0, 0, 3, 2, 2, 2, 0, 1, 2, 3, 0, 1, 1, 3, 3, 1, 1, 2,
       3, 3, 1, 2, 3, 0, 0, 2, 0, 0, 3, 3, 1, 2, 0, 2, 3, 2, 0, 1, 0, 0,
       1, 3, 1, 3, 2, 3, 1, 3, 2, 2, 3, 2, 1, 1, 3, 0, 1, 2, 0, 3, 0, 1,
       2, 3, 3, 3, 3, 3, 2, 2, 1, 0, 2, 0, 0, 3, 1, 1, 0, 3, 0, 1],
      dtype=int8)

In [40]:
################# Random forest Classifier
# Forest of 100 trees. random_state is pinned, as in the decision-tree and
# logistic-regression cells, so a re-run reproduces the same model and scores.
RandomForest_model=RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train,y_train)

In [41]:
# Predicted class codes for the held-out test features.
random_forest_prediction=RandomForest_model.predict(X_test)

In [42]:
# Display the random-forest predictions as class codes.
random_forest_prediction

array([3, 3, 3, 3, 1, 3, 3, 1, 3, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 2, 0,
       3, 2, 3, 1, 2, 1, 3, 3, 0, 2, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 3, 2,
       3, 3, 2, 3, 2, 0, 1, 1, 3, 0, 1, 1, 1, 1, 0, 3, 1, 3, 3, 1, 1, 1,
       3, 0, 0, 3, 1, 1, 1, 0, 2, 1, 1, 2, 3, 0, 3, 2, 0, 2, 1, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 3, 3, 3, 1, 2, 3, 2, 2, 3,
       0, 0, 1, 1, 1, 3, 3, 2, 3, 1, 0, 3, 3, 0, 2, 1, 3, 1, 2, 1, 3, 0,
       2, 1, 0, 1, 1, 1, 3, 3, 2, 0, 1, 0, 3, 3, 2, 3, 2, 1, 1, 3, 3, 0,
       3, 3, 2, 1, 3, 3, 1, 1, 1, 0, 0, 1, 3, 3, 3, 3, 3, 1, 1, 0, 2, 0,
       1, 1, 2, 1, 1, 0, 3, 3, 3, 2, 0, 1, 0, 0, 1, 3, 2, 1, 0, 3, 3, 1,
       3, 0, 3, 3, 1, 0, 3, 1, 2, 2, 0, 1, 2, 3, 1, 1, 1, 3, 3, 1, 1, 2,
       3, 0, 1, 3, 3, 1, 0, 2, 3, 0, 3, 3, 1, 2, 0, 2, 3, 2, 0, 1, 3, 1,
       1, 3, 1, 3, 0, 3, 1, 3, 2, 2, 3, 2, 1, 0, 3, 0, 1, 0, 1, 0, 0, 1,
       2, 3, 3, 3, 2, 3, 2, 2, 1, 3, 1, 3, 0, 3, 1, 1, 0, 2, 0, 1],
      dtype=int8)

In [43]:
#################LogisticRegression Classifier
# Logistic regression using the liblinear solver with C=10.0; random_state fixed for repeatability.
LogisticRegression_model = LogisticRegression(solver='liblinear', C=10.0, random_state=0).fit(X_train,y_train)

In [44]:
# Predicted class codes for the held-out test features.
LogisticRegression_prediction = LogisticRegression_model.predict(X_test)

In [45]:
# Display the logistic-regression predictions as class codes.
LogisticRegression_prediction

array([3, 3, 3, 3, 1, 3, 3, 1, 0, 1, 3, 2, 1, 1, 1, 3, 1, 1, 1, 3, 2, 0,
       3, 2, 3, 0, 2, 1, 3, 3, 0, 2, 0, 0, 2, 2, 0, 1, 1, 2, 0, 1, 3, 2,
       3, 3, 2, 3, 2, 0, 1, 1, 3, 0, 1, 1, 1, 1, 0, 0, 1, 3, 3, 1, 1, 1,
       3, 0, 0, 3, 1, 1, 1, 0, 2, 1, 1, 2, 3, 0, 3, 2, 0, 2, 1, 2, 1, 1,
       2, 2, 1, 1, 2, 2, 2, 0, 1, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 2, 3,
       0, 0, 1, 1, 1, 2, 3, 2, 3, 1, 0, 3, 3, 0, 2, 1, 3, 1, 2, 2, 3, 0,
       2, 1, 0, 1, 1, 1, 3, 2, 2, 0, 1, 0, 3, 3, 2, 3, 2, 1, 1, 3, 3, 0,
       3, 3, 2, 1, 3, 3, 1, 1, 1, 0, 0, 1, 3, 3, 3, 0, 3, 1, 1, 0, 2, 0,
       1, 1, 2, 1, 1, 0, 3, 3, 2, 2, 0, 1, 0, 0, 1, 3, 2, 1, 0, 3, 3, 1,
       3, 0, 3, 3, 1, 0, 3, 1, 2, 2, 0, 1, 2, 3, 0, 1, 0, 3, 3, 1, 1, 2,
       3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 2, 3, 1, 2, 0, 2, 3, 2, 0, 1, 0, 2,
       2, 3, 1, 3, 0, 3, 1, 3, 2, 2, 3, 2, 1, 0, 3, 0, 1, 0, 0, 0, 0, 1,
       2, 3, 3, 3, 2, 3, 2, 2, 1, 3, 1, 3, 0, 3, 0, 1, 0, 2, 0, 1],
      dtype=int8)

In [38]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Macro-averaged F1 on the test split, one line per classifier.
f1_report = [
    ('naive_bayes f1_score: {}', naive_bayes_prediction),
    ('KNN f1_score: {}', KNN_prediction),
    ('SVC f1_score: {}', SVC_prediction),
    ('Decision tree f1_score: {}', decisionTree_prediction),
    ('Random_forest f1_score: {}', random_forest_prediction),
    ('LogisticRegression f1_score: {}', LogisticRegression_prediction),
]
for line_template, predicted_codes in f1_report:
    macro_f1 = f1_score(y_test, predicted_codes, average='macro')
    print(line_template.format(macro_f1))

naive_bayes f1_score: 0.7639912981887098
KNN f1_score: 0.7473816977209109
SVC f1_score: 0.7499168442801605
Decision tree f1_score: 0.5822290911830412
Random_forest f1_score: 0.7042185370638033
LogisticRegression f1_score: 0.7615167201836806


In [39]:
# Plain accuracy on the test split, one line per classifier.
accuracy_report = [
    ('naive_bayes accuracy: {}', naive_bayes_prediction),
    ('KNN accuracy: {}', KNN_prediction),
    ('SVC accuracy: {}', SVC_prediction),
    ('DecisionTree accuracy: {}', decisionTree_prediction),
    ('Random_forest accuracy: {}', random_forest_prediction),
    ('LogisticRegression accuracy: {}', LogisticRegression_prediction),
]
for line_template, predicted_codes in accuracy_report:
    test_accuracy = accuracy_score(y_test, predicted_codes)
    print(line_template.format(test_accuracy))

naive_bayes accuracy: 0.7711267605633803
KNN accuracy: 0.75
SVC accuracy: 0.7535211267605634
DecisionTree accuracy: 0.5845070422535211
Random_forest accuracy: 0.7112676056338029
LogisticRegression accuracy: 0.7640845070422535


In [41]:
# Macro-averaged recall on the test split, one line per classifier.
recall_report = [
    ('naive_bayes recall_score: {}', naive_bayes_prediction),
    ('KNN recall_score: {}', KNN_prediction),
    ('SVC recall_score: {}', SVC_prediction),
    ('Decision tree recall_score: {}', decisionTree_prediction),
    ('Random_forest recall_score: {}', random_forest_prediction),
    ('LogisticRegression recall_score: {}', LogisticRegression_prediction),
]
for line_template, predicted_codes in recall_report:
    macro_recall = recall_score(y_test, predicted_codes, average='macro')
    print(line_template.format(macro_recall))

naive_bayes recall_score: 0.7679563492063493
KNN recall_score: 0.7471289146289146
SVC recall_score: 0.7515787215787216
Decision tree recall_score: 0.5822361647361647
Random_forest recall_score: 0.705725546975547
LogisticRegression recall_score: 0.7617771342771342


In [42]:
# Macro-averaged precision on the test split, one line per classifier.
precision_report = [
    ('naive_bayes precision_score: {}', naive_bayes_prediction),
    ('KNN precision_score: {}', KNN_prediction),
    ('SVC precision_score: {}', SVC_prediction),
    ('Decision tree precision_score: {}', decisionTree_prediction),
    ('Random_forest precision_score: {}', random_forest_prediction),
    ('LogisticRegression precision_score: {}', LogisticRegression_prediction),
]
for line_template, predicted_codes in precision_report:
    macro_precision = precision_score(y_test, predicted_codes, average='macro')
    print(line_template.format(macro_precision))

naive_bayes precision_score: 0.8054501094871154
KNN precision_score: 0.7545627285539974
SVC precision_score: 0.7627630967390653
Decision tree precision_score: 0.5824324324324325
Random_forest precision_score: 0.719736732185266
LogisticRegression precision_score: 0.7671847979634299


In [43]:
# Confusion matrix of the Naive Bayes predictions against the true test labels.
confusion_matrix(y_test, naive_bayes_prediction)

array([[39, 20,  0, 13],
       [ 0, 74,  0,  0],
       [ 0, 16, 46,  1],
       [ 2,  1, 12, 60]])

In [44]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters. random_state is pinned, as in
# the decision-tree and logistic-regression cells, so a re-run reproduces the same model.
model = GradientBoostingClassifier(random_state=0)

In [45]:
# Fit the gradient boosting model on the training data.
model.fit(X_train, y_train)

GradientBoostingClassifier()

In [46]:
# Predicted class codes from the gradient boosting model for the held-out test features.
GBC_prediction = model.predict(X_test)

In [47]:
# Macro-averaged F1 of the gradient boosting predictions on the test split.
print('GradientBoostingClassifier f1_score: {}'.format(f1_score(y_test,GBC_prediction, average='macro')))

GradientBoostingClassifier f1_score: 0.7219646247264613


In [48]:
# Accuracy of the gradient boosting predictions on the test split.
print('GradientBoostingClassifier accuracy: {}'.format(accuracy_score(y_test, GBC_prediction)))

GradientBoostingClassifier accuracy: 0.7253521126760564


In [49]:
from xgboost import XGBClassifier

In [50]:
# XGBoost classifier with default hyper-parameters.
# NOTE: this rebinds `model`, so the gradient boosting estimator fitted earlier is no
# longer reachable under that name after this cell runs.
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [50]:
# Predicted class codes from the XGBoost model for the held-out test features.
XGB_prediction = model.predict(X_test)

In [51]:
# Macro F1 and accuracy of the XGBoost predictions on the test split.
# Both lines score XGB_prediction; the accuracy line previously scored GBC_prediction
# and therefore reported the gradient boosting model's accuracy under the XGB label.
print('XGBClassifier f1_score: {}'.format(f1_score(y_test,XGB_prediction, average='macro')))
print('XGBClassifier accuracy: {}'.format(accuracy_score(y_test, XGB_prediction)))

XGBClassifier f1_score: 0.7172961992497626
XGBClassifier accuracy: 0.7183098591549296


In [53]:
import pickle
# Persist the fitted KNN model. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# The 'models/' directory must already exist; open() does not create it.
with open("models/KNN_model.pkl", "wb") as pickle_out:
    pickle.dump(KNN_model, pickle_out)

In [143]:
import pickle
# saving models
# Each fitted object is written to models/<name>.pickle with the highest pickle
# protocol. The `with` block closes every file, so no explicit close() is needed.
# The 'models/' directory must already exist; open() does not create it.
fitted_artifacts = [
    ('NB_model', NB_model),
    ('SVC_model', SVC_model),
    ('KNN_model', KNN_model),
    ('DecisionTree_model', DecisionTree_model),
    ('RandomForest_model', RandomForest_model),
    ('LogisticRegression_model', LogisticRegression_model),
    ('vectorizer', vectorizer),
]
for artifact_name, fitted_object in fitted_artifacts:
    with open('models/{}.pickle'.format(artifact_name), 'wb') as artifact_handle:
        pickle.dump(fitted_object, artifact_handle, protocol=pickle.HIGHEST_PROTOCOL)


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy illustration of how TfidfVectorizer encodes an unseen sentence.
# It uses its own `demo_vectorizer` name so the corpus-fitted `vectorizer`
# (the one that was pickled and is used again further down) is not overwritten.
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
test_text = 'the fox is bad'
demo_vectorizer = TfidfVectorizer()
demo_vectorizer.fit(text)

# encode document
vector = demo_vectorizer.transform([test_text])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 8)
[[0.         0.         0.78980693 0.         0.         0.
  0.         0.61335554]]


In [170]:
# Reload the TF-IDF vectorizer that was saved to disk earlier.
# The `with` block closes the file, so no explicit close() is needed.
# NOTE: pickle.load can execute arbitrary code; only load files you produced or trust.
with open('models/vectorizer.pickle', 'rb') as vectorizerhandle:
    classical_vectorizer = pickle.load(vectorizerhandle)

In [172]:
# Round-trip check: transform a known document with the vectorizer reloaded from disk,
# rather than with whatever object the name `vectorizer` is currently bound to.
classical_vectorizer.transform([corpus[0]]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [57]:
# NOTE(review): `max_index` is not assigned anywhere in the visible notebook; on a fresh
# kernel this cell raises NameError unless it is defined in a part not shown here.
# Confirm where it comes from, or remove the cell.
max_index

0