In [183]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import re

#### 1 - Negative for cancer
#### 0 - Postive for cancer 

In [184]:
# Read the raw dataset
raw_train_data = pd.read_csv('../data/training_data.csv', index_col=['SrNo'])

In [185]:
# data distribution
raw_train_data.groupby(['Category']).count()

Unnamed: 0_level_0,Text
Category,Unnamed: 1_level_1
0,192
1,160


In [186]:
raw_train_data.head(5)

Unnamed: 0_level_0,Text,Category
SrNo,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Negative for carcinoma and high grade dysplasia,1
2,The surgical margins are negative for carcinoma.,1
3,PROXIMAL MARGIN OF RESECTION: NEGATIVE FOR CA...,1
4,DISTAL MARGIN OF RESECTION: NEGATIVE FOR CARC...,1
5,CIRCUMFERENTIAL MARGIN OF RESECTION: NEGATIVE...,1


In [187]:
# shuffle data frame
import sklearn.utils
raw_train_data = sklearn.utils.shuffle(raw_train_data)

In [188]:
raw_train_data.head()

Unnamed: 0_level_0,Text,Category
SrNo,Unnamed: 1_level_1,Unnamed: 2_level_1
319,The tumor cells are cuboidal to columnar with...,0
347,RIGHT BREAST 8 O'CLOCK: INVASIVE DUCTAL ADENO...,0
128,"Hepatic artery lymph node, biopsy: Benign lym...",1
45,D: Histologic sections demonstrate lymphoid t...,1
208,Histologic sections demonstrate multiple frag...,0


### Preprocessing

In [189]:
def preprocessing(data_frame):
    # Lower case all the sentences
    data_frame['Text'] = data_frame['Text'].str.lower()

    # replace `:`
    data_frame['Text'] = data_frame['Text'].apply(lambda x: x.replace(':', ' '))

    # replace *
    data_frame['Text'] = data_frame['Text'].apply(lambda x: x.replace('*', ' '))

    # replace numeric bullet points
    data_frame['Text'] = data_frame['Text'].apply(lambda x: re.sub('([0-9A-Za-z]+[.:)]|•)\s+', "", x))

    # remove the double spaces
    data_frame['Text'] = data_frame['Text'].apply(lambda x: re.sub('\s\s+', " ", x.strip()))

    return data_frame

In [190]:
clean_train_data = preprocessing(raw_train_data)
clean_train_data.head()

Unnamed: 0_level_0,Text,Category
SrNo,Unnamed: 1_level_1,Unnamed: 2_level_1
319,the tumor cells are cuboidal to columnar with ...,0
347,right breast 8 o'clock invasive ductal adenoca...,0
128,"hepatic artery lymph node, biopsy benign lymph...",1
45,d histologic sections demonstrate lymphoid tis...,1
208,histologic sections demonstrate multiple fragm...,0


In [191]:
x_train, y_train = list(clean_train_data['Text']), list(clean_train_data['Category'])

In [192]:
list(x_train)

['the tumor cells are cuboidal to columnar with eosinophilic cytoplasm and prominent nucleoli and are positive for cytokeratin 7, dpc4 and ca19-9, with patchy positivity for heppar-1 and arginase 1.',
 "right breast 8 o'clock invasive ductal adenocarcinoma, elston grade i/iii.",
 'hepatic artery lymph node, biopsy benign lymph node (with lipogranuloma; negative for carcinoma.',
 'd histologic sections demonstrate lymphoid tissue containing sinus histiocytosis without evidence of metastatic carcinoma.',
 'histologic sections demonstrate multiple fragments of prostate tissue involved by high grade adenocarcinoma showing immunohistochemical staining for psa.',
 'rectosigmoid colon, mass, biopsy invasive colonic adenocarcinoma, moderately differentiated',
 'bladder neck negative for carcinoma.',
 'right pelvic lymph node one lymph node negative for metastatic carcinoma.',
 'the findings are consistent with a focal area of invasive adenocarcinoma arising within the tubulovillous adenoma sta

### Feature Extraction

In [193]:
# Step-1: Feature generation
vectorizer = CountVectorizer(analyzer='word', stop_words=['the', 'and', 'is', ''],)
x_train = vectorizer.fit_transform(x_train)

### Model Training

In [194]:
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Model Testing

In [195]:
# Read the raw dataset
raw_test_data = pd.read_csv('../data/test_data.csv', index_col=['SrNo'])

In [196]:
clean_test_data = preprocessing(raw_test_data)
clean_test_data.head()

Unnamed: 0_level_0,Text,Category
SrNo,Unnamed: 1_level_1,Unnamed: 2_level_1
1,surgical margin status proximal margin negativ...,1
2,proximal and distal resection margins appear n...,1
3,twelve lymph nodes negative for metastatic ade...,1
4,surgical margin status proximal margin negativ...,1
5,distal margin negative for carcinoma.,1


In [197]:
x_test, actual_result = list(clean_test_data['Text']), list(clean_test_data['Category'])

In [198]:
x_test = vectorizer.transform(x_test)
prediction = model.predict(x_test)
print(prediction)

[1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [181]:
print(list(actual_result))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [201]:
from sklearn.metrics import confusion_matrix
results = confusion_matrix(actual_result, prediction)
results

array([[29,  1],
       [ 5, 34]], dtype=int64)

In [204]:
from sklearn.metrics import classification_report
print(classification_report(actual_result, prediction, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.85      0.97      0.91        30
         Yes       0.97      0.87      0.92        39

    accuracy                           0.91        69
   macro avg       0.91      0.92      0.91        69
weighted avg       0.92      0.91      0.91        69

