In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [4]:
data = pd.read_csv('data/train.csv')

In [5]:
metaphor_words = {0:'road', 1:'candle', 2: 'light', 3: 'spice', 4: 'ride', 5: 'train', 6: 'boat'}

In [6]:
data.shape

(1870, 3)

In [7]:
data.describe

<bound method NDFrame.describe of       metaphorID  label_boolean  \
0              0           True   
1              2          False   
2              2          False   
3              4          False   
4              2          False   
...          ...            ...   
1865           4           True   
1866           4           True   
1867           0           True   
1868           5           True   
1869           2           True   

                                                   text  
0     Hey , Karen !!!! I was told that on the day of...  
1     Hi Ladies ... my last chemo was Feb 17/09 , ra...  
2     I have just come form my consult with a lovely...  
3     I also still question taking Tamox for stage 1...  
4     Just checking in to say hello ladies . I had a...  
...                                                 ...  
1865  Hi there . I found my lump 3 weeks ago and it ...  
1866  Robyn-Sorry you find yourself on this web site...  
1867  I 'm happy Jule t

In [8]:
data.head()

Unnamed: 0,metaphorID,label_boolean,text
0,0,True,"Hey , Karen !!!! I was told that on the day of..."
1,2,False,"Hi Ladies ... my last chemo was Feb 17/09 , ra..."
2,2,False,I have just come form my consult with a lovely...
3,4,False,I also still question taking Tamox for stage 1...
4,2,False,Just checking in to say hello ladies . I had a...


In [9]:
positive_label_data = data[data['label_boolean'] == True]
negative_label_data = data[data['label_boolean'] == False]

In [10]:
positive_label_data

Unnamed: 0,metaphorID,label_boolean,text
0,0,True,"Hey , Karen !!!! I was told that on the day of..."
5,2,True,How is everyone doing ? I 'm 10 weeks post op ...
7,0,True,I had a bilateral mastecomy in June followed b...
8,6,True,Just diagnosed late November . Stage I and wit...
9,0,True,"swimangel172 , With regard to my comments I di..."
...,...,...,...
1865,4,True,Hi there . I found my lump 3 weeks ago and it ...
1866,4,True,Robyn-Sorry you find yourself on this web site...
1867,0,True,I 'm happy Jule that you posted this question ...
1868,5,True,Hiya April RADs-I should probably have been he...


In [12]:
negative_label_data

Unnamed: 0,metaphorID,label_boolean,text
1,2,False,"Hi Ladies ... my last chemo was Feb 17/09 , ra..."
2,2,False,I have just come form my consult with a lovely...
3,4,False,I also still question taking Tamox for stage 1...
4,2,False,Just checking in to say hello ladies . I had a...
6,2,False,"Hello Jodi , It 's going to be OK . I just had..."
...,...,...,...
1826,0,False,My oncologist did n't make any reference to th...
1842,3,False,Hello All I just finished 4 rounds of AC and 1...
1856,4,False,"Good Morning Jewels , SheShe-We must have post..."
1861,2,False,"I had a mastectomy in Aug 09 , Stage2 , Grade ..."


In [13]:
vectorizer = CountVectorizer(stop_words='english', max_features=1000)

In [14]:
X = vectorizer.fit_transform(data['text'])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, data['label_boolean'], test_size=0.2, random_state=42)

In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.7967914438502673
Precision: 0.8284
Recall: 0.9127
F1 Score: 0.8685


In [19]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.8181818181818182
Precision: 0.8819
Recall: 0.8691
F1 Score: 0.8755


In [21]:
data['word'] = data['metaphorID'].map(metaphor_words)