In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('training_data.csv')

### Classification with TFIDF and SVM

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
text = df['clean_text']
y = df['target']

In [5]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(text)

In [6]:
# split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# svm model
svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [8]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       869
           1       0.82      0.72      0.77       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523



### Classification with Word Embedding and Deep Learning

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam

In [10]:
token = Tokenizer()
token.fit_on_texts(text)

In [11]:
vocab_size = len(token.word_index) + 1
vocab_size

22581

In [None]:
print(token.word_index)

In [13]:
encoded_text = token.texts_to_sequences(text)

In [14]:
len(encoded_text)

7613

In [15]:
max_length = 50
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [16]:
print(X)

[[ 109 4491   22 ...    0    0    0]
 [ 186   43  223 ...    0    0    0]
 [  41 1682 1435 ...    0    0    0]
 ...
 [2709 2295 4483 ...    0    0    0]
 [  75 1092   39 ...    0    0    0]
 [   2  206   55 ...    0    0    0]]


In [17]:
X.shape

(7613, 50)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
# sequential model
vec_size = 100
model = Sequential()
model.add(Embedding(vocab_size, vec_size, input_length=max_length))

model.add(Conv1D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(16, activation='relu'))

model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           2258100   
                                                                 
 conv1d (Conv1D)             (None, 49, 32)            6432      
                                                                 
 max_pooling1d (MaxPooling1  (None, 24, 32)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 24, 32)            0         
                                                                 
 dense (Dense)               (None, 24, 32)            1056      
                                                                 
 dropout_1 (Dropout)         (None, 24, 32)            0         
                                                        

In [21]:
%%time
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29.9 s, sys: 1.34 s, total: 31.3 s
Wall time: 58.6 s


<keras.src.callbacks.History at 0x7d0f178d13c0>

In [22]:
# prediction
x = 'it is so hot today' # example text
x = token.texts_to_sequences([x])
x = pad_sequences(x, maxlen=max_length, padding='post') # encoded text
print(x)

y_pred = np.argmax(model.predict(x), axis=-1)
print('predicted label:', y_pred)

[[ 13   9  32 188 119   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
predicted label: [0]


In [23]:
# prediction
x = 'i met you today by accident' # example text
x = token.texts_to_sequences([x])
x = pad_sequences(x, maxlen=max_length, padding='post') # encoded text
print(x)

y_pred = np.argmax(model.predict(x), axis=-1)
print('predicted label:', y_pred)

[[   7 6538   10  119   19  128    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]
predicted label: [0]


In [24]:
# prediction
x = 'i got car accident today and i am injured' # example text
x = token.texts_to_sequences([x])
x = pad_sequences(x, maxlen=max_length, padding='post') # encoded text
print(x)

y_pred = np.argmax(model.predict(x), axis=-1)
print('predicted label:', y_pred)

[[  7  93 126 128 119   8   7  31 251   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
predicted label: [0]


### Classification with BERT

ktrain (https://pypi.org/project/ktrain/0.1.6/) is a lightweight wrapper for the deep learning library Keras to help build, train, and deploy neural networks.

In [25]:
import ktrain
from ktrain import text

In [26]:
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=df, text_column='text',
                                                                   label_columns='target', maxlen=max_length, preprocess_mode='bert')

['not_target', 'target']
      not_target  target
6299         1.0     0.0
7576         1.0     0.0
32           1.0     0.0
1724         1.0     0.0
1578         1.0     0.0
['not_target', 'target']
      not_target  target
1532         1.0     0.0
2914         1.0     0.0
7121         0.0     1.0
4183         1.0     0.0
654          0.0     1.0
preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [None]:
model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preproc)

In [28]:
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train), val_data=(X_test, y_test), batch_size=64)

In [29]:
learner.fit_onecycle(lr = 2e-4, epochs=3)



begin training using onecycle policy with max lr of 0.0002...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d0d9d772350>

In [30]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [31]:
data = ['i met you today by accident', 'i got car accident today and i am injured']
predictor.predict(data, return_proba=True)

array([[0.92572623, 0.07427373],
       [0.02427271, 0.9757273 ]], dtype=float32)

In [32]:
for i in data:
    pred = predictor.predict(i)
    print('text:', i)
    print('predicted class:', predictor.get_classes().index(pred))
    print('---------')

text: i met you today by accident
predicted class: 0
---------
text: i got car accident today and i am injured
predicted class: 1
---------


In [33]:
predictor.predict(data[0])

'not_target'

In [34]:
predictor.predict(data[1])

'target'