# Text classification

Based on a single text column, we want to classify each document into a single class

Here we use CNN article dataset to classify each document into correct Category.

Our main feature is the Description column. We have split dataset into training and validations sets in `prepare-dataset.ipynb` notebook (you can also run `prepare_dataset.py`)

In [8]:
# essential modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# download tokenizers and stopwords

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/wiktor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/wiktor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("data/training_set.csv")

df['kfold'] = -1

# the next step is to randomize the rows of the data
df = df.sample(frac=1).reset_index(drop=True)

# fetch labels
y = df['Category'].values

# initiate the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f
    
# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = linear_model.LogisticRegression(max_iter=200, n_jobs=-1)
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8696319018404908

Fold: 1
Accuracy = 0.8726993865030674

Fold: 2
Accuracy = 0.8742331288343558

Fold: 3
Accuracy = 0.8588957055214724

Fold: 4
Accuracy = 0.8865030674846626



In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = linear_model.LogisticRegression(max_iter=200, n_jobs=-1)
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8665644171779141

Fold: 1
Accuracy = 0.8788343558282209

Fold: 2
Accuracy = 0.8634969325153374

Fold: 3
Accuracy = 0.8573619631901841

Fold: 4
Accuracy = 0.8742331288343558



In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

X_train = df['Description']
y_train = df['Category']

# we need to preprocess text: split into words, remove punctuations and stopwords

transformations = [
    lambda x: x.lower(),
    lambda x: x.split(),
    lambda x: [ word for word in x if word.isalpha() ],
    lambda x: [ word for word in x if word not in stop_words ]
]

X_transformed = X_train

for t in transformations:
    X_transformed = X_transformed.apply(t)
                   
X_transformed

0       [jersey, belonging, michael, jordan, sophomore...
1       [born, survived, cancer, owned, worked, rice, ...
2       [smoke, severe, wildfires, burning, west, coas...
3       [motto, theme, bullish, barcelona, president, ...
4       [france, speed, declassification, secret, defe...
                              ...                        
3255    [new, zealand, weightlifter, laurel, hubbard, ...
3256    [turbulent, lionel, messi, atypically, slow, s...
3257    [making, history, victory, open, earlier, coll...
3258    [toddler, found, three, days, went, missing, p...
3259    [lionel, messi, made, paris, debut, coming, se...
Name: Description, Length: 3260, dtype: object

In [13]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization