# Text classification

Based on a single text column, we want to classify each document into a single class

Here we use CNN article dataset to classify each document into correct Category.

Our main feature is the Description column. We have split dataset into training and validations sets in `prepare-dataset.ipynb` notebook (you can also run `prepare_dataset.py`)

In [1]:
# essential modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# download tokenizers and stopwords

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/wflis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/wflis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Simpler approaches with bag of words or tf-idf

In [2]:
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("data/training_set.csv")

df['kfold'] = -1

# the next step is to randomize the rows of the data
df = df.sample(frac=1).reset_index(drop=True)

# fetch labels
y = df['Category'].values

# initiate the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

## bag of words + logistic regression

In [3]:
# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = linear_model.LogisticRegression(max_iter=200, n_jobs=-1)
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8742331288343558

Fold: 1
Accuracy = 0.8803680981595092

Fold: 2
Accuracy = 0.8788343558282209

Fold: 3
Accuracy = 0.8619631901840491

Fold: 4
Accuracy = 0.8650306748466258



## TfidfVectorizer + logistic regression

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = linear_model.LogisticRegression(max_iter=200, n_jobs=-1)
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8819018404907976

Fold: 1
Accuracy = 0.8773006134969326

Fold: 2
Accuracy = 0.8650306748466258

Fold: 3
Accuracy = 0.8604294478527608

Fold: 4
Accuracy = 0.848159509202454



## bag of words + SVM

In [5]:
from sklearn import svm

# we go over the folds created
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = svm.LinearSVC()
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8619631901840491

Fold: 1
Accuracy = 0.8788343558282209

Fold: 2
Accuracy = 0.8542944785276073

Fold: 3
Accuracy = 0.8619631901840491

Fold: 4
Accuracy = 0.8680981595092024



## tfidfVectorizer + SVM

In [6]:
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = svm.LinearSVC()
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8834355828220859

Fold: 1
Accuracy = 0.8880368098159509

Fold: 2
Accuracy = 0.8819018404907976

Fold: 3
Accuracy = 0.8696319018404908

Fold: 4
Accuracy = 0.8773006134969326



## bag of words + multinomial naive bayes

In [7]:
from sklearn import naive_bayes

for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = naive_bayes.MultinomialNB()
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8803680981595092

Fold: 1
Accuracy = 0.8941717791411042

Fold: 2
Accuracy = 0.8819018404907976

Fold: 3
Accuracy = 0.8680981595092024

Fold: 4
Accuracy = 0.8742331288343558



## tf-idf + multinomial naive bayes

In [8]:
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = naive_bayes.MultinomialNB()
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8696319018404908

Fold: 1
Accuracy = 0.8665644171779141

Fold: 2
Accuracy = 0.852760736196319

Fold: 3
Accuracy = 0.8588957055214724

Fold: 4
Accuracy = 0.848159509202454



# OneVsRestClassifier

In [9]:
from sklearn.multiclass import OneVsRestClassifier

for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = OneVsRestClassifier(linear_model.LogisticRegression(max_iter=200, n_jobs=-1))
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8742331288343558

Fold: 1
Accuracy = 0.8834355828220859

Fold: 2
Accuracy = 0.8803680981595092

Fold: 3
Accuracy = 0.8619631901840491

Fold: 4
Accuracy = 0.8588957055214724



In [10]:
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = OneVsRestClassifier(linear_model.LogisticRegression(max_iter=200, n_jobs=-1))
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

Fold: 0
Accuracy = 0.8757668711656442

Fold: 1
Accuracy = 0.8711656441717791

Fold: 2
Accuracy = 0.8496932515337423

Fold: 3
Accuracy = 0.8604294478527608

Fold: 4
Accuracy = 0.8420245398773006



## bag of words + xgboost

In [11]:
from xgboost import XGBClassifier

for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    count_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = count_vec.transform(train_df["Description"])
    xtest = count_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = XGBClassifier(eval_metric='auc')
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

ModuleNotFoundError: No module named 'xgboost'

## tfidfVectorizer + xgboost

In [None]:
for fold_ in range(5):
    # temporary dataframes for train and test
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    
    # initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
    tfidf_vec = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None
    )
    # fit count_vec on training data reviews
    tfidf_vec.fit(train_df["Description"])
    # transform training and validation data reviews
    xtrain = tfidf_vec.transform(train_df["Description"])
    xtest = tfidf_vec.transform(test_df["Description"])
    # initialize logistic regression model
    model = XGBClassifier(eval_metric='auc')
    # fit the model on training data reviews and sentiment
    model.fit(xtrain, train_df["Category"])
    # make predictions on test data
    # threshold for predictions is 0.5
    preds = model.predict(xtest)
    # calculate accuracy
    accuracy = metrics.accuracy_score(test_df["Category"], preds)
    print(f"Fold: {fold_}")
    print(f"Accuracy = {accuracy}")
    print("")

## Best approaches out of these

- tf-idf + SVM
- bag of word + multinomial naive bayes
- logistic regression

# Approaches with text vectorization

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

X_train = df['Description']
y_train = df['Category']

# we need to preprocess text: split into words, remove punctuations and stopwords

transformations = [
    lambda x: x.lower(),
    lambda x: x.split(),
    lambda x: [ word for word in x if word.isalpha() ],
    lambda x: [ word for word in x if word not in stop_words ]
]

X_transformed = X_train

for t in transformations:
    X_transformed = X_transformed.apply(t)
                   
X_transformed

0       [simone, biles, withdraw, team, gymnastic, fin...
1       [one, extraordinary, scenes, nba, brooklyn, ne...
2       [valentino, one, greatest, charismatic, motorc...
3       [erratically, driven, car, threatened, lives, ...
4       [us, star, sprinter, richardson, left, roster,...
                              ...                        
3255    [spate, murders, feminist, djs, creating, safe...
3256    [band, golden, black, shared, major, wins, irr...
3257    [eileen, gu, kamila, valieva, became, teenage,...
3258    [duchess, brought, litigation, associated, new...
3259    [french, league, game, lyon, marseille, abando...
Name: Description, Length: 3260, dtype: object