### Importing data

In [1]:
import os
import re

import numpy as np
import pandas as pd
import spacy

# Load spaCy's large English pipeline once. The text-processing code in this notebook
# calls this shared `nlp` object on every document and reads each token's stop-word
# flag and lemma.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Location of the BBC news CSV. The previously hard-coded path is kept as the default so
# existing runs behave exactly as before; set the BBC_DATA_PATH environment variable to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get('BBC_DATA_PATH', 'D:/Study Material/NLP/Text_Classification/bbc_data.csv')

data = pd.read_csv(DATA_PATH)
data.head(3)

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment


### Splitting data into train and validation

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Shuffled 80/20 split, stratified on the label column so that both subsets keep the
# same class proportions.
train, valid = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=data['labels'],
)

# Features are every column except 'labels'; targets are every column except 'data'.
X_train, Y_train = train.drop(columns='labels'), train.drop(columns='data')
X_valid, Y_valid = valid.drop(columns='labels'), valid.drop(columns='data')

In [5]:
# Show the (rows, columns) shape of the training and validation feature frames,
# one per line.
print(X_train.shape, X_valid.shape, sep='\n')

(1780, 1)
(445, 1)


### Encoding labels

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the category names in the 'labels' column into integer class ids.
# NOTE: the variable name is misspelled ("lable") but is referenced by later cells, so it
# is left unchanged here.
lable_encoder = LabelEncoder()

In [7]:
# Fit the encoder on the training labels and encode them.
# Read the labels from the untouched `train` frame rather than from `Y_train` itself:
# this cell replaces the Y_train DataFrame with an array, so indexing Y_train['labels']
# would fail if the cell were run a second time.
Y_train = lable_encoder.fit_transform(train['labels'])

In [8]:
# Encode the validation labels with the mapping learned from the training labels
# (transform only, no refit). Reading from `valid` instead of `Y_valid` keeps the cell
# re-runnable, because this assignment replaces the Y_valid DataFrame with an array.
Y_valid = lable_encoder.transform(valid['labels'])

### Text Pre-processing and building models

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
## Defining a transformer class that handles the text pre-processing step of the pipeline

class text_preprocessing(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that cleans and lemmatizes raw documents.

    Every document is lower-cased, stripped of all characters other than ASCII
    letters and whitespace, whitespace-normalized, and passed through the
    module-level spaCy pipeline ``nlp``. Stop words are discarded and the
    remaining tokens are replaced by their lemmas.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _clean(document):
        """Lower-case ``document``, keep only letters, and normalize whitespace."""
        letters_only = re.sub(r'[^a-zA-Z\s]', '', document.lower())
        # Collapse every whitespace run (single newlines and tabs included) and trim
        # *after* the character filter. Trimming before it left stray whitespace
        # wherever leading or trailing digits/punctuation had been removed.
        return re.sub(r'\s+', ' ', letters_only).strip()

    def transform(self, X, y=None):
        """Convert each document in ``X`` into a cleaned, space-joined lemma string.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        list of str
            One string per input document, in the same order, made of the lemmas
            of its non-stop-word tokens joined by single spaces.
        """
        processed_documents = []
        for document in X:
            lemmas = [token.lemma_
                      for token in nlp(self._clean(document))
                      # is_space guards against whitespace-only tokens being emitted.
                      if not (token.is_stop or token.is_space)]
            processed_documents.append(' '.join(lemmas))

        return processed_documents

In [11]:
## building pipeline for text pre-processing and model building

# Steps: clean/lemmatize the raw text -> TF-IDF features -> random forest.
# random_state pins the forest so the accuracies reported below can be reproduced from a
# fresh kernel, and n_jobs=-1 builds the trees on all available cores; both settings
# match the Word2Vec pipeline defined later in the notebook.
text_clf = Pipeline([('text_processing', text_preprocessing()),
                     ('text_2_vector', TfidfVectorizer()),
                     ('classifier', RandomForestClassifier(n_jobs=-1, random_state=42))])
text_clf.fit(X_train['data'], Y_train)

### Making predictions 

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
## prediction on training data
trn_pred = text_clf.predict(X_train['data'])
# accuracy_score is documented as accuracy_score(y_true, y_pred). The printed value is
# the same in either order, but passing the true labels first matches the API and the
# Word2Vec evaluation cells.
print(f'Accuracy for text classification on training data: {accuracy_score(Y_train, trn_pred)}')

Accuracy for text classification on training data: 1.0


In [14]:
## prediction on validation data
valid_prd = text_clf.predict(X_valid['data'])
# accuracy_score is documented as accuracy_score(y_true, y_pred). The printed value is
# the same in either order, but passing the true labels first matches the API and the
# Word2Vec evaluation cells.
print(f'Accuracy for text classification on validation data: {accuracy_score(Y_valid, valid_prd)}')

Accuracy for text classification on validation data: 0.9685393258426966


### Using Word2Vec embeddings trained with gensim

In [15]:
## Defining a class for text pre-processing when building the Word2Vec model

class w2v_text_processing(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that tokenizes documents for Word2Vec.

    Every document is lower-cased, stripped of all characters other than ASCII
    letters and whitespace, whitespace-normalized, and passed through the
    module-level spaCy pipeline ``nlp``. Stop words are discarded and the
    remaining tokens are replaced by their lemmas. Unlike a string-producing
    pre-processor, the result for each document is a list of tokens.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _clean(document):
        """Lower-case ``document``, keep only letters, and normalize whitespace."""
        letters_only = re.sub(r'[^a-zA-Z\s]', '', document.lower())
        # Collapse every whitespace run (single newlines and tabs included) and trim
        # *after* the character filter, so that no whitespace-only text reaches spaCy
        # and ends up as a token.
        return re.sub(r'\s+', ' ', letters_only).strip()

    def transform(self, X, y=None):
        """Convert each document in ``X`` into a list of lemmas.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        list of list of str
            One token list per input document, in the same order, holding the
            lemmas of its non-stop-word tokens.
        """
        tokenized_documents = []
        for document in X:
            lemmas = [token.lemma_
                      for token in nlp(self._clean(document))
                      # is_space guards against whitespace-only tokens being emitted.
                      if not (token.is_stop or token.is_space)]
            tokenized_documents.append(lemmas)

        return tokenized_documents

In [16]:
from gensim.models import Word2Vec

## function definition for text processing
def word_processing(text):
    """Tokenize raw documents into lemma lists suitable for training Word2Vec.

    Each document is lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, whitespace-normalized, and passed through the
    module-level spaCy pipeline ``nlp``. Stop words are dropped and the
    remaining tokens are replaced by their lemmas.

    Parameters
    ----------
    text : iterable of str
        Raw documents.

    Returns
    -------
    list of list of str
        One token list per input document, in the same order.
    """
    tokenized_documents = []

    for document in text:
        letters_only = re.sub(r'[^a-zA-Z\s]', '', document.lower())
        # Collapse every whitespace run (single newlines and tabs included) and trim
        # *after* the character filter; otherwise leftover whitespace is tokenized and
        # ends up as junk entries in the token lists.
        normalized = re.sub(r'\s+', ' ', letters_only).strip()

        tokenized_documents.append([token.lemma_
                                    for token in nlp(normalized)
                                    # is_space guards against whitespace-only tokens.
                                    if not (token.is_stop or token.is_space)])

    return tokenized_documents

## tokenized training documents (one list of lemmas per document) for word embedding
processed_text = word_processing(X_train['data'])

## training word embedding model on our custom data
w2v_model = Word2Vec(
    sentences=processed_text,
    vector_size=100,  # dimensionality of each word vector
    window=3,         # context window size used during training
    min_count=2,      # minimum word frequency for a word to be kept
)

In [17]:
## Defining a class for sentence vectorization (averaging the Word2Vec vectors of a document's words)

class sentence_vectorization(BaseEstimator, TransformerMixin):
    """Turn tokenized documents into fixed-length vectors by averaging word vectors.

    Word vectors are looked up in the module-level gensim model ``w2v_model``.
    A word that is missing from the model contributes a zero vector to the
    average.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Average the word vectors of every tokenized document in ``X``.

        Parameters
        ----------
        X : iterable of sequence of str
            Tokenized documents (one sequence of words per document).
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        list of numpy.ndarray
            One vector of length ``w2v_model.vector_size`` per document, in the
            same order. A document with no tokens maps to the zero vector.
        """
        keyed_vectors = w2v_model.wv
        embedding_size = w2v_model.vector_size

        document_vectors = []
        for tokens in X:
            word_vectors = [keyed_vectors[word] if word in keyed_vectors
                            else np.zeros(embedding_size)
                            for word in tokens]

            if word_vectors:
                document_vectors.append(np.mean(word_vectors, axis=0))
            else:
                # np.mean over an empty list yields a scalar NaN (with a RuntimeWarning),
                # which would make the feature rows ragged and break the classifier.
                # Use a zero vector for a document with no tokens instead.
                document_vectors.append(np.zeros(embedding_size))

        return document_vectors

### Model building

In [18]:
## Pipeline for text classification using word embedding
# Steps: tokenize/lemmatize the raw text -> average the word vectors of each document
# -> random forest classifier.
w2v_text_clf = Pipeline(steps=[
    ('text_processing', w2v_text_processing()),
    ('sentence_vectorization', sentence_vectorization()),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1)),
])
w2v_text_clf.fit(X_train['data'], Y_train)

### Making Predictions/Text Classification

In [19]:
## Prediction on training data
# X_pred holds the predicted class ids for the training documents.
X_pred = w2v_text_clf.predict(X_train['data'])
w2v_train_accuracy = accuracy_score(Y_train, X_pred)
print(f'Accuracy on training data using Word2Vec: {w2v_train_accuracy}')

Accuracy on training data using Word2Vec: 1.0


In [20]:
## Prediction on validation data
valid_pred = w2v_text_clf.predict(X_valid['data'])
w2v_valid_accuracy = accuracy_score(Y_valid, valid_pred)
print(f'Accuracy on validation data using Word2Vec: {w2v_valid_accuracy}')

Accuracy on validation data using Word2Vec: 0.9393258426966292


#### To be continued: add encoding/decoding of the target labels to the pipeline, and load a custom pre-trained word embedding model