<a href="https://colab.research.google.com/github/ShreyasKadiri/Natural-Language-Processing/blob/main/Text_Classification_using_spaCy_v3_0_transformers_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import all required libraries
import random
import re
import string
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm.auto import tqdm

In [2]:
# Emoji and pictographic symbols stripped by remove_emoji. Compiled once
# instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-styled emoji symbols
    "\U0000FE0F"             # emoji variation selector
    "\U0001F000-\U0001F251"  # tiles, playing cards and enclosed characters
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only symbol blocks are matched. An earlier version used the single range
    U+24C2..U+1F251, which also covers CJK ideographs, Hangul, kana and
    full-width letters and therefore deleted ordinary non-English text.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with every run of matched symbols deleted (nothing is
        inserted in their place).
    """
    return _EMOJI_PATTERN.sub('', text)

# http:// or https:// followed by a run of URL characters. Written as a raw
# string so that \( and \) reach the regex engine without Python treating
# them as (invalid) string escape sequences.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def remove_url(text):
    """Return ``text`` with every http/https URL removed.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The input with each matched URL deleted; surrounding whitespace is
        left untouched.
    """
    return _URL_PATTERN.sub('', text)



# Translation table that deletes every ASCII punctuation character.
# Built once rather than on each call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, min_word_len=4):
    """Strip punctuation, drop numeric and short tokens, and lower-case.

    Parameters
    ----------
    text : str
        Input string.
    min_word_len : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 4 keeps only tokens longer than three characters.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, in lower case. Tokens
        made up solely of digits are always discarded.
    """
    without_punct = text.translate(_PUNCTUATION_TABLE)
    kept = [
        word for word in without_punct.split()
        if not word.isdigit() and len(word) >= min_word_len
    ]
    return ' '.join(kept).lower()

In [3]:
def make_docs(file_path, nlp=None):
    """Load a labelled CSV, clean its text and turn every row into a spaCy Doc.

    The CSV is expected to contain a ``text`` column and a ``sentiment``
    column. Rows with any missing value and rows whose text has two words or
    fewer are discarded; the remaining text is passed through
    ``remove_emoji``, ``remove_url`` and ``clean_text`` before being processed.

    Parameters
    ----------
    file_path : str or path-like
        CSV file handed to ``pandas.read_csv``.
    nlp : spacy.language.Language, optional
        Pipeline used to build the docs. When omitted, ``en_core_web_trf`` is
        loaded; pass an already-loaded pipeline to avoid re-loading the model
        on every call.

    Returns
    -------
    tuple
        ``(docs, train_data)`` where ``docs`` is the list of processed docs
        with ``positive`` / ``negative`` / ``neutral`` set in ``doc.cats``,
        and ``train_data`` is the filtered, cleaned DataFrame (including the
        helper column ``Num_words_text``).
    """
    train_data = pd.read_csv(file_path).dropna(axis=0, how='any')
    train_data = train_data.assign(
        Num_words_text=train_data['text'].apply(lambda x: len(str(x).split()))
    )
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not trigger pandas' SettingWithCopyWarning.
    train_data = train_data[train_data['Num_words_text'] > 2].copy()
    print(train_data['sentiment'].value_counts())

    for cleaner in (remove_emoji, remove_url, clean_text):
        train_data['text'] = train_data['text'].apply(cleaner)

    data = tuple(zip(train_data['text'].tolist(), train_data['sentiment'].tolist()))
    # Print one sample as a sanity check; guarded so a file with fewer than
    # two usable rows does not raise IndexError.
    if len(data) > 1:
        print(data[1])

    if nlp is None:
        nlp = spacy.load("en_core_web_trf")

    docs = []
    # nlp.pipe over all texts is way faster than calling nlp(text) per text.
    # With as_tuples=True the first tuple item is treated as the text and the
    # second one (the label) is handed back unchanged next to the doc.
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        # set the (text)cat(egory) scores for each document
        doc.cats['positive'] = int(label == 'positive')
        doc.cats['negative'] = int(label == 'negative')
        # any label that is neither positive nor negative counts as neutral
        doc.cats['neutral'] = int(label not in ('positive', 'negative'))
        docs.append(doc)

    return docs, train_data

In [8]:
# Input CSVs and the folder receiving the binary corpora built from them.
# NOTE: the CSV locations are machine-specific absolute paths; adjust as needed.
TRAIN_CSV = "C:\\TweetSenitment\\train.csv"
TEST_CSV = "C:\\TweetSenitment\\test.csv"
CORPUS_DIR = Path("./textcat_data")

# DocBin.to_disk opens the target file directly, so the folder has to exist
# before anything is written into it.
CORPUS_DIR.mkdir(parents=True, exist_ok=True)

train_docs, train_data = make_docs(TRAIN_CSV)
# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk(CORPUS_DIR / "textcat_train.spacy")

# the test CSV becomes the validation (dev) corpus used during training
test_docs, test_data = make_docs(TEST_CSV)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk(CORPUS_DIR / "textcat_valid.spacy")

In [8]:
# Run spaCy's `init fill-config` command: it reads ./textcat_base_config.cfg
# and writes the filled-in config to ./textcat_config.cfg, which is the file
# passed to `spacy train` in the following cell.
!python -m spacy init fill-config ./textcat_base_config.cfg ./textcat_config.cfg

In [8]:
# Train the pipeline described by textcat_config.cfg. paths.train / paths.dev
# are overridden to point at the DocBin files saved earlier in this notebook,
# and the output goes to ./textcat_output (the evaluation cell below loads
# textcat_output/model-best from there).
!python -m spacy train textcat_config.cfg --verbose --output ./textcat_output --paths.train textcat_data/textcat_train.spacy --paths.dev textcat_data/textcat_valid.spacy

In [8]:
# Load the best checkpoint written by `spacy train` and spot-check it on a
# couple of rows from the test set.
nlp_textcat = spacy.load("textcat_output/model-best")
test_texts = test_data['text'].tolist()
test_cats = test_data['sentiment'].tolist()

# Row positions to inspect; positions beyond the end of the test set are
# skipped with a notice instead of raising IndexError.
SAMPLE_INDICES = (100, 1000)

shown = 0
for sample_idx in SAMPLE_INDICES:
    if sample_idx >= len(test_texts):
        print("Skipping index " + str(sample_idx) + ": only " + str(len(test_texts)) + " test rows")
        continue
    if shown:
        print("=======================================")
    doc2 = nlp_textcat(test_texts[sample_idx])
    print("Text: " + test_texts[sample_idx])
    # str() keeps the concatenation working even if the label is not a string
    print(" Orig Cat:" + str(test_cats[sample_idx]))
    print(" Predicted Cats:")
    print(doc2.cats)
    shown += 1

In [8]:
# Score a free-form sentence with the trained classifier and show the
# per-category scores.
movie_sentence = "Avengers Endgame was a great movie"
doc2 = nlp_textcat(movie_sentence)
print(doc2.cats)

In [None]:
# Score a second free-form sentence with the trained classifier and show the
# per-category scores.
data_science_sentence = "Data science is tough to master"
doc2 = nlp_textcat(data_science_sentence)
print(doc2.cats)