In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import shutil,time,os
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from tqdm import tqdm
# NLP
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
# !pip install num2words
from num2words import num2words
# !pip install pyspellchecker
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
import spacy
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Scipy
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix

# Train-test split and cross validation
from sklearn.model_selection import train_test_split, ParameterGrid

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

# Model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Others
import json
# import gensim
from sklearn.decomposition import TruncatedSVD


Can't initialize NVML


CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:109.)



# Load Data

In [2]:
input_data=pd.read_csv("./ecommerceDataset.csv",names=["Label","describtion"])
input_data.head(2)

Unnamed: 0,Label,describtion
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."


In [3]:
input_data.isna().sum()

Label          0
describtion    1
dtype: int64

In [4]:
input_data.shape

(50425, 2)

In [5]:
input_data.dropna(inplace=True)

In [6]:
input_data.isna().sum()

Label          0
describtion    0
dtype: int64

In [7]:
input_data.shape

(50424, 2)

# Remove Duplicates

In [8]:
input_data.drop_duplicates(subset="describtion",inplace=True)

In [9]:
input_data.shape

(27802, 2)

# Target Column Encoding 

In [10]:
from sklearn.preprocessing import LabelEncoder
label_encode=LabelEncoder()
input_data["Label"]=label_encode.fit_transform(input_data["Label"])

In [11]:
input_data["Label"].value_counts()

Label
3    10564
0     6256
1     5674
2     5308
Name: count, dtype: int64

# Pre-Processing

In [12]:
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Lemmatise ``text`` with the module-level spaCy pipeline ``nlp``.

    Stop-word tokens and punctuation tokens are dropped; the lemmas of the
    remaining tokens are returned joined by single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip anything spaCy flags as a stop word or as punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

# Train, Validation and Test Split

In [13]:
# Separate the feature column(s) from the encoded target.
x,y=input_data.drop("Label",axis=1),input_data["Label"]
# First split: keep 80% for training and hold out 20%, stratified on the label.
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.20,random_state=40,stratify=y)
# Second split: halve the 20% hold-out into validation and test sets (10% each).
# X_test / Y_test are rebound here to the final test portion.
X_val,X_test,Y_val,Y_test=train_test_split(X_test,Y_test,test_size=0.50,random_state=40,stratify=Y_test)

# SpaCy Package

In [22]:
nlp_pipeline=spacy.load("en_core_web_sm")
# nlp_pipeline.pipeline

# Bag of Words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [16]:
c_vect=CountVectorizer(ngram_range=(1,1),lowercase=True)

In [17]:
# Learn the vocabulary from the spaCy-preprocessed training descriptions only
# and build the document-term count matrix.
# NOTE(review): .toarray() converts the sparse result into a dense array; the
# shape printed further down is (22241, 65366), which is large when dense.
# Presumably the sparse matrix could be passed to the classifiers directly -
# TODO confirm every estimator fitted on it accepts sparse input.
X_train_vect=c_vect.fit_transform([preprocess_text(text) for text in X_train.describtion]).toarray()


# Model Training 

In [18]:
from sklearn.naive_bayes import CategoricalNB,MultinomialNB

In [19]:
cat_nb=CategoricalNB()
mul_nb=MultinomialNB()

In [20]:
cat_nb.fit(X_train_vect,Y_train)
mul_nb.fit(X_train_vect,Y_train)

In [21]:
mul_nb.predict(X_train_vect)

array([3, 1, 3, ..., 2, 1, 3])

# Model Evaluation

In [23]:
from sklearn.metrics import classification_report,confusion_matrix

In [24]:
X_test_vect=c_vect.transform([preprocess_text(text) for text in X_test.describtion]).toarray()
X_val_vect=c_vect.transform([preprocess_text(text) for text in X_val.describtion]).toarray()

In [25]:
X_test_vect.shape,X_val_vect.shape,X_train_vect.shape

((2781, 65366), (2780, 65366), (22241, 65366))

In [26]:
print(classification_report(Y_test,mul_nb.predict(X_test_vect)))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94       626
           1       0.96      0.98      0.97       567
           2       0.93      0.94      0.93       531
           3       0.94      0.95      0.94      1057

    accuracy                           0.95      2781
   macro avg       0.95      0.95      0.95      2781
weighted avg       0.95      0.95      0.95      2781



In [27]:
confusion_matrix(Y_test,mul_nb.predict(X_test_vect))

array([[ 577,   11,    7,   31],
       [   0,  557,    3,    7],
       [   8,    0,  497,   26],
       [  11,   14,   29, 1003]])

In [28]:
from sklearn.metrics import classification_report
print(classification_report(Y_val,mul_nb.predict(X_val_vect)))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       625
           1       0.96      0.96      0.96       568
           2       0.92      0.91      0.91       531
           3       0.92      0.94      0.93      1056

    accuracy                           0.93      2780
   macro avg       0.93      0.93      0.93      2780
weighted avg       0.93      0.93      0.93      2780



# TF-IDF

In [None]:
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train.describtion)

In [None]:
# Refit the multinomial Naive Bayes model on the TF-IDF training matrix.
# The matrix is bound to `X_train_tfidf` (capital X) in the cell above.
# Note that this replaces the bag-of-words fit previously held by `mul_nb`.
mul_nb.fit(X_train_tfidf,Y_train)

In [None]:
X_test_tfidf=tfidf.transform(X_test.describtion).toarray()
X_val_tfidf=tfidf.transform(X_val.describtion).toarray()

In [None]:
X_test_tfidf.shape,X_val_tfidf.shape,X_train_tfidf.shape

In [None]:
print(classification_report(Y_test,mul_nb.predict(X_test_tfidf)))

In [None]:
print(classification_report(Y_val,mul_nb.predict(X_val_tfidf)))

In [None]:
# x_train_tfidf.toarray()[0][x_train_tfidf.toarray()[0]!=0]

In [None]:
# dir(tfidf)

# Data Preprocessing

## Converting to lowercase

In [None]:
def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

## Removal of Whitespaces

In [None]:
def remove_whitespcae(text):
    """Return ``text`` with leading and trailing whitespace stripped."""
    trimmed = text.strip()
    return trimmed

## Removing punctuations

In [None]:
def remove_punctuation(text):
    """Delete every character of ``string.punctuation`` from ``text`` except the apostrophe.

    Apostrophes are left in place; all other ASCII punctuation is removed
    (not replaced by a space).
    """
    # Everything in string.punctuation apart from the single quote.
    to_delete = "".join(ch for ch in string.punctuation if ch != "'")
    deletion_table = str.maketrans("", "", to_delete)
    return text.translate(deletion_table)

## Removing HTML tags

In [None]:
def remove_html(text):
    """Remove every ``<...>`` span (shortest match) from ``text``."""
    return re.sub(r'<.*?>', r'', text)

## Removing emojis

In [None]:
def remove_emoji(text):
    """Remove every character that falls in the code-point ranges listed below.

    NOTE(review): the final range (U+24C2 to U+1F251) is very wide and also
    covers many non-emoji characters, e.g. CJK ideographs - confirm that is
    acceptable for the data being cleaned.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    # A single character class; "+" consumes runs of matching characters at once.
    return re.sub("[" + code_point_ranges + "]+", r'', text, flags=re.UNICODE)

## Removing URLs

In [None]:

def remove_http(text):
    """Remove URLs from ``text``.

    Two forms are matched, each running up to the next whitespace character:
    links starting with ``http://`` or ``https://`` and bare links starting
    with ``www.``. The bare word "http" on its own is not touched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every matched URL replaced by the empty string.
    """
    # Raw string: "\S" and "\." are regex escapes, not Python string escapes.
    # In a plain string literal they are invalid escape sequences and trigger
    # a warning on current Python versions.
    url_pattern = r"(https?://\S+|www\.\S+)"
    return re.sub(url_pattern, "", text)

## Substitution of Acronyms

In [None]:
# Load the acronym -> expansion mapping from a JSON file hosted on GitHub.
# NOTE: this cell needs network access to run.
acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
# typ="series" makes read_json return a pandas Series instead of a DataFrame.
acronyms_dict=pd.read_json(acronyms_url,typ="series")
# Two-column tabular view of the same mapping; in the visible cells it is only displayed.
acronyms_data=pd.DataFrame(acronyms_dict.items(),columns=["acronym","original"])
# Plain list of the keys, used by convert_acronyms() for membership tests.
acronyms_list = list(acronyms_dict.keys())
acronyms_data.head(5)

In [None]:
def convert_acronyms(text):
    """Replace each token of ``text`` that is a known acronym by its expansion.

    The text is tokenised with the module-level ``regexp`` tokenizer. Tokens
    found in ``acronyms_list`` are swapped for the words of their entry in
    ``acronyms_dict``; all other tokens are kept. The result is re-joined with
    single spaces.
    """
    expanded = []
    for token in regexp.tokenize(text):
        replacement = acronyms_dict[token] if token in acronyms_list else token
        expanded.extend(replacement.split())
    return " ".join(expanded)

## Substitution of Contractions

In [None]:
# Dictionary of contractions
# Loaded from a JSON file hosted on GitHub as a pandas Series (typ='series').
# NOTE: this cell needs network access to run.
contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ = 'series')

# print("Example: Original form of the contraction 'aren't' is '{}'".format(contractions_dict["aren't"]))
# Plain list of the keys, used by convert_contractions() for membership tests.
contractions_list = list(contractions_dict.keys())

# Expand the contractions found in a text
def convert_contractions(text):
    """Replace each token of ``text`` that is a known contraction by its expanded form.

    The text is tokenised with the module-level ``regexp`` tokenizer. Tokens
    found in ``contractions_list`` are swapped for the words of their entry in
    ``contractions_dict``; all other tokens are kept. The result is re-joined
    with single spaces.
    """
    expanded = []
    for token in regexp.tokenize(text):
        replacement = contractions_dict[token] if token in contractions_list else token
        expanded.extend(replacement.split())
    return " ".join(expanded)

In [None]:
# Download the NLTK resources requested below (stop-word corpus, 'punkt'
# tokenizer data, WordNet) and build the stop-word list used by remove_stopwords().
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stops = stopwords.words("english") # stopwords
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
# Combined list: NLTK English stop words followed by the hand-picked extras.
allstops = stops + addstops

In [None]:
# Function to remove stopwords from a single text
def remove_stopwords(text):
    """Return ``text`` re-joined from its tokens, leaving out every token found in ``allstops``.

    Tokenisation uses the module-level ``regexp`` tokenizer; surviving tokens
    are joined with single spaces.
    """
    kept = []
    for token in regexp.tokenize(text):
        if token in allstops:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
spell = SpellChecker()

def pyspellchecker(text):
    """Spell-correct ``text`` token by token with the module-level ``spell`` checker.

    The text is tokenised with the module-level ``regexp`` tokenizer. Each
    token that the checker reports as unknown is replaced by the checker's
    suggested correction; if no correction is available (``None``) the
    original token is kept. Known tokens pass through unchanged.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The (possibly corrected) tokens joined by single spaces.
    """
    word_list = regexp.tokenize(text)
    # The set of unknown words does not change during the loop, so compute it
    # once instead of once per token.
    unknown_words = spell.unknown(word_list)
    word_list_corrected = []
    for word in word_list:
        if word in unknown_words:
            word_corrected = spell.correction(word)
            # correction() may return None when it has no candidate.
            word_list_corrected.append(word if word_corrected is None else word_corrected)
        else:
            word_list_corrected.append(word)
    return " ".join(word_list_corrected)

In [None]:
# Stemming
stemmer = PorterStemmer()
def text_stemmer(text):
    """Stem every token of ``text`` with the module-level ``stemmer`` and re-join with spaces."""
    stems = map(stemmer.stem, regexp.tokenize(text))
    return " ".join(stems)

In [None]:
# Lemmatization
# spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
lemmatizer = WordNetLemmatizer()

def text_lemmatizer(text):
    """Lemmatise ``text`` word by word with the module-level WordNet ``lemmatizer``.

    Tokens come from NLTK's ``word_tokenize``; the lemmas are re-joined with
    single spaces.
    """
    # A spaCy-based variant (token.lemma_ over spacy_lemmatizer(text)) is disabled in this notebook.
    lemmas = []
    for word in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

In [None]:
# Drop tokens that are not purely alphabetic
def discard_non_alpha(text):
    """Keep only the tokens of ``text`` for which ``str.isalpha()`` is true, joined by spaces."""
    alphabetic_tokens = filter(str.isalpha, regexp.tokenize(text))
    return " ".join(alphabetic_tokens)

In [None]:
def keep_pos(text):
    """Keep only the tokens of ``text`` whose part-of-speech tag is in the allow-list.

    The text is tokenised with the module-level ``regexp`` tokenizer and
    tagged with ``nltk.pos_tag``. Tokens tagged as nouns, foreign words,
    pronouns, adverbs, verbs or wh-words are kept; everything else is dropped.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The kept tokens, in their original order, joined by single spaces.
    """
    tokens = regexp.tokenize(text)
    tokens_tagged = nltk.pos_tag(tokens)
    # Narrower alternative (nouns and foreign words only):
    # keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
    # The possessive tags are spelled 'PRP$' and 'WP$' in the Penn Treebank
    # tag set; the earlier spellings 'PRPS' / 'WPS' could never match a tag.
    keep_tags = {
        'NN', 'NNS', 'NNP', 'NNPS', 'FW',
        'PRP', 'PRP$',
        'RB', 'RBR', 'RBS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WP$', 'WRB',
    }
    return " ".join(word for word, tag in tokens_tagged if tag in keep_tags)

In [None]:
# Additional stopwords
# Hand-written word lists that are concatenated into `additional_stops`, which
# remove_additional_stopwords() uses to filter tokens.

# Single letters.
alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "and", "yet"]
correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
# NOTE(review): the multi-word entries below (e.g. "as if") contain spaces, while
# the tokens they are compared against come from a tokenizer pattern without
# whitespace, so those entries cannot match - confirm whether that is intended.
subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
# Presumably residue of mis-decoded characters seen in the data - TODO confirm.
others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"]
# Several words appear in more than one list; duplicates are harmless for membership tests.
additional_stops = alphabets + prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others

def remove_additional_stopwords(text):
    """Return ``text`` re-joined from its tokens, leaving out every token found in ``additional_stops``.

    Tokenisation uses the module-level ``regexp`` tokenizer; surviving tokens
    are joined with single spaces.
    """
    survivors = []
    for token in regexp.tokenize(text):
        if token not in additional_stops:
            survivors.append(token)
    return " ".join(survivors)

In [None]:
# Tokenizer shared by the helper functions: runs of word characters and apostrophes.
# Raw string so that "\w" reaches the regex engine instead of being treated as
# an (invalid) Python string escape.
regexp = RegexpTokenizer(r"[\w']+")
def text_normalizer(text):
    """Run the full text-cleaning pipeline on a single string.

    Steps, in order: lower-casing, trimming, flattening to one line, removal
    of ``[...]`` spans, HTML tags, URLs and punctuation, emoji removal,
    acronym and contraction expansion, stop-word removal, spell correction,
    lemmatisation, dropping non-alphabetic tokens, part-of-speech filtering
    and removal of the additional stop words.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The normalised text, tokens separated by single spaces.
    """
    text = to_lowercase(text)
    text = remove_whitespcae(text)
    # Flatten to one line. Newlines become spaces so that the last word of a
    # line is not fused with the first word of the next one.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\[.*?\]', '', text) # removing square brackets
    # HTML tags must be removed while "<" and ">" are still present, i.e.
    # before remove_punctuation strips them, and before remove_http so that a
    # URL inside a tag does not swallow the text that follows it.
    text = remove_html(text)
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    text = convert_acronyms(text)
    text = convert_contractions(text)
    text = remove_stopwords(text)
    text = pyspellchecker(text)
    text = text_lemmatizer(text) # text = text_stemmer(text)
    text = discard_non_alpha(text)
    text = keep_pos(text)
    text = remove_additional_stopwords(text)
    return text

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
text = "We'll combine all functions into 1 SINGLE FUNCTION 🙂 & apply on @product #descriptions https://en.wikipedia.org/wiki/Text_normalization"
print("Input: {}".format(text))
print("Output: {}".format(text_normalizer(text)))

In [None]:
X_train.head(5)

In [None]:
# Normalise every training description.
# The column is selected by name rather than through iterrows() and a
# positional `row[0]` lookup (integer keys on a labelled Series are deprecated
# as positional access). Iterating the Series also lets tqdm read its length
# and show a real progress bar.
Normalized_X_trainlst = [
    text_normalizer(description)
    for description in tqdm(X_train["describtion"])
]
    

In [None]:
# import concurrent.futures

# def apply_normalizer(df):
#     df["normalized_describtion"] = df["describtion"].apply(text_normalizer)
#     return df

# # Create a list of data frames to process
# data_frames = [X_train, X_test, X_val]

# # Use ThreadPoolExecutor for multithreading
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     normalized_dfs = list(executor.map(apply_normalizer, data_frames))

# # Retrieve the normalized data frames
# Normalized_X_train, Normalized_X_test, Normalized_X_val = normalized_dfs

In [None]:
# Normalized_X_train["normalized_describtion"]=X_train["describtion"].apply(text_normalizer)
# Normalized_X_test["normalized_describtion"]=X_test["describtion"].apply(text_normalizer)
# Normalized_X_val["normalized_describtion"]=X_val["describtion"].apply(text_normalizer)


In [None]:
import threading