# Important Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm import tqdm
import re
import collections
from wordcloud import STOPWORDS
from scipy.sparse import csr_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
import string
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from wordcloud import WordCloud
import gensim
import time
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import multiprocessing

!pip install gensim
!pip install python-Levenshtein


[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading vader_lexicon: <urlopen error [WinError
[nltk_data]     10060] A connection attempt failed because the
[nltk_data]     connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>




# Data Acquisition

## Follow these steps to train your model

In [2]:
# The training file is tab-separated; load it into a DataFrame.
train_data = pd.read_csv("train.tsv", sep="\t")


# Text Preprocessing

In [3]:
# Keep only rows whose price is strictly greater than zero and renumber the index.
print("Before removing redundant prices: ", train_data.shape)
has_positive_price = train_data['price'].gt(0)
train_data = train_data.loc[has_positive_price].reset_index(drop=True)
print("After removing redundant prices: ", train_data.shape)

Before removing redundant prices:  (1482535, 8)
After removing redundant prices:  (1481661, 8)


# Basic Preprocessing Steps
   ### Lower Casing
   ### Remove HTML tags
   ### Remove Urls
   ### Remove Punctuation
   ### Chat Word Treatment
   ### Spelling Correction
   ### Removing Stop Words
   ### Handling Emojis
   ### Tokenization
   ### Stemming
   ### Lemmatization


In [4]:
#Lower Casing

#Remove HTML tags
def striphtml(data):
    """Return ``data`` as a string with every ``<...>`` span removed.

    The input is passed through ``str()`` first, so non-string values are
    accepted and the result is always a ``str``. Matching is non-greedy and
    does not cross line breaks.
    """
    return re.sub(r'<.*?>', '', str(data))


#Remove URLs
def remove_url(text):
    """Delete ``http://``/``https://`` links and ``www.``-prefixed addresses from ``text``.

    Each match runs up to the next whitespace character; surrounding
    whitespace is left untouched.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)

#Sub-Categories
def handle_category(data):
    """Split ``'a/b/c'`` category paths into three parallel lists of levels.

    Parameters
    ----------
    data : iterable
        Category paths separated by ``/``. Entries that are not strings
        (for example ``None`` or ``NaN`` for a missing category) are treated
        as having no levels.

    Returns
    -------
    tuple of (list, list, list)
        First, second and third level of every entry, in input order.
        Levels that are absent are returned as ``''``; levels beyond the
        third are ignored.
    """
    cat1, cat2, cat3 = [], [], []
    for row in data:
        # Only real strings can be split; anything else yields three blanks.
        levels = row.split('/') if isinstance(row, str) else []
        # Pad so that paths with fewer than three levels do not raise
        # IndexError, then keep exactly the first three levels.
        first, second, third = (levels + ['', '', ''])[:3]
        cat1.append(first)
        cat2.append(second)
        cat3.append(third)
    return cat1, cat2, cat3



# Lower-case the free-text columns (missing values stay missing here).
train_data['name'] = train_data['name'].str.lower()
train_data['category_name'] = train_data['category_name'].str.lower()
train_data['item_description'] = train_data['item_description'].str.lower()

# Fill missing descriptions BEFORE striphtml: that helper calls str() on each
# value, which would turn NaN into the literal text 'nan' and leave nothing
# for fillna to replace afterwards.
# Plain assignment is used instead of inplace=True on a selected column,
# which is a chained assignment and may not write back to the frame.
train_data['item_description'] = train_data['item_description'].fillna('No description given')
train_data['item_description'] = train_data['item_description'].apply(striphtml)
train_data['item_description'] = train_data['item_description'].apply(remove_url)
train_data['brand_name'] = train_data['brand_name'].fillna('Not known')

# Break the category path into its three levels, then drop the original column.
c1, c2, c3 = handle_category(train_data['category_name'])
train_data['sub_category1'] = c1
train_data['sub_category2'] = c2
train_data['sub_category3'] = c3
train_data = train_data.drop(['category_name'], axis=1)





In [5]:
# Combining all the string-based value column as one.

selected_columns = [
    'name',
    'brand_name',
    'item_description',
    'sub_category1',
    'sub_category2',
    'sub_category3',
]
# Join the six text fields of every row with single spaces, then discard the
# individual columns now that their content lives in the combined one.
text_fields = train_data[selected_columns]
train_data['concatenated_description'] = text_fields.apply(' '.join, axis=1)
train_data = train_data.drop(columns=selected_columns)

In [6]:
#Remove Punctuations
exclude = string.punctuation
# Built once at definition time: the table only depends on `exclude`, so
# rebuilding it for every row is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', exclude)


def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters are removed, not replaced by a space, so ``"t-shirt"``
    becomes ``"tshirt"``.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every combined description.
train_data['concatenated_description'] = train_data['concatenated_description'].map(remove_punc)

In [7]:
# Chat word treatment -- ROFL, LMAO, WTH, GN, ASAP, etc

# Load the slang lookup table from a text file of KEY=expansion lines.
fileName = "slang.txt"
accessMode = "r"
with open(fileName, accessMode) as data:
    values = data.readlines()

dictionary = dict()

for i in values:
    # partition() splits on the first "=" only, so an expansion that itself
    # contains "=" no longer breaks the unpacking.
    key, separator, value = i.partition("=")
    if not separator:
        # Blank or malformed line without "=": nothing to register.
        continue
    # readlines() keeps the trailing newline; strip it (and stray spaces) so
    # expansions do not inject line breaks into the translated text.
    dictionary[key.strip()] = value.strip()

def translator(text):
    """Expand chat abbreviations in ``text`` using the slang ``dictionary``.

    Every whitespace-separated word is upper-cased for the lookup; a word
    with no entry is kept exactly as written. The words are re-joined with
    single spaces.
    """
    return " ".join(dictionary.get(word.upper(), word) for word in text.split())


train_data['concatenated_description'] = train_data['concatenated_description'].map(translator)

In [8]:
# Handling Stopwords

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_ENGLISH_STOP_WORDS = None  # populated on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word list as a set, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. Surviving tokens
    keep their original casing and are joined with single spaces.
    """
    stop_words = _english_stop_words()
    # Compare in lower case so capitalised tokens such as "The" are also
    # recognised as stop words.
    filtered_sentence = [
        w for w in word_tokenize(text) if w.lower() not in stop_words
    ]
    return " ".join(filtered_sentence)


train_data['concatenated_description'] = train_data['concatenated_description'].apply(remove_stopwords)

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [10]:
# Text Normalization
# Tokenization (no stemming is applied in this cell)

from nltk.tokenize import RegexpTokenizer

# Runs of word characters and apostrophes. Written as a raw string so "\w" is
# not treated as an (invalid) string escape, and compiled once instead of
# building a tokenizer object for every row.
_WORD_PATTERN = re.compile(r"[\w']+")


def tokenization(text):
    """Return the word tokens of ``text`` joined by single spaces.

    A token is a maximal run of word characters and apostrophes; everything
    else acts as a separator and is discarded. This yields the same tokens
    as ``RegexpTokenizer("[\\w']+").tokenize(text)``.
    """
    return " ".join(_WORD_PATTERN.findall(text))

# Re-tokenize every combined description, then write the frame to CSV.
train_data['concatenated_description'] = train_data['concatenated_description'].map(tokenization)
train_data.to_csv('preprocessed_tokenized_training_data.csv')

# Feature Extraction from text
# Beware of the saying "Garbage in, Garbage Out"
#  -- The quality of your model depends on the features you extract from the data. If the features are garbage, your model, however good it is, will give you garbage output.