In [None]:
from __future__ import print_function
#if nltk is not installed uncomment the next line
#!{sys.executable} -m pip install nltk
import sklearn
import sklearn.pipeline as sp
import sklearn.datasets as sd
import sklearn.ensemble as se
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import random
import pandas
import re
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import time
# Download the NLTK resources requested by this notebook, one after another.
for nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Concatenate the datasets and create a new .csv file

In [None]:
# Read the three raw sources: two CSV files and one JSON-lines file.
csv_options = {'header': 0, 'encoding': 'utf-8'}
df1 = pandas.read_csv('train_40k.csv', **csv_options)
df2 = pandas.read_csv('reviews.csv', **csv_options)
df3 = pandas.read_json('Pet_Supplies_5.json', lines=True)

In [None]:
# Keep only the review text and its category and give every frame the same
# two columns, in the fixed order ['text', 'label'].
# Columns are selected and renamed by name: the previous positional
# `drop(..., 1, inplace=True)` is rejected by pandas >= 2.0 (axis is
# keyword-only) and the positional `columns = [...]` assignment silently
# depended on the column order of the raw file.
df1 = df1[['Text', 'Cat1']].rename(columns={'Text': 'text', 'Cat1': 'label'})

# First 2300 rows of the second source, all given the same constant label.
# `assign` returns a new frame, so nothing is written into a `head()` slice.
df2 = (
    df2[['Text']]
    .head(2300)
    .rename(columns={'Text': 'text'})
    .assign(label='grocery gourmet food')
)

# First 1050 rows of the third source, labelled the same way.
df3 = (
    df3[['reviewText']]
    .head(1050)
    .rename(columns={'reviewText': 'text'})
    .assign(label='pet supplies')
)

#dataset for plot (taken before any rows are removed from df1)
df4 = pandas.concat([df1, df2, df3])

#delete the first 4247 rows with label toys games
indexNames_tg = df1.index[df1['label'] == 'toys games'][:4247]
df1 = df1.drop(index=indexNames_tg)

#delete the first 3600 rows with label health personal care
indexNames_hpc = df1.index[df1['label'] == 'health personal care'][:3600]
df1 = df1.drop(index=indexNames_hpc)

#concatenate the datasets and shuffle the result
# The seed is fixed because the train/test split further down is a plain
# 80/20 cut by row order: an unseeded shuffle would give a different split
# (and different scores) on every run.
df = pandas.concat([df1, df2, df3])
df = df.sample(frac=1, random_state=0)
#save the dataset as a csv file
df.to_csv('concate_dataset_unclean.csv')

# Load Dataset and Data Quality Dimensions

In [None]:
# Read the merged, not yet cleaned dataset back from disk.
df = pandas.read_csv(
    'concate_dataset_unclean.csv',
    header=0,
    encoding='utf-8',
)

In [None]:
# Bar chart of the (hard-coded) class sizes of the unbalanced dataset.
unbalanced_labels = ['tg', 'hpc', 'ggf', 'ps', 'b', 'bp']
unbalanced_counts = [10266, 9772, 5917, 5912, 5846, 5637]
df4_plot = pandas.DataFrame({'lab': unbalanced_labels, 'val': unbalanced_counts})
ax = df4_plot.plot.bar(x='lab', y='val', rot=0)

In [None]:
#Completeness: share of rows whose text is neither missing nor an empty string
text_column = df['text']
value_nan = int(text_column.isna().sum())
value_empty = int((text_column == "").sum())
incomplete_rows = value_nan + value_empty
print('Completeness before Cleaning: ', (1 - incomplete_rows/len(df))*100,"%")

In [None]:
#Uniqueness: share of rows whose text did not already occur earlier in the frame
repeated_text_mask = df.duplicated(subset=['text'])
duplicateRowsDF = len(df.loc[repeated_text_mask])
duplicate_share = duplicateRowsDF / len(df)
print('Uniqueness before Cleaning: ', (1 - duplicate_share)*100, "%")

In [None]:
#Timeliness: age of each source dataset relative to the project start date
# `datetime.time` is deliberately NOT imported here. Importing it rebinds the
# name `time`, which this notebook uses for the `time` module, and every
# later `time.time()` call would then fail with AttributeError unless the
# module happened to be re-imported first.
from datetime import date, datetime
date_start = date(2020, 6, 3)
date_ds1 = date(2020, 4, 2)
date_ds2 = date(2017, 5, 1)
date_ds3 = date(2016, 4, 26)
# Subtracting two dates yields a timedelta, printed as "<n> days, 0:00:00".
print('Timeliness Datensatz[3]: ', date_start - date_ds1)
print('Timeliness Datensatz[4]: ', date_start - date_ds2)
print('Timeliness Datensatz[5]: ', date_start - date_ds3)

In [None]:
#Validity
# 2300 and 1050 are the row counts taken from the two additional sources;
# 4247 and 3600 are the row counts removed from the base dataset.
# NOTE(review): 39489 is presumably the row count of the base dataset - confirm.
rows_added = 2300 + 1050
rows_kept_from_base = 39489 - 4247 - 3600
print('Validity before Cleaning: ', (rows_added/rows_kept_from_base) * 100, "%")

In [None]:
#Accuracy: share of rows not affected by a label conflict, where a conflict
#is one text that occurs several times with different labels
# keep=False selects EVERY occurrence of a repeated text. The default
# (keep='first') leaves out the first occurrence, so its label was never
# compared and a text appearing exactly twice with two different labels was
# not detected at all.
# The result is stored under its own name instead of overwriting `df2`.
repeated_rows = df[df['text'].duplicated(keep=False)]
# Distinct labels per repeated text; dropna=False counts a missing label as
# a value of its own, like Series.unique() does.
labels_per_text = repeated_rows.groupby('text')['label'].nunique(dropna=False)
# As before, each conflicting text contributes its number of distinct labels.
counter = int(labels_per_text[labels_per_text > 1].sum())
print('Accuracy before Cleaning: ', (1 - (counter / (len(df)))) * 100, '%')

In [None]:
#Consistency
# Complement of the ratio (2300+1050) / (39489-4247-3600), in percent.
rows_added = 2300 + 1050
rows_kept_from_base = 39489 - 4247 - 3600
print('Consistency before Cleaning: ', (1 - (rows_added/rows_kept_from_base)) * 100, "%")

# Major Data Cleaning

In [None]:
# Start the cleaning stage from a fresh copy of the merged, uncleaned dataset.
df = pandas.read_csv(
    'concate_dataset_unclean.csv',
    header=0,
    encoding='utf-8',
)

In [None]:
#translate the dataset into english
#not necessary for our dataset / needs a lot of time
#if translate is not installed uncomment the next line
#!{sys.executable} -m pip install googletrans
import time
from googletrans import Translator
translator = Translator()


def _to_english(text):
    """Return `text` translated by `translator` unless it is detected as English."""
    if translator.detect(text).lang != 'en':
        return translator.translate(text).text
    return text


t01 = time.time()
# Assign the whole column at once. Writing single cells through
# `df['text'].values[i] = ...` only works while `.values` is a writable view
# of the frame, which pandas does not guarantee (read-only under Copy-on-Write).
df['text'] = df['text'].map(_to_english)
t02 = time.time()
print(t02-t01)

In [None]:
#Expand contractions (short form -> long form) and remove punctuation.
#Punctuation has low information value,
#therefore it is removed.

# Start of the wall-clock measurement for this cleaning step.
t01 = time.time()
def f1(text_new):
    """Expand common English contractions and remove all digits.

    The rules are applied in the order listed, so the specific forms
    ("won't", "can't") are rewritten before the generic "n't" rule can
    touch them. Matching is case-insensitive because lower-casing happens
    in a later step: without it "Won't" turned into "Wo not" and "Can't"
    into "Ca not".

    Parameters
    ----------
    text_new : str
        Text that still contains its apostrophes.

    Returns
    -------
    str
        The text with contractions expanded, possessive "'s" dropped and
        every run of digits removed.
    """
    contraction_rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"\'m", " am"),
        (r"\'re", " are"),
        (r"\'ve", " have"),
        (r"n\'t", " not"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'s", ""),
    )
    for pattern, replacement in contraction_rules:
        text_new = re.sub(pattern, replacement, text_new, flags=re.IGNORECASE)
    # Digits are removed last.
    return re.sub(r"\d+", "", text_new)

# Expand contractions FIRST, then strip punctuation. `string.punctuation`
# contains the apostrophe, so stripping punctuation first (as before) removed
# the very character every contraction pattern in f1 relies on and no
# contraction was ever expanded.
punctuation_table = str.maketrans('', '', string.punctuation)
# The column is assigned as a whole instead of writing through `.values[i]`;
# na_action='ignore' leaves missing texts untouched instead of crashing.
df['text'] = df['text'].map(
    lambda text: f1(text).translate(punctuation_table),
    na_action='ignore',
)

t02 = time.time()
print(t02-t01)

In [None]:
#lower case
#The stop-word list only contains lower-case words,
#so this step has to run before stop-word removal.
t01 = time.time()
# Vectorised string method instead of a Python loop that wrote through
# `df['text'].values[i]`: no dependence on `.values` being a writable view,
# and missing values are passed through instead of raising.
df['text'] = df['text'].str.lower()

t02 = time.time()
print(t02-t01)

In [None]:
#remove StopWords
#removes words with low information value
#if gensim is not installed uncomment the next line
#!{sys.executable} -m pip install gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
all_stopwords_gensim = STOPWORDS


def _drop_stopwords(text):
    """Tokenize `text` and re-join every token that is not a gensim stop word."""
    kept_tokens = [word for word in word_tokenize(text) if word not in all_stopwords_gensim]
    return " ".join(kept_tokens)


t01 = time.time()
# Whole-column assignment instead of writing through `.values[i]`;
# na_action='ignore' skips missing texts.
df['text'] = df['text'].map(_drop_stopwords, na_action='ignore')
t02 = time.time()
print(t02-t01)

In [None]:
#Reference: https://medium.com/@SeoJaeDuk/basic-data-cleaning-engineering-session-twitter-sentiment-data-b9376a91109b
#Lemmatization
#maps every whitespace-separated word to its lemma, treating it as a verb ('v')
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()


def _lemmatize_as_verbs(sentence):
    """Lemmatize each whitespace-separated word of `sentence` with POS 'v'."""
    lemmas = []
    for token in sentence.split():
        lemmas.append(lmtzr.lemmatize(token, 'v'))
    return ' '.join(lemmas)


t01 = time.time()

df['text'] = df['text'].apply(_lemmatize_as_verbs)
t02 = time.time()
print(t02-t01)

In [None]:
#Reference: https://medium.com/@SeoJaeDuk/basic-data-cleaning-engineering-session-twitter-sentiment-data-b9376a91109b
#Stemming
#cuts off the end of words. It was far more efficient than lemmatization,
#even though the two methods seem similar in what they do
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def _stem_words(sentence):
    """Apply the Porter stemmer to each whitespace-separated word of `sentence`."""
    stems = []
    for token in sentence.split():
        stems.append(ps.stem(token))
    return ' '.join(stems)


t01 = time.time()

df['text'] = df['text'].apply(_stem_words)
t02 = time.time()
print(t02-t01)

In [None]:
#Reference: https://medium.com/@SeoJaeDuk/basic-data-cleaning-engineering-session-twitter-sentiment-data-b9376a91109b
#not necessary for our classification
#Part-of-speech tagging, which we did not use for this project
nltk.download('averaged_perceptron_tagger')


def _tag_parts_of_speech(sentence):
    """Tokenize `sentence` with NLTK and return the tagger's (token, tag) pairs."""
    tokens = nltk.word_tokenize(sentence)
    return nltk.pos_tag(tokens)


# The tags go into a separate column; the 'text' column itself is unchanged.
df['text_pos'] = df['text'].apply(_tag_parts_of_speech)

In [None]:
#SARCASM
#An NLP method for detecting sarcasm would belong here,
#but unfortunately we could not find a workable solution for it.
#It is probably not feasible at this point in time

#EMOJI
#For our first Twitter Dataset we implemented a Function to translate
#emojis to unicode, which we did not need for this Dataset

#NORMALIZATION
#Another NLP Method one could add in the future is the normalization,
#which transforms the most common abbreviations into their written out form

In [None]:
#Drop duplicates, remove NaN and empty strings
#Important for the Data Quality Dimensions
# Keep only the two modelling columns. `columns=` is passed by keyword
# because pandas >= 2.0 no longer accepts the axis as a positional argument.
df = df.drop(columns=df.columns.difference(['text', 'label']))
# Turn missing values into '' BEFORE filtering out empty texts. In the
# previous order the filter ran first, so rows with a missing text were
# converted to '' afterwards and stayed in the dataset.
df = df.fillna('')
df = df[df['text'] != ""]
df = df.drop_duplicates(subset="text", keep="first")

In [None]:
#save the cleaned dataset to a csv file
# (the frame's index is written as well, as an additional first column)
df.to_csv('concate_dataset_clean.csv')

# Loading the cleaned Dataset

In [None]:
# Choose which variant of the dataset the remaining cells work on.
# Exactly one read_csv call should be active; the others stay commented out.
#Data Cleaning with all NLPs
df = pandas.read_csv(
    'concate_dataset_clean.csv',
    header=0,
    encoding='utf-8',
)
#Without Data Cleaning
#df = pandas.read_csv('concate_dataset_unclean.csv', header = 0, encoding = 'utf-8')
#Data Cleaning with Punctuation NLP
#df = pandas.read_csv('concate_dataset_clean_sz.csv', header = 0, encoding = 'utf-8')
#Data Cleaning with Lower Case NLP
#df = pandas.read_csv('concate_dataset_clean_lc.csv', header = 0, encoding = 'utf-8')
#Data Cleaning with Stopwords NLP
#df = pandas.read_csv('concate_dataset_clean_sw.csv', header = 0, encoding = 'utf-8')
#Data Cleaning with Lemmatization NLP
#df = pandas.read_csv('concate_dataset_clean_lemma.csv', header = 0, encoding = 'utf-8')
#Data Cleaning with Stemming NLP
#df = pandas.read_csv('concate_dataset_clean_stemming.csv', header = 0, encoding = 'utf-8')

# Data Quality Dimensions

In [None]:
# Bar chart of the (hard-coded) class sizes after balancing.
balanced_labels = ['tg', 'hpc', 'b', 'bp', 'ps', 'ggf']
balanced_counts = [5999, 5996, 5720, 5720, 5863, 5886]
df_plot = pandas.DataFrame({'lab': balanced_labels, 'val': balanced_counts})
ax = df_plot.plot.bar(x='lab', y='val', rot=0)

In [None]:
#Completeness: share of rows whose text is neither missing nor an empty string
text_column = df['text']
value_nan = int(text_column.isna().sum())
value_empty = int((text_column == "").sum())
incomplete_rows = value_nan + value_empty
print('Completeness after Data Cleaning: ', (1 - incomplete_rows/len(df))*100,"%")

In [None]:
#Uniqueness: share of rows whose text did not already occur earlier in the frame
repeated_text_mask = df.duplicated(subset=['text'])
duplicateRowsDF = len(df.loc[repeated_text_mask])
duplicate_share = duplicateRowsDF / len(df)
print('Uniqueness after Data Cleaning', (1 - duplicate_share)*100, "%")

In [None]:
#Timeliness: age of each source dataset relative to the project start date
# `datetime.time` is deliberately NOT imported here. Importing it rebinds the
# name `time`, which this notebook uses for the `time` module, and any
# `time.time()` call run afterwards would fail with AttributeError unless the
# module happened to be re-imported first.
from datetime import date, datetime
date_start = date(2020, 6, 3)
date_ds1 = date(2020, 4, 2)
date_ds2 = date(2017, 5, 1)
date_ds3 = date(2016, 4, 26)
# Subtracting two dates yields a timedelta, printed as "<n> days, 0:00:00".
print('Timeliness Datensatz[3]: ', date_start - date_ds1)
print('Timeliness Datensatz[4]: ', date_start - date_ds2)
print('Timeliness Datensatz[5]: ', date_start - date_ds3)

In [None]:
#Validity
# The row count is divided by itself, so for any non-empty frame this is 100.
row_count = len(df)
print('Validity after Cleaning: ', (row_count/row_count) * 100, "%")

In [None]:
#Accuracy: share of rows not affected by a label conflict, where a conflict
#is one text that occurs several times with different labels
# keep=False selects EVERY occurrence of a repeated text. The default
# (keep='first') leaves out the first occurrence, so its label was never
# compared and a text appearing exactly twice with two different labels was
# not detected at all.
# The result is stored under its own name instead of overwriting `df2`.
repeated_rows = df[df['text'].duplicated(keep=False)]
# Distinct labels per repeated text; dropna=False counts a missing label as
# a value of its own, like Series.unique() does.
labels_per_text = repeated_rows.groupby('text')['label'].nunique(dropna=False)
# As before, each conflicting text contributes its number of distinct labels.
counter = int(labels_per_text[labels_per_text > 1].sum())
print('Accuracy after Cleaning: ', (1 - (counter / (len(df)))) * 100, '%')

In [None]:
#Consistency
# The row count is divided by itself, so for any non-empty frame this is 100.
row_count = len(df)
print('Consistency after Cleaning: ', (row_count/row_count) * 100, "%")

# Build the Train and Test Datasets

In [None]:
#drop all columns except text and label and replace all NaN by ''
# `columns=` is passed by keyword: pandas >= 2.0 no longer accepts the axis
# as a positional argument to drop().
df = df.drop(columns=df.columns.difference(['text', 'label']))
df = df.fillna('')
#list for features
data_text = df['text'].tolist()
#list for labels
data_labels = df['label'].tolist()

#fill the train and test datasets
# Plain 80/20 cut by row order (no shuffling here); the cut position is
# computed once so features and labels are always split at the same row.
split_at = round(0.8 * len(data_text))
train_X = data_text[:split_at]
train_Y = data_labels[:split_at]
test_X = data_text[split_at:]
test_Y = data_labels[split_at:]

# Vectorize Train and Test Datasets

In [None]:
# TF-IDF features for the texts. lowercase=False switches off the
# vectorizer's own lower-casing option.
tfidf = TfidfVectorizer(lowercase=False)
# The vectorizer is fitted on the training texts only ...
trainset_2_vectors = tfidf.fit_transform(train_X)
# ... and the fitted vectorizer is then applied unchanged to the test texts.
testset_2_vectors = tfidf.transform(test_X)

# Train and Fit Models

In [None]:
import time
t0 = time.time()
#Initialize and train GradientBoostingClassifier with 100 estimators (GBR(100))
gbc = se.GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1,
     max_depth=5, random_state=0).fit(trainset_2_vectors, train_Y)
y_pred_gbc = gbc.predict(testset_2_vectors)
t1 = time.time()
print('Time for GradientBoostingClassifier: ', t1-t0)

t01 = time.time()
#Initialize and train Random Forest with 100 estimators (RFC(100))
# random_state is fixed, matching the gradient-boosting model above, so the
# forest and its scores are reproducible from run to run.
random_forest_algo = se.RandomForestClassifier(
    n_estimators=100, random_state=0).fit(trainset_2_vectors, train_Y)
y2_pred = random_forest_algo.predict(testset_2_vectors)
t11 = time.time()
print('Time for Random Forest: ', t11-t01)

In [None]:
#Weighted F1 score of each classifier on the test set
# The value printed is f1_score(average='weighted'), not the accuracy, so the
# printed label now names the metric that is actually computed.
# sklearn.metrics is imported explicitly: a bare `import sklearn` does not by
# itself guarantee that the submodule has been loaded.
import sklearn.metrics
print("Weighted F1 score from GradientBoostingClassifier: ", sklearn.metrics.f1_score(test_Y, y_pred_gbc, average='weighted'))
print("Weighted F1 score from Random Forest: ", sklearn.metrics.f1_score(test_Y, y2_pred, average='weighted'))

# Explanation with LIME in AIX360

In [None]:
#LIME
#LIME explains why a text was assigned to a class.
#The fitted vectorizer and the random forest are chained into one pipeline,
#so that raw text can be passed in directly.
pipeline_rf = sp.make_pipeline(tfidf, random_forest_algo)

In [None]:
#if aix360 is not installed uncomment the next line
#!{sys.executable} -m pip install aix360
from aix360.algorithms.lime import LimeTextExplainer
#Explain text classifiers with LimeTextExplainer
limeexplainer = LimeTextExplainer(class_names = sorted(df.label.unique()))
# Pick a random training example. random.randint(0, len(train_X)) includes
# its upper bound and could therefore return len(train_X), one past the last
# valid index (IndexError); randrange excludes the upper bound.
x = random.randrange(len(train_X))
#explanation of the assignment of a specific data point for Random Forest
ex_rf = limeexplainer.explain_instance(train_X[x], pipeline_rf.predict_proba, top_labels=7, num_features=5)
ex_rf.show_in_notebook(text = train_X[x])