# Identifying and removing class noise

This script identifies noisy data points and removes them from the data set. A noisy data point is defined here as a data point that appears more than once in the data set and whose copies are labelled with different classes.

In [None]:
import pandas as pd

In [None]:
# Mount Google Drive in the Colab runtime and move into the data folder.

# import the Colab helper that connects the runtime to Google Drive
from google.colab import drive

# mount the drive under /content/gdrive/
drive.mount('/content/gdrive/')

# switch the working directory to the Drive folder where the data is stored
%cd '/content/gdrive/My Drive/ds/'

Mounted at /content/gdrive/
/content/gdrive/My Drive/ds


In [None]:
# print the current working directory
%pwd

'/content/gdrive/MyDrive/ds'

In [None]:
# load the reviews that will be inspected for class noise
dfData = pd.read_csv(filepath_or_buffer='./book_chapter_4_embedded_1k_reviews.csv')

# preview the first five rows (five is the default for head)
dfData.head()

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined,babbage_similarity,babbage_search
0,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...,"[-0.005302980076521635, 0.018141526728868484, ...","[-0.014629893936216831, 0.026541460305452347, ..."
1,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...,"[-0.0055375657975673676, 0.013543304055929184,...","[-0.013880345970392227, 0.02056480571627617, -..."
2,1219017600,B000LQOCH0,ABXLMWJIXXAIN,4,"""Delight"" says it all",This is a confection that has been around a fe...,"Title: ""Delight"" says it all; Content: This is...","[-0.003390523372218013, 0.015841200947761536, ...","[0.0022454208228737116, 0.028257180005311966, ..."
3,1307923200,B000UA0QIQ,A395BORC6FGVXV,2,Cough Medicine,If you are looking for the secret ingredient i...,Title: Cough Medicine; Content: If you are loo...,"[-0.019539982080459595, 0.03985346108675003, -...","[-0.01540567446500063, 0.0527786985039711, 0.0..."
4,1350777600,B006K2ZZ7K,A1UQRSCLF8GW1T,5,Great taffy,Great taffy at a great price. There was a wid...,Title: Great taffy; Content: Great taffy at a ...,"[-0.022796129807829857, -0.011814728379249573,...","[-0.03182186186313629, -0.004497968591749668, ..."


In [None]:
# look for summaries that occur more than once in the data set

# total number of data points (one per row)
allDataPoints = dfData.Summary.size

# number of distinct summaries; a missing value counts as one distinct value
uniqueDataPoints = dfData.Summary.nunique(dropna=False)

# fewer distinct summaries than rows means that some summaries are repeated
if uniqueDataPoints < allDataPoints:
  print(f'There are {allDataPoints - uniqueDataPoints} duplicate entries, which can potentially be noisy')

There are 41 duplicate entries, which can potentially be noisy


In [None]:
# then, we find duplicate data points

# group the data points by their summary text
summaryGroups = dfData.groupby(by=dfData.Summary)

# per-group count of non-missing values in every column (kept for inspection)
dfGrouped = summaryGroups.count()

# number of rows in each group; unlike count(), size() does not depend on
# whether another column (such as Time) happens to contain missing values
groupSizes = summaryGroups.size()

# summaries that occur more than once, i.e. the duplicated data points
lstDuplicated = groupSizes[groupSizes > 1].index.to_list()

In [None]:
# the simplest remedy: discard every data point whose summary is duplicated,
# since any of the copies may carry a noisy label
isDuplicated = dfData.Summary.isin(lstDuplicated)
dfClean = dfData.loc[~isDuplicated]

# show how many data points are left after the removal
print(len(dfClean))

# note that more rows disappear than the number of duplicates reported earlier:
# that figure counts only the extra copies, whereas here every occurrence of a
# repeated summary is dropped, including the first one

934


In [None]:
# go through the duplicated summaries one by one and remove only those
# whose copies are labelled with different scores
for onePoint in lstDuplicated:
  # all rows that share this summary
  dfPoint = dfData.loc[dfData.Summary == onePoint]

  # number of distinct scores assigned to these rows
  numLabels = dfPoint.Score.nunique(dropna=False)

  # a single label means the copies agree, so there is nothing to remove
  if numLabels <= 1:
    continue

  # conflicting labels: this is class noise, drop every copy from the data set
  dfData.drop(index=dfPoint.index, inplace=True)

  # report the data point that was removed
  print(f'point: {onePoint}, number of labels: {len(dfPoint.Score.unique())}')

point: Delicious, number of labels: 2
point: Delicious!, number of labels: 2
point: Disappointed!, number of labels: 2
point: Good chips, number of labels: 2
point: Great, number of labels: 2
point: Kettle Chips, number of labels: 2
point: Tasty!, number of labels: 2
point: Yummy chips, number of labels: 2
point: delicious, number of labels: 2
point: disappointing, number of labels: 2


In [None]:
# finally, print how many data points are left in dfData
# after the noisy duplicates were dropped from it
print(f'Clean data points: {len(dfData.Summary)}')

Clean data points: 970
