# Inserting Dirty Data

#### This script adds erroneous data to the Tweets dataset. For the experiment, a sample of 1000 English records (rows) is used. Approximately 10% of the rows (about 100) are corrupted per error type. Nine error types are inserted; eight of them are based on the taxonomy described in Appendix A of the dissertation. The error types are:
    1. Wrong data type errors. 
    2. Negative number representation errors.
    3. Extraneous data.
    4. Swapped row ordering.
    5. Data into wrong fields.
    6. Duplicated data.
    7. Misspelled data.
    8. Special character errors.
    9. Inconsistent data.

#### Installations

In [14]:
# Helper used to wipe the cell output once the (optional) installs have run.
from IPython.display import clear_output

# Uncomment the lines below to install the third-party dependencies.

#!pip install numpy
#!pip install pandas
# NOTE: `re` is part of the Python standard library, so there is nothing to install for it.

# Clear the output
clear_output()

#### Imports

In [2]:
# Import the necessary libraries.
import pandas as pd  # DataFrame loading, sampling and Excel export
import re  # NOTE(review): not referenced in the cells visible here - confirm it is still needed

# Clear the output (clear_output is imported in the installations cell)
clear_output()

Read Tweets file.

In [3]:
# Read the English Tweets sample.
# Change file location if needed.
# The path is a raw string so that the backslashes are taken literally instead
# of being parsed as (invalid) escape sequences such as "\T".
# NOTE(review): nrows=1651 is larger than the 1000-row sample described in the
# introduction - confirm which size is intended.
df = pd.read_csv(r'D:\TweetCSV\Tweets1000', nrows=1651)

#### Insert wrong data type errors (2.1.1.1.1.1 in the taxonomy), String instead of Bool

In [4]:
# Textual spellings that replace the boolean flag.
# Each pair is (replacement for a truthy value, replacement for a falsy value).
bool_spellings = [("T", "F"), ("true", "false"), ("True", "False"), ("1", "0")]

# The sample holds random rows and only 10% of the dataframe.
sample = df.sample(frac=0.1)

# Cast to object up front so that writing strings into the flag column is an
# explicit dtype change rather than a silent upcast on the first assignment.
df['user_default_profile_image'] = df['user_default_profile_image'].astype(object)

for count, (index, row) in enumerate(sample.iterrows()):
    # Split the sample into 4 consecutive groups of (near-)equal size, derived
    # from the actual sample length, so that every sampled row receives
    # exactly one of the spellings regardless of how many rows were drawn.
    group = count * len(bool_spellings) // len(sample)
    true_text, false_text = bool_spellings[group]

    if row['user_default_profile_image']:
        df.loc[index, 'user_default_profile_image'] = true_text
    else:
        df.loc[index, 'user_default_profile_image'] = false_text

#### Insert negative number representation errors (2.2.3.1.2.2 in the taxonomy), Making numbers negative

In [5]:
# Turn the retweet counter of a random 10% of the rows into a negative number.
for index, retweets in df.sample(frac=0.1)['retweet_count'].items():
    # Zero has no negative counterpart, so it becomes -1; every other value
    # is replaced by its negative representation.
    df.loc[index, 'retweet_count'] = -1 if retweets == 0 else -abs(retweets)

#### Inserting extraneous data (2.1.2.1.1.3 in the taxonomy)

In [6]:
# Cast to object up front: blanking user_id below stores a string in that
# column, and doing so explicitly avoids relying on a silent dtype upcast.
df['user_id'] = df['user_id'].astype(object)

# Loop through a sample of the dataframe
# The sample holds random rows and only 10% of the dataframe.
# Only the row labels are needed; the values are read from df itself.
for index in df.sample(frac=0.1).index:
    # Add the user_id to the beginning of the user_created_at column
    combined = str(df.loc[index, 'user_id']) + ' ' + df.loc[index, 'user_created_at']
    df.loc[index, 'user_created_at'] = combined
    # Leave the user_id blank
    df.loc[index, 'user_id'] = ""

#### Swap orderings in the rows (2.2.3.2.1.3 in the taxonomy), Changing order

In [7]:
# Reorder the space-separated fields of created_at for a random 10% of the rows.
for index, timestamp in df.sample(frac=0.1)['created_at'].items():
    fields = timestamp.split(' ')
    # Fields 4-6 move to the front and fields 1-3 follow them; anything after
    # the sixth field keeps its position.
    reordered = [fields[i] for i in (3, 4, 5, 0, 1, 2)] + fields[6:]
    df.loc[index, 'created_at'] = ' '.join(reordered)

#### Inserting data into wrong fields (2.1.2.1.2.1 in the taxonomy)

In [8]:
# Exchange the source and created_at entries for a random 10% of the rows.
for index in df.sample(frac=0.1).index:
    # Read both values before writing either one back.
    source_value = df.loc[index, 'source']
    created_value = df.loc[index, 'created_at']
    df.loc[index, 'source'] = created_value
    df.loc[index, 'created_at'] = source_value

#### Inserting duplicated data (2.1.1.1.1.3 in the taxonomy)

In [9]:
# Overwrite the user_id of a random 10% of the rows with a neighbour's user_id.
for index in df.sample(frac=0.1).index:
    # Rows 0, 1 and 2 borrow from the row two positions below; every other
    # row borrows from the row two positions above.
    # NOTE: the donor value is read from df as it currently stands, so it may
    # already have been changed by an earlier step.
    donor = index + 2 if index in (0, 1, 2) else index - 2
    df.loc[index, 'user_id'] = df.loc[donor, 'user_id']

#### Inserting misspellings (2.1.2.1.1.2 in the taxonomy), Adding misspelled data

In [10]:
# Misspelled replacement for every weekday abbreviation.
weekday_typos = {
    "Mon": "Man",
    "Tue": "Tur",
    "Wed": "Wid",
    "Thu": "Tha",
    "Fri": "Fra",
    "Sat": "Set",
    "Sun": "Sen",
}

# Misspell the weekday of user_created_at for a random 10% of the rows.
for index in df.sample(frac=0.1).index:
    tokens = df.loc[index, 'user_created_at'].split()
    # A normal date time has 6 tokens with the weekday first; any other
    # length is treated as a date time prefixed with a user_id, which puts
    # the weekday in second position.
    slot = 0 if len(tokens) == 6 else 1
    # Tokens that are not a known weekday abbreviation are left as they are.
    tokens[slot] = weekday_typos.get(tokens[slot], tokens[slot])

    df.loc[index, 'user_created_at'] = ' '.join(tokens)

#### Inserting special characters (2.2.3.2.1.2 in the taxonomy), Adding special characters to the data

In [11]:
# Break a string into its individual characters.
def split(word):
    """Return the characters of `word` as a list, in their original order."""
    return [character for character in word]

# Cast to object up front: the values written below are strings containing
# "-", and doing so explicitly avoids relying on a silent dtype upcast.
df['user_favourites_count'] = df['user_favourites_count'].astype(object)

# Loop through a sample of the dataframe
# The sample holds random rows and only 10% of the dataframe.
for index, row in df.sample(frac = 0.1).iterrows():
    s = str(row['user_favourites_count'])
    # Single-character values have no middle and are left untouched.
    if len(s) > 1:
        arr = split(s)

        # Insert the special character near the middle: integer division puts
        # it exactly in the middle for even lengths and directly before the
        # central character for odd lengths.
        arr.insert(len(arr) // 2, "-")

        df.loc[index, 'user_favourites_count'] = "".join(arr)

#### Inserting inconsistent data (not in the taxonomy)

In [12]:
# Alternative spellings that replace the language code.
lang_spellings = ["ENGLISH", "English", "eng", "english"]

# The sample holds random rows and only 10% of the dataframe.
sample = df.sample(frac=0.1)

for count, (index, row) in enumerate(sample.iterrows()):
    # Split the sample into 4 consecutive groups of (near-)equal size, derived
    # from the actual sample length, so that every sampled row receives
    # exactly one of the spellings regardless of how many rows were drawn.
    group = count * len(lang_spellings) // len(sample)

    # Rows with an empty lang value are left as they are.
    if row['lang']:
        df.loc[index, 'lang'] = lang_spellings[group]

#### Save the dataframe to an Excel file.

In [13]:
# Save the modified dataframe as an Excel workbook.
# Change file location if needed.
# The path is a raw string so that the backslash is taken literally instead
# of being parsed as the (invalid) escape sequence "\D".
df.to_excel(r"D:\DirtyTweetsDataset.xlsx")