# Data Preprocessing and Understanding

## Set Workspace

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Show full cell contents and every row when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Preprocessing Data

In [None]:
# Mapping from each label value to its short form.
# 'ART' is mapped to itself, so replacing with this dictionary leaves it unchanged.
replace_dict = {
    '[PDF][PDF]': 'PDF',
    '[BOOK][B]': 'BOOK',
    '[HTML][HTML]': 'HTML',
    '[DOC][DOC]': 'DOC',
    '[CITATION][C]': 'CITE',
    'ART': 'ART',
}

# The bracketed codes used in the dataset: every key above except 'ART'
codes = [key for key in replace_dict if key != 'ART']

In [None]:
# Load the horse colic data and preview its first two rows
horse_df = pd.read_csv('data/horse_colic.csv')
horse_df.iloc[:2]

In [None]:
# Get the data shape as a (rows, columns) tuple
horse_df.shape

In [None]:
# Count the missing values in each column
horse_df.isnull().sum()

In [None]:
# Replace every missing value with an empty string
horse_df = horse_df.fillna(value='')

# Confirm that no missing values remain (each column count should be 0)
horse_df.isnull().sum()

In [None]:
# Create a new column to hold the labels; 'ART' is the default for titles
# in which none of the codes is found
horse_df['Label'] = 'ART'

# Populate the Label column with the codes found in the Title column.
# The codes contain square brackets, which a regular expression reads as
# character classes: '[PDF][PDF]' would match any two adjacent letters out of
# P, D, F, while '[BOOK][B]' would never match its own literal text.
# regex=False makes the match literal; na=False treats a missing title as
# "code not present". If a title holds several codes, the last one in `codes` wins.
for c in codes:
    horse_df['Label'] = np.where(
        horse_df['Title'].str.contains(c, regex=False, na=False),
        c,
        horse_df['Label'],
    )

# Check the output
horse_df.head(10)

In [None]:
# Swap each label for its shorter form as defined in replace_dict
horse_df['Label'] = horse_df['Label'].replace(to_replace=replace_dict)
horse_df.iloc[:2]

In [None]:
# Remove the codes from the Title column.
# regex=False: the codes contain square brackets and must be matched literally.
for c in codes:
    horse_df['Title'] = horse_df['Title'].str.replace(c, '', regex=False)
horse_df.iloc[:2]

In [None]:
# Group items by type and count the rows carrying each label.
# NOTE(review): if an expected type (e.g. BOOK) is missing from this output,
# check that the bracketed codes were matched as literal text rather than as
# regular expressions when the Label column was populated.
horse_df.groupby('Label').size()

In [None]:
# Build the AugmTitle column as Title followed by TruncAbstr
# (any existing AugmTitle column is overwritten).
# NOTE(review): no separator is inserted between the two parts - confirm that
# the last word of the title and the first word of the abstract do not get glued.
horse_df['AugmTitle'] = horse_df['Title'].add(horse_df['TruncAbstr'])
horse_df.iloc[:2]

In [None]:
# Remove the newline characters from the AugmTitle column (literal match)
horse_df['AugmTitle'] = horse_df['AugmTitle'].str.replace('\n', '', regex=False)
horse_df.iloc[:2]

## Prepare Titles Data

In [None]:
# Keep only the Title column and expose it under the name 'text'
df_titles = horse_df[['Title']].rename(columns={'Title': 'text'})

# Preview the first two rows
df_titles.iloc[:2]

In [None]:
# Check how many titles contain the word colic.
# The search is for the literal text 'colic', ignoring case, so 'Colic' and
# 'COLIC' are both counted. Searching for the bare fragment 'olic' would also
# count unrelated words such as 'metabolic' or 'policy' and would miss 'COLIC'.
# na=False counts a missing title as "does not contain".
contain_values = df_titles[
    df_titles['text'].str.contains('colic', case=False, regex=False, na=False)
]
len(contain_values)


In [None]:
# Write the titles to a CSV file (the DataFrame index is written as the first column)
df_titles.to_csv(path_or_buf='data/horse_titles.csv')

## Augmented Title Data

In [None]:
# Keep only the AugmTitle column, as a one-column DataFrame
df = horse_df.loc[:, ['AugmTitle']]

In [None]:
# Expose the AugmTitle column under the name 'text' and preview four rows
df = df.rename({'AugmTitle': 'text'}, axis='columns')
df.iloc[:4]

In [None]:
# Check how many augmented titles contain the word colic.
# The search is for the literal text 'colic', ignoring case, so 'Colic' and
# 'COLIC' are both counted. Searching for the bare fragment 'olic' would also
# count unrelated words such as 'metabolic' or 'policy' and would miss 'COLIC'.
# na=False counts a missing text as "does not contain".
contains_list = df[
    df['text'].str.contains('colic', case=False, regex=False, na=False)
]
len(contains_list)

In [None]:
# Inspect the items that do not contain the word colic.
# A single case-insensitive, literal search for 'colic' covers every casing
# ('colic', 'Colic', 'COLIC', ...) and no longer hides rows that merely contain
# the fragment 'olic' inside another word (e.g. 'metabolic').
# na=False keeps rows with a missing text in the "no colic" list.
list_nocolic = df[
    ~df['text'].str.contains('colic', case=False, regex=False, na=False)
]
list_nocolic

In [None]:
len(list_nocolic)

In [None]:
# Write the augmented titles to a CSV file (the DataFrame index is written as the first column)
df.to_csv(path_or_buf='data/horse_augm_titles.csv')

In [None]:
# NOTE(review): disabled draft of a code-stripping step for df. The codes are
# already removed from the Title column earlier with a literal str.replace.
# As written it would fail if enabled: regex=True is passed to apply(), which
# forwards it to the lambda, instead of to str.replace().
#for c in codes:
#    df = df.apply(lambda x: x.str.replace(c,''), regex=True, axis=1)
