# Preprocessing
### Author: Prof. Sandro Camargo <github.com/sandrocamargo>
### Data Mining Course <https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213>
#### This script uses the basic concepts of preprocessing.
##### In this script, we use the Horse Colic dataset: https://archive.ics.uci.edu/dataset/47/horse+colic


In [None]:
# Download and unzip the dataset
!wget -c https://archive.ics.uci.edu/static/public/47/horse+colic.zip
!unzip -u horse+colic.zip

Importing the dataset without setting the import parameters properly.

Inspecting the result with `data.head()` shows that the dataset was loaded into a single column.

In [None]:
# Naive load: read the raw file with pandas' default parsing options and
# preview the result.
import pandas as pd

data = pd.read_csv(filepath_or_buffer='horse-colic.data')
data.head(n=5)  # first 5 rows

In [None]:
# Reload the file as space-separated text with no header row, keeping the
# first 28 columns and reading "?" as a missing value; then name the columns.
column_names = [
    'Surgery',
    'Age',
    'Hospital Number',
    'Rectal Temperature',
    'Pulse',
    'Respiratory Rate',
    'Temperature of Extremities',
    'Peripheral Pulse',
    'Mucous Membranes',
    'Capillary Refill Time',
    'Pain',
    'Peristalsis',
    'Abdominal Distension',
    'Nasogastric Tube',
    'Nasogastric Reflux',
    'Nasogastric Reflux PH',
    'Rectal Examination Feces',
    'Abdomen',
    'Packed Cell Volume',
    'Total Protein',
    'Abdominocentesis Appearance',
    'Abdominocentesis Total Protein',
    'Outcome',
    'Surgical Lesion',
    'Lesion Site',
    'Lesion Type',
    'Lesion Subtype',
    'CP Data',
]

data = pd.read_csv(
    'horse-colic.data',
    sep=" ",
    header=None,
    usecols=list(range(28)),
    na_values="?",
)
data.columns = column_names
data.head()  # preview the first 5 rows

# Getting to know your data

In [None]:
# Print summary statistics for the loaded dataset.
summary_stats = data.describe()
print(summary_stats)

# Transformation

In [None]:
# Transforming numerical to class
# Recode the 'Surgery' column from its numeric codes to the text labels
# 'Yes' (code 1) and 'No' (code 2), printing the class counts before and after.
print(data['Surgery'].value_counts())

# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data['Surgery']`: an in-place call on a
# selected column is chained assignment, which pandas warns about and which
# leaves `data` unchanged once Copy-on-Write is active.
# NOTE(review): with the classic object dtype, astype(str) turns missing
# entries into the literal text 'nan', which then counts as a third
# category - confirm whether that is intended.
data['Surgery'] = (
    data['Surgery']
    .astype(str)
    .replace({'1.0': 'Yes', '2.0': 'No'})
)
print(data['Surgery'].value_counts())


In [None]:
!pip install ydata_profiling
from ydata_profiling import ProfileReport, compare
report = ProfileReport(df=data, title="Horse Colic data profile")
report.to_file("profile_report.html")
report.to_notebook_iframe()

In [None]:
# Box plot of 'Pulse' grouped by 'Surgery', followed by the summary
# statistics of 'Pulse'.
# seaborn is imported here because no earlier cell brings `sns` into scope;
# without it this cell raises NameError when the notebook is run top to bottom.
import seaborn as sns

sns.boxplot(x='Surgery', y='Pulse', data=data)
print(data['Pulse'].describe())

In [None]:
# Box plot of 'Respiratory Rate' grouped by 'Surgery', followed by the
# summary statistics of 'Respiratory Rate'.
# seaborn is imported here because no earlier cell brings `sns` into scope;
# without it this cell raises NameError when the notebook is run top to bottom.
import seaborn as sns

sns.boxplot(x='Surgery', y='Respiratory Rate', data=data)
print(data['Respiratory Rate'].describe())

# Values Imputation

In [None]:
# Drop every row that contains at least one missing value and compare the
# frame's dimensions before and after.
original_shape = data.shape
print("Original DataFrame Dimensions:", original_shape)

cleaned_data = data.dropna(axis=0, how='any')
cleaned_shape = cleaned_data.shape
print("Cleaned DataFrame Dimensions:", cleaned_shape)

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: replace the missing 'Pulse' values with the column mean
# and print the imputed column. `data` itself is not modified.
print("Original DataFrame Dimensions:", data.shape)
print(data['Pulse'].describe())

pulse_column = data[['Pulse']]  # double brackets keep a 2-D, single-column frame
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imp_mean.fit_transform(pulse_column))

In [None]:
# Median imputation: replace the missing 'Respiratory Rate' values with the
# column median and print the imputed column. `data` itself is not modified.
# The imputer keeps the variable name `imp_mean` used by the other
# imputation cells, even though the strategy here is the median.
print("Original DataFrame Dimensions:", data.shape)
print(data['Respiratory Rate'].describe())

respiratory_rate_column = data[['Respiratory Rate']]
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
print(imp_mean.fit_transform(respiratory_rate_column))

In [None]:
# Mode imputation: replace the missing 'Age' values with the most frequent
# value of the column and print the imputed column. `data` is not modified.
# The imputer keeps the variable name `imp_mean` used by the other
# imputation cells, even though the strategy here is the most frequent value.
print("Original DataFrame Dimensions:", data.shape)
print(data['Age'].value_counts())

age_column = data[['Age']]
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
print(imp_mean.fit_transform(age_column))

In [None]:
# Getting to know your data
# Pairwise scatter-plot matrix of the dataset, coloured and marked by
# 'Surgery', saved to a PDF file.
import seaborn as sns
import matplotlib.pyplot as plt

# Build one marker per 'Surgery' category from a small pool, cycling if there
# are more categories than pool entries. A fixed three-item list only matches
# when the column has exactly three categories; a mismatched list triggers a
# warning or an error depending on the seaborn version. With three categories
# this produces the same ["o", "s", "D"] list as before.
marker_pool = ["o", "s", "D"]
n_levels = data['Surgery'].nunique()
markers = [marker_pool[i % len(marker_pool)] for i in range(n_levels)]

sns.pairplot(data, hue='Surgery', markers=markers)
plt.savefig("horse-colic-pairplot.pdf")