## L18 - Example of EDA ##

### 1. Sorting

In [1]:
import pandas as pd
from IPython.display import display

In [None]:
# Small demo frame. Column "b" is listed before "a", so sorting the
# columns alphabetically will visibly reorder them.
d = pd.DataFrame(
    data={
        "b": [4, 7, -3, 2],
        "a": [0, 1, 0, 1],
    }
)
display(d)

In [None]:
line_break = "\n----"
# Sort by index labels: first the rows (axis 0), then the columns (axis 1)
display(
    d.sort_index(axis=0),
    d.sort_index(axis=1),
)

In [None]:
# Sort rows by the values of one column (the sort key).
# sort_values returns a new, sorted DataFrame; it does not sort `d` in place.
display(d.sort_values(by="b"))
# The original frame keeps its row order
display(d)

### 2. An example of EDA

In [2]:
# Download the Titanic survival CSV straight from its public URL into a DataFrame
titanic = pd.read_csv(
    filepath_or_buffer="https://vincentarelbundock.github.io/Rdatasets/csv/carData/TitanicSurvival.csv"
)

In [None]:
# Structure of the data: print the (rows, columns) tuple ...
n_rows, n_cols = titanic.shape
print((n_rows, n_cols))
# ... then render the frame itself
display(titanic)

In [None]:
# Show floats with two decimal places in all subsequent pandas output
pd.options.display.precision = 2
# Preview the first ten rows
titanic.head(10)

In [None]:
# Give the columns short, tidy names.
# The assignment is positional: it needs exactly one name per existing column
# (assumes the CSV has these five columns in this order; TODO confirm).
titanic.columns = [
    "name",
    "survived",
    "sex",
    "age",
    "class",
]
# Check the result on the last five rows
titanic.tail(5)

In [None]:
# Recode the "survived" column from text to integers: 'yes' -> 1, 'no' -> 0.
# Nested-dict form of replace: {column: {old_value: new_value}}
survived_codes = {'yes': 1, 'no': 0}
titanic = titanic.replace(to_replace={'survived': survived_codes})

# Alternative, list-like form (not restricted to a single column):
#titanic = titanic.replace(['yes','no'], [1,0])
titanic.head(5)

In [None]:
# Summary statistics; with include=None (the default) only numeric columns are summarised
titanic.describe(include=None)

In [None]:
# Inspect the age column on its own
titanic["age"]

In [None]:
# Rows whose age is missing (NaN), selected with a boolean mask
titanic.loc[titanic["age"].isna()]

In [None]:
# Quicker way to get counts per column, does not include NaN.
# Wrapped in display(): only the LAST expression of a cell is rendered
# automatically, so a bare `titanic.count()` here would show nothing.
display(titanic.count())

# Confirm math: rows with and without an age must add up to the total row count
n_total = len(titanic)
n_missing = int(titanic.age.isna().sum())
n_present = int(titanic.age.notna().sum())
print(f"Size: {n_total}, "
      f"NA: {n_missing}, "
      f"Not-NA: {n_present}")
print(n_missing + n_present == n_total)

In [None]:
# Line-continuation inside parentheses: adjacent string literals are joined
# into one string with no separator, so no backslash or "+" is needed.
# https://peps.python.org/pep-0008/#maximum-line-length
print("another"
      "long"
      "line")
# Equivalent to (both calls print "anotherlongline"):
print("another" "long" "line")

In [None]:
# Keep only complete rows: a row is dropped if any of its values is missing
titanic = titanic.dropna(axis="index", how="any")
titanic

In [None]:
# Summary statistics again (numeric columns only), now on the current state of the frame
titanic.describe(include=None)

In [None]:
# Smallest age scaled by 12 (assumes age is recorded in years, giving months; TODO confirm)
12 * titanic["age"].min()

In [None]:
# How many months old?
# Convert to months first and round last: rounding the age before multiplying
# by 12 would multiply the rounding error by 12 as well
# (e.g. 0.1667 -> 0.17 -> 2.04 instead of 2.0).
youngest_months = round(titanic.age.min() * 12, 2)
print(f"Baby age: {youngest_months} months")

### Plotting
To generate basic plots, we first need to install matplotlib.

In [16]:
# Run once if matplotlib is not installed yet:
# %pip install matplotlib

In [None]:
# Exploratory plot: 8-bin histograms of the frame's numeric columns, gridlines off
h = titanic.hist(
    grid=False,
    figsize=(15, 5),
    bins=8,
)

In [None]:
# Histogram of the age column alone, using 8 bins
h = titanic["age"].hist(bins=8)

In [None]:
# Survivors only. Casting to bool works as a row mask because survived is
# coded as 0 = Not survive, 1 = Survive
is_survivor = titanic["survived"].astype(bool)
display(titanic.loc[is_survivor])

# Boolean indexing with an explicit comparison, then summary statistics of the survivors
titanic.loc[titanic["survived"] == 1].describe()

In [None]:
# Summary statistics of the non-survivors, selected with a boolean mask
titanic.loc[titanic["survived"] == 0].describe()

In [None]:
#round(titanic[titanic.survived == 1].age.mean(), 4)
#titanic[titanic.survived == 'yes'].age.mean()

In [None]:
# Focusing on age: mean age of each outcome group, rounded to two decimals
mean_age_survived = round(titanic.loc[titanic["survived"] == 1, "age"].mean(), 2)
mean_age_not_survived = round(titanic.loc[titanic["survived"] == 0, "age"].mean(), 2)
print(f"Survived: {mean_age_survived}\nNot Survived: {mean_age_not_survived}")

In [None]:
# Quick look at the first five rows
titanic.head(5)

In [20]:
#%%
#import matplotlib.pyplot as plt
#import matplotlib as mpl
#import numpy as np

#plt.hist(titanic.age)
#plt.show() 

### 3. Exploring variables

In [None]:
# Examine class with boolean indexing: one sub-frame per (outcome, class) pair.
# The masks are built once and combined with & (element-wise AND).
is_survivor = titanic.survived == 1
is_non_survivor = titanic.survived == 0
is_1st = titanic["class"] == '1st'
is_3rd = titanic["class"] == '3rd'

status = {"surv_1st": titanic[is_survivor & is_1st],
          "nosurv_1st": titanic[is_non_survivor & is_1st],
          "surv_3rd": titanic[is_survivor & is_3rd],
          "nosurv_3rd": titanic[is_non_survivor & is_3rd]}

# Print results: len() of each sub-frame is its number of passengers.
# Keys are indexed directly so a misspelt key fails with a clear KeyError
# (dict.get would return None and fail later with an AttributeError).
print(f"1st survivors, non-survivors: {len(status['surv_1st'])}, {len(status['nosurv_1st'])}\n"
      f"3rd survivors, non-survivors: {len(status['surv_3rd'])}, {len(status['nosurv_3rd'])}")


### 4. Grouped operations

In [None]:
# Map the survival codes back to text labels for readability: 1 -> 'yes', 0 -> 'no'
survived_labels = {1: 'yes', 0: 'no'}
titanic = titanic.replace(to_replace={'survived': survived_labels})
display(titanic)

In [None]:
# Grouped operations: count passengers per (class, sex, survived) combination,
# then pivot the innermost key (survived) into columns
group_keys = ['class', 'sex', 'survived']
grouped_titanic = (
    titanic
    .groupby(group_keys)
    .size()
    .unstack(level=-1)
)
grouped_titanic

### 5. Plotting

In [19]:
# Seaborn bar-plot
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Passenger counts per sex, with one bar colour per survival outcome
p = (
    sns.countplot(x="sex", hue="survived", data=titanic)
    .set(title='Survival count by Sex')
)

In [None]:
# Optional: switch the colour palette before plotting
#sns.set_palette("Paired")
# Passenger counts per class, with one bar colour per survival outcome
p = (
    sns.countplot(x="class", hue="survived", data=titanic)
    .set(title='Survival count by Class')
)

In [None]:
# Create grouped frame for plotting: classes as rows, survival outcomes as columns
grouped_cs_titanic = titanic.groupby(['class','survived']).size().unstack()
# display() is required here: a bare expression is only rendered when it is
# the last line of a cell, which this one is not.
display(grouped_cs_titanic)

# Matplotlib (through the pandas plotting wrapper): side-by-side bars per class
titanic_barplot = grouped_cs_titanic.plot.bar(stacked=False, color=["lightblue", "lightpink"])
plt.ylabel("Counts")
plt.xlabel('Passenger class')
plt.xticks(rotation=0)
# plt.show() renders the current figure(s); it takes no figure/axes argument
# (its only parameter is the keyword `block`), so the Axes must not be passed in.
plt.show()

In [None]:
# Count table: passenger class as rows, survival outcome as columns
(
    titanic
    .groupby(['class', 'survived'])
    .size()
    .unstack(level=-1)
)

In [None]:
# Some basic stats - Not covered in class (not examinable)
# Explanatory test
from scipy import stats

# Split survived and not survived.
# By this point `survived` has been mapped back from 1/0 to 'yes'/'no', so
# comparing against 1/0 alone would select no rows and the t-test would return
# NaN. Both encodings are accepted so the split works whichever one is in use.
surv = titanic.loc[titanic.survived.isin([1, 'yes'])]
no_surv = titanic.loc[titanic.survived.isin([0, 'no'])]

# T-test to determine if the means of each group are different?
stats.ttest_ind(surv.age, no_surv.age)