# In [None]:  (Jupyter notebook cell marker left over from the export)
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

# Load the "application_train.csv" dataset
def load_application_train():
    """Read "application_train.csv" (relative path) and return it as a DataFrame."""
    return pd.read_csv("application_train.csv")

# Load the "titanic.csv" dataset
def load():
    """Read "titanic.csv" (relative path) and return it as a DataFrame."""
    return pd.read_csv("titanic.csv")

# Load the larger dataset ("application_train.csv") and print its shape.
# The trailing comment records the output the original author observed.
df_application_train = load_application_train()
print(df_application_train.shape)  # (307511, 122)

# Load the smaller dataset ("titanic.csv") and print its shape.
df_titanic = load()
print(df_titanic.shape)  # (891, 12)

# Draw a boxplot of the Titanic 'Age' column to inspect it visually for
# outliers, then display the figure.
sns.boxplot(x=df_titanic["Age"])
plt.show()

# Function to calculate outlier thresholds
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the ``(low_limit, up_limit)`` outlier fences for one column.

    The fences lie 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of ``dataframe[col_name]``.

    Args:
        dataframe: Object indexable by ``col_name`` whose result exposes
            ``.quantile`` (a pandas DataFrame in this file).
        col_name: Name of the column to analyse.
        q1: Lower quantile, 0.25 by default.
        q3: Upper quantile, 0.75 by default.

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_q, upper_q = column.quantile(q1), column.quantile(q3)
    # Distance from each quantile to its fence.
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker

# Outlier thresholds for 'Age'
print(outlier_thresholds(df_titanic, "Age"))  # (-6.6875, 64.8125)

# Outlier thresholds for 'Fare', then a preview of the rows that fall
# outside them
low, up = outlier_thresholds(df_titanic, "Fare")
fare = df_titanic["Fare"]
fare_outlier_rows = df_titanic[(fare > up) | (fare < low)]
print(fare_outlier_rows.head())

# Function to check for outliers
def check_outlier(dataframe, col_name):
    """Report whether ``dataframe[col_name]`` has any value outside its fences.

    The fences come from ``outlier_thresholds(dataframe, col_name)`` with its
    default quantiles.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Name of the column to test.

    Returns:
        ``True`` if at least one value is above the upper limit or below the
        lower limit, otherwise ``False``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Reduce the boolean mask itself. Calling ``.any(axis=None)`` on the
    # filtered frame would instead test the truthiness of the cell values in
    # the outlier rows, which is False when those rows hold only zeros/NaN.
    outlier_mask = (column > up_limit) | (column < low_limit)
    return bool(outlier_mask.any())

# Both columns are reported as containing outliers (each prints True).
for column_name in ("Age", "Fare"):
    print(check_outlier(df_titanic, column_name))

# Function to grab column names
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical and numerical groups.

    Classification rules:
      * object-dtype column with more than ``car_th`` unique values ->
        categorical but cardinal (``cat_but_car``);
      * any other object-dtype column -> categorical;
      * non-object column with fewer than ``cat_th`` unique values ->
        numeric-looking but treated as categorical (``num_but_cat``);
      * every remaining non-object column -> numerical.

    A short summary of the counts is printed.

    Args:
        dataframe: DataFrame whose columns are classified.
        cat_th: Unique-value count below which a non-object column is
            treated as categorical.
        car_th: Unique-value count above which an object column is
            treated as cardinal.

    Returns:
        Tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
        ``cat_cols`` lists the object columns first, then the
        ``num_but_cat`` columns, each in DataFrame column order.
    """
    object_cats = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    for name in dataframe.columns:
        series = dataframe[name]
        if series.dtypes == "O":
            if series.nunique() > car_th:
                cat_but_car.append(name)
            else:
                object_cats.append(name)
        elif series.nunique() < cat_th:
            num_but_cat.append(name)
        else:
            num_cols.append(name)

    cat_cols = object_cats + num_but_cat

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

cat_cols, num_cols, cat_but_car = grab_col_names(df_titanic)

# Drop the identifier column. Compare by equality: `col not in "PassengerId"`
# is a substring test on the string, so it would also discard columns named
# e.g. "Id" or "Pass" and would raise TypeError for non-string labels.
num_cols = [col for col in num_cols if col != "PassengerId"]
print(num_cols)  # ['Age', 'Fare']

# Check outliers in numerical columns
for col in num_cols:
    print(col, check_outlier(df_titanic, col))

# Check outliers in the "application_train.csv" dataset
dff = load_application_train()
cat_cols, num_cols, cat_but_car = grab_col_names(dff)
# The row identifier is not a feature; list.remove raises ValueError if the
# column is absent from num_cols.
num_cols.remove('SK_ID_CURR')

# Check which numerical columns have outliers
for col in num_cols:
    print(col, check_outlier(dff, col))

# Function to grab outliers
def grab_outliers(dataframe, col_name, outlier_index=False, f=5):
    """Print the rows whose ``col_name`` value lies outside its outlier fences.

    The fences come from ``outlier_thresholds(dataframe, col_name)``. When
    more than 10 rows qualify only the first ``f`` are printed; otherwise
    all of them are printed.

    Args:
        dataframe: DataFrame to inspect.
        col_name: Name of the column to test.
        outlier_index: When truthy, return the index of the outlier rows.
        f: Number of rows shown when more than 10 outlier rows exist.

    Returns:
        The index of the outlier rows if ``outlier_index`` is truthy,
        otherwise ``None``.
    """
    low, up = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outliers = dataframe[(column < low) | (column > up)]

    preview = outliers.head(f) if outliers.shape[0] > 10 else outliers
    print(preview)

    if not outlier_index:
        return None
    return outliers.index

# Collect the index labels of the 'Age' outlier rows (the rows themselves are
# printed by grab_outliers) and show them.
age_index = grab_outliers(df_titanic, "Age", outlier_index=True)
print(age_index)
