## Missing Values

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the full passenger table from the local Titanic CSV file.
df=pd.read_csv('titanic.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Passengers with no recorded place of embarkation.
embarked_missing = df['Embarked'].isna()
df[embarked_missing]

In [None]:
# Flag passengers without cabin information: 1 = Cabin is missing, 0 = present.
# Built with pandas alone, so this cell does not need an import of its own.
df['cabin_null'] = df['Cabin'].isnull().astype(int)

# Share of passengers with no cabin data (a fraction between 0 and 1).
df['cabin_null'].mean()

In [None]:
# Share of passengers without cabin data, broken down by survival outcome.
df['cabin_null'].groupby(df['Survived']).mean()

### Các kỹ thuật xử lý dữ liệu trống

1. Mean/ Median/Mode replacement
2. Random Sample Imputation
3. Capturing NAN values with a new feature
4. End of Distribution imputation
5. Arbitrary Value Imputation


#### Thay thế bằng Mean/Median/Mode
- Kỹ thuật này giả định rằng dữ liệu trống hoàn toàn ngẫu nhiên
- Thay thế dữ liệu trống bằng Mean/Median/Mode của cột

In [None]:
# Reload the file, keeping only the columns used in the imputation examples.
imputation_columns = ['Age', 'Fare', 'Survived']
df = pd.read_csv('titanic.csv', usecols=imputation_columns)
df.head()

In [None]:
# Fraction of missing values in each column.
df.isna().mean()

In [None]:
# Fill the gaps in Age with the column median; the original column is kept
# so the two can be compared afterwards.
age = df['Age']
median = age.median()
print(median)
df['Age_median'] = age.where(age.notna(), median)
df.head()

In [None]:
# Standard deviation before and after filling the missing values.
for column in ('Age', 'Age_median'):
    print(df[column].std())

In [None]:
# Kernel density estimate of Age before (blue) and after (red) the median fill.
# Both curves are drawn on an explicitly passed Axes instead of relying on
# matplotlib's "current axes" state.
fig, ax = plt.subplots()
df['Age'].plot(kind='kde', color='blue', ax=ax)
df['Age_median'].plot(kind='kde', color='red', ax=ax)
# The legend entries come from the Series names; the trailing semicolon keeps
# the Legend repr out of the cell output.
ax.legend(loc='best');

* 7 kỹ thuật xử lý dữ liệu trống

In [None]:
# Median
median = df['Age'].median()
df['Age_median'] = df['Age'].fillna(median)
pdf(df['Age'], df['Age_median'])

# Mean 
mean = df['Age'].mean()
df['Age_mean'] = df['Age'].fillna(mean)
pdf(df['Age'], df['Age_mean'])

# Mode 
mode = df['Age'].mode().values[0]
df['Age_mode'] = df['Age'].fillna(mode)
pdf(df['Age'], df['Age_mode'])

# Random value
random_samples = df['Age'].dropna().sample(n = df['Age'].isnull().sum(), random_state = 0)
random_samples.index = df['Age'][df['Age'].isnull()].index
# Replace nah values by random values
df['Age_random'] = df['Age']
df.loc[df['Age'].isnull(), 'Age_random']=random_samples
pdf(df['Age'], df['Age_random'])


# End of distribution 
age_end = df['Age'].mean() + 3 * df['Age'].std()
df['Age_end'] = df['Age'].fillna(age_end)
pdf(df['Age'], df['Age_end'])

# Fix Value
df['Age_fix_value'] = df['Age'].fillna(30) 
pdf(df['Age'], df['Age_fix_value']) 

# New feature
df['Age_Nah'] = df['Age']
df.loc[df['Age'].isnull(), 'Age_Nah']=0
df.loc[df['Age']>0, 'Age_Nah']=1

In [None]:
def pdf(old_data, new_data):
    # ve ham mat do xac suat cua cot Age truoc va sau khi dien du lieu trong
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    old_data.plot(kind='kde', color='blue')
    new_data.plot(kind='kde', color='red')
    lines, labels = ax.get_legend_handles_labels()
    ax.legend(lines, labels, loc='best')

* Xử lý ngoại lệ

In [None]:
# Summary statistics of the Age column, as a starting point for outlier handling.
df['Age'].describe()


### Dữ liệu cột 'Fare' có dạng phân bố lệch (skewed) nên ta có:
* Biên trên = 3rd Quartile + 3 × IQR
* Biên dưới = 1st Quartile - 3 × IQR

* IQR (Interquartile Range) = 3rd Quartile - 1st Quartile:
    * 3rd Quartile = Percentile 75
    * 1st Quartile = Percentile 25

In [None]:
# Interquartile range of Fare: 75th percentile minus 25th percentile.
fare_q3 = df['Fare'].quantile(0.75)
fare_q1 = df['Fare'].quantile(0.25)
IQR = fare_q3 - fare_q1
print(IQR)

In [None]:
# Outlier limits for the skewed Fare column: quartile -/+ 3 * IQR.
fare = df['Fare']
upper_boundary = fare.quantile(0.75) + 3 * IQR
lower_boundary = fare.quantile(0.25) - 3 * IQR
for boundary in (upper_boundary, lower_boundary):
    print(boundary)

### Cột Age phân bố chuẩn nên sử dụng công thức tìm cận trên cận dưới như sau:
- Biên trên = GTTB + 3*Độ lệch chuẩn 
- Biên dưới = GTTB - 3*Độ lệch chuẩn

In [None]:
# Outlier limits for the mean-imputed Age column: mean -/+ 3 standard deviations.
# Age-specific names are used so the Fare boundaries computed earlier are not
# partially overwritten.
age_mean = df['Age_mean'].mean()
age_std = df['Age_mean'].std()
age_upper_boundary = age_mean + 3 * age_std
age_lower_boundary = age_mean - 3 * age_std
# Separate print statements: chaining them with commas would build a tuple and
# make the cell echo (None, None, None).
print(age_lower_boundary)
print(age_upper_boundary)
print(age_mean)


In [None]:
# Two independent copies of the frame: `data` gets its outliers capped in the
# next step, `data_non_pr` stays untouched as the baseline.
data, data_non_pr = df.copy(), df.copy()


In [None]:
# Cap extreme values at a fixed upper limit per column.
# NOTE(review): 68 and 100 are presumably the rounded upper boundaries printed
# in the cells above — confirm.
outlier_caps = {'Age_mean': 68, 'Fare': 100}
for column, cap in outlier_caps.items():
    data[column] = data[column].clip(upper=cap)

In [None]:
# Histogram of the capped, mean-imputed ages. `Age_mean` is the column that was
# capped above; the raw `Age` column was left unchanged, so plotting it would
# not show the effect of the outlier handling.
figure = data['Age_mean'].hist(bins=50)
figure.set_title('Age')
figure.set_xlabel('Age')
figure.set_ylabel('Nber of passengers');

In [None]:
# Histogram of the capped fares (50 bins).
fare_capped = data['Fare']
figure = fare_capped.hist(bins=50)
figure.set_title('Fare')
figure.set_xlabel('Fare')
figure.set_ylabel('Nber of passengers')

In [None]:
# Compare a logistic regression trained on the capped frame (`data`) with one
# trained on the untouched copy (`data_non_pr`) over ten train/test splits.
FEATURES = ['Age_mean', 'Fare']
TARGET = 'Survived'

classifier = LogisticRegression()
handle_ex = []      # test accuracy per split, outliers capped
non_handle_ex = []  # test accuracy per split, outliers left as they are
for random_state in range(10):
    # The same seed is used for both frames in each iteration.
    x1_train, x1_test, y1_train, y1_test = train_test_split(
        data[FEATURES], data[TARGET], test_size=0.3, random_state=random_state)
    x2_train, x2_test, y2_train, y2_test = train_test_split(
        data_non_pr[FEATURES], data_non_pr[TARGET], test_size=0.3, random_state=random_state)

    # The single estimator is refitted for each data set; every prediction is
    # taken right after its own fit, before the next fit replaces the model.
    y1_predict = classifier.fit(x1_train, y1_train).predict(x1_test)
    y2_predict = classifier.fit(x2_train, y2_train).predict(x2_test)

    handle_ex.append(accuracy_score(y1_test, y1_predict))
    non_handle_ex.append(accuracy_score(y2_test, y2_predict))

In [None]:
# Peek at the training features of the last split instead of echoing the whole frame.
x1_train.head()


In [None]:
# Peek at the test features of the last split instead of echoing the whole frame.
x1_test.head()

In [None]:
# Missing values remaining in each column of the capped frame.
data.isna().sum()

In [None]:
# Per-split accuracies: first with outliers capped, then without.
for scores in (handle_ex, non_handle_ex):
    print(scores)

In [None]:
# Report the mean accuracy over all collected splits rather than the score of
# whichever split the loop happened to run last.
print("Accuracy (Handle exception): {}".format(sum(handle_ex) / len(handle_ex)))
print("Accuracy (Non handle exception): {}".format(sum(non_handle_ex) / len(non_handle_ex)))