In [None]:
from typing import Tuple, List, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tabulate import tabulate

# Linear regression

In [None]:
# Toy dataset stored as column vectors of shape (13, 1): one row per sample.
X = np.array(
    [147, 150, 153, 158, 163, 165, 168, 170, 173, 175, 178, 180, 183]
).reshape(-1, 1)  # height (cm)
y = np.array(
    [49, 50, 51, 54, 58, 59, 60, 62, 63, 64, 66, 67, 68]
).reshape(-1, 1)  # weight (kg)

# Scatter the raw samples as red dots on fixed axis limits.
plt.plot(X, y, 'ro')
plt.axis([140, 190, 45, 75])
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()

In [None]:
# Design matrix: a leading column of ones lets the intercept be learned as w[0].
one = np.ones((X.shape[0], 1))
Xbar = np.hstack((one, X))

# Normal equations (Xbar^T Xbar) w = Xbar^T y, solved with the pseudo-inverse.
A = Xbar.T @ Xbar
b = Xbar.T @ y
w = np.linalg.pinv(A) @ b
print('w = ', w)

# w is a (2, 1) column: intercept first, then slope.
w_0, w_1 = w[0, 0], w[1, 0]
print(w_0, w_1)

# Two end points are enough to draw a straight line.
x0 = np.linspace(145, 185, num=2)
y0 = w_0 + w_1 * x0

# Samples (red dots) with the fitted line on top.
plt.plot(X.T, y.T, 'ro')
plt.plot(x0, y0)
plt.axis([140, 190, 45, 75])
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()

In [None]:
from sklearn import datasets, linear_model

# Cross-check the closed-form solution with scikit-learn.
# Xbar already contains the column of ones, so the bias is learned as an
# ordinary coefficient and the estimator's own intercept is switched off.
regr = linear_model.LinearRegression(fit_intercept=False)
regr.fit(Xbar, y)

# The printed message is Vietnamese for "Solution found by scikit-learn".
print(u'Nghiệm tìm được bằng scikit-learn  : ', regr.coef_)

# coef_ is indexed as [target, feature]: feature 0 is the ones column.
w_0, w_1 = regr.coef_[0, 0], regr.coef_[0, 1]
print(w_0, w_1)

# Two end points are enough to draw a straight line.
x0 = np.linspace(145, 185, num=2)
y0 = w_0 + w_1 * x0

# Samples (red dots) with the fitted line on top.
plt.plot(X.T, y.T, 'ro')
plt.plot(x0, y0)
plt.axis([140, 190, 45, 75])
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Same samples as the first dataset plus one extra point (150 cm, 90 kg) whose
# weight is far above all the others.
# NOTE(review): presumably added to show how one extreme point shifts the
# least-squares line — confirm.
X = np.array(
    [147, 150, 153, 158, 163, 165, 168, 170, 173, 175, 178, 180, 183, 150]
).reshape(-1, 1)  # height (cm)
y = np.array(
    [49, 50, 51, 54, 58, 59, 60, 62, 63, 64, 66, 67, 68, 90]
).reshape(-1, 1)  # weight (kg)

# Design matrix: a leading column of ones lets the intercept be learned as w[0].
one = np.ones((X.shape[0], 1))
Xbar = np.hstack((one, X))

# Normal equations (Xbar^T Xbar) w = Xbar^T y, solved with the pseudo-inverse.
A = Xbar.T @ Xbar
b = Xbar.T @ y
w = np.linalg.pinv(A) @ b
print('w = ', w)

# w is a (2, 1) column: intercept first, then slope.
w_0, w_1 = w[0, 0], w[1, 0]
x0 = np.linspace(145, 185, num=2)
y0 = w_0 + w_1 * x0

# Samples (red dots) with the fitted line; the y-axis is extended to 95 so the
# 90 kg point is visible.
plt.plot(X, y, 'ro')
plt.plot(x0, y0)
plt.axis([140, 190, 45, 95])
plt.xlabel('Height (cm)')
plt.ylabel('Weight (kg)')
plt.show()


# Logistic regression

## Preprocess data

In [None]:
# Read both CSV files from the notebook's working directory.
# NOTE(review): the file names suggest a labelled training set and a test set
# without the class column — confirm.
df_train = pd.read_csv(filepath_or_buffer='./Train_samsung.csv')
df_test = pd.read_csv(filepath_or_buffer="./Test_samsung_noclass.csv")

# Column dtypes pandas inferred for the training frame.
df_train.dtypes

In [None]:
def print_df(dataframe: pd.DataFrame):
    """Print ``dataframe`` to stdout as a psql-style text table.

    The table is rendered by ``tabulate`` with ``headers='keys'``.
    Nothing is returned.
    """
    rendered = tabulate(dataframe, headers='keys', tablefmt='psql')
    print(rendered)

In [None]:
# Show the loaded training frame with the notebook's rich display; the
# tabulate-based text rendering is kept below as a commented-out alternative.
# print_df(dataframe=df_train)
display(df_train)

In [None]:
def check_Nan_values(df: pd.DataFrame):
    """Print every column of ``df`` that has missing values, with its count.

    One line is printed per affected column, in column order, as
    ``<column> <count>``. Columns without missing values print nothing, and
    the function returns nothing.
    """
    null_counts = df.isnull().sum()
    for column, count in null_counts[null_counts > 0].items():
        print(column, count)

In [None]:
def numberic(dataframe: pd.DataFrame):
    """Return a copy of ``dataframe`` with object-dtype columns label-encoded.

    Each column whose dtype is ``object`` is replaced by the output of a fresh
    ``LabelEncoder`` fitted on that column alone. All other columns are left
    as they are, and the input frame is not modified.
    """
    encoded = dataframe.copy()
    object_columns = [
        name for name in encoded.keys() if encoded[name].dtype == object
    ]
    for name in object_columns:
        # A new encoder per column keeps the code spaces independent.
        encoded[name] = LabelEncoder().fit_transform(encoded[name])
    return encoded

In [None]:
def normailize(dataframe: pd.DataFrame):
    """Min-max scale every non-object column to [0, 1], rounded to 4 decimals.

    Works on a copy, so ``dataframe`` itself is not modified. Columns with
    ``object`` dtype are passed through unchanged.

    A column whose minimum equals its maximum has a zero range. Dividing by
    that range would turn the whole column into NaN, so such a column is
    mapped to 0.0 instead.

    Args:
        dataframe: Frame to scale.

    Returns:
        A new DataFrame with the same columns and index as the input.
    """
    df = dataframe.copy()
    for key in df.keys():
        if df[key].dtype != object:
            # Descriptive local names avoid shadowing the built-in min()/max().
            lowest = df[key].min()
            span = df[key].max() - lowest
            # Zero range: divide by 1 so the constant column becomes 0.0
            # rather than 0/0 = NaN.
            divisor = span if span != 0 else 1
            df[key] = ((df[key] - lowest) / divisor).round(decimals=4)
    return df

In [None]:
# List the training columns that contain missing values, with their counts.
check_Nan_values(df=df_train)

In [None]:
# Min-max scale the non-object columns of the training frame.
df_train = normailize(dataframe=df_train)

In [None]:
# Label-encode the object-dtype columns so the whole frame is numeric.
df_train = numberic(dataframe=df_train)
# print_df(dataframe=df_train)
# Display the frame that was just encoded. `data_train` is only created in a
# later cell, so referencing it here raised NameError on a fresh kernel.
display(df_train)

In [None]:
# Work on a copy so the cleaning steps below leave df_train unchanged.
data_train = df_train.copy()

In [None]:
# Summary statistics of the working copy before outlier filtering.
data_train.describe()

In [None]:
def iqr_outliers(dataframe: pd.DataFrame, label: str = 'Class',
                 factor: float = 1.5):
    """Drop rows lying outside the IQR fences of any non-label column.

    Columns are processed in order. For each one the quartiles are computed on
    the rows that survived the previous columns, and rows strictly below
    ``q1 - factor * iqr`` or strictly above ``q3 + factor * iqr`` are removed.
    A column whose two fences coincide (zero IQR) is not filtered. Rows with a
    missing value in the column are kept. The fences of every processed column
    are printed.

    Rows are removed with a positional boolean mask, so frames whose index
    contains repeated labels lose only the offending rows, not every row that
    shares a label with an outlier.

    Args:
        dataframe: Input frame; it is copied, never modified.
        label: Column excluded from filtering (defaults to ``'Class'``).
        factor: Multiplier applied to the IQR to place the fences
            (defaults to ``1.5``).

    Returns:
        A new DataFrame containing only the rows that passed every filter.
    """
    df = dataframe.copy()
    for key in df.keys():
        if key == label:
            continue
        q1 = df[key].quantile(0.25)
        q3 = df[key].quantile(0.75)
        iqr = q3 - q1
        lower_tail = q1 - factor * iqr
        upper_tail = q3 + factor * iqr
        if lower_tail != upper_tail:
            outside = (df[key] > upper_tail) | (df[key] < lower_tail)
            # Positional mask: independent of index labels, so duplicate
            # labels cannot cause extra rows to be dropped.
            df = df[(~outside).to_numpy()]
        print(key)
        print(f"Lower_tail: {lower_tail}")
        print(f"Upper_tail: {upper_tail}")
        print("\n")
    return df

In [None]:
# Remove rows outside the IQR fences; the fences of each column are printed.
data_train = iqr_outliers(dataframe=data_train)

In [None]:
# Inspect the frame that remains after outlier filtering.
# print_df(data_train)
display(data_train)

In [None]:
# Mean, median and standard deviation of each column, one small table per column.
summary_stats = ['mean', 'median', 'std']
for key in data_train.columns:
    display(data_train[key].agg(summary_stats, axis='rows'))

In [None]:
def replace_Nan_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in each column with that column's rounded median.

    Works on a copy, so ``dataframe`` itself is not modified. The median is
    rounded to the nearest integer before it is used as the fill value.
    Columns made up entirely of missing values have no median and are left
    unchanged.

    The filled column is assigned back explicitly. Calling
    ``fillna(..., inplace=True)`` on ``df[key]`` acts on a temporary object
    and does not update ``df`` when pandas Copy-on-Write is active.

    Args:
        dataframe: Frame whose missing values should be filled.

    Returns:
        A new DataFrame with the same columns and index as the input.
    """
    df = dataframe.copy()
    for key in df.keys():
        if df[key].isnull().sum() > 0:
            median = df[key].median()
            if pd.isna(median):
                # All values missing: round(NaN) would raise, so skip.
                continue
            df[key] = df[key].fillna(round(median))
    return df

In [None]:
# Fill any remaining missing values with each column's rounded median, then
# show the result.
data_train = replace_Nan_data(dataframe=data_train)
# print_df(dataframe=data_train)
display(data_train)

In [None]:
# Summary statistics after outlier filtering and missing-value filling.
data_train.describe()

In [None]:
# Sanity check: prints nothing when no missing values remain.
check_Nan_values(data_train)

In [None]:
# Annotated heatmap of the pairwise column correlations.
# NOTE(review): this uses df_train (the frame before outlier filtering), not
# data_train — confirm that is intended.
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', linewidths=0.2)

# Resize the current figure before rendering it.
fig = plt.gcf()
fig.set_size_inches(12, 8)
plt.show()

## Classification

### Cross-validation

In [None]:
# Split the cleaned frame into target and features. `labels_train` was never
# defined in this notebook, and passing the whole frame as X would hand the
# target column to the model as a feature.
# NOTE(review): assumes 'Class' is the target column (it is the column the
# outlier filter skips) — confirm.
labels_train = data_train['Class']
features_train = data_train.drop(columns=['Class'])

logit = LogisticRegression()
standard_scaler = StandardScaler()

scoring = ['accuracy', 'f1']

# The scaler is part of the pipeline, so it is fitted on the training part of
# each fold only.
pipeLine = make_pipeline(standard_scaler, logit)
kf = KFold(n_splits=5, shuffle=True, random_state=2)

cv_resutls = cross_validate(
    pipeLine, features_train, labels_train, cv=kf, n_jobs=-1, scoring=scoring)

cv_resutls

In [None]:
# Average of the per-fold test F1 scores.
cv_resutls['test_f1'].mean()