# Classification

## 0. Setting

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

### 0.1 Untitled

In [30]:
def train_test_split_df(dataframe, test_size = 0.2, random_state = None):
  if random_state:
    np.random.seed(random_state)

  # Shuffle the indices of the dataframe
  indices = np.random.permutation(len(dataframe))

  # Calculate the number if samples in the test set
  test_samples = int(len(dataframe) * test_size)

  # split indices into train and test indices
  test_indices = indices[:test_samples]
  train_indices = indices[test_samples:]

  # Split dataframe into train and test sets using the indices
  train_set = dataframe.iloc[train_indices]
  test_set = dataframe.iloc[test_indices]

  return train_set, test_set

def train_test_split_np(X, y, test_size = 0.2, random_state = None):
  if random_state:
    np.random.seed(random_state)

  # Shuffle the indices of the dataframe
  indices = np.random.permutation(len(X))

  # Calculate the number if samples in the test set
  test_samples = int(len(X) * test_size)

  # split indices into train and test indices
  test_indices = indices[:test_samples]
  train_indices = indices[test_samples:]

  # Split dataframe into train and test sets using the indices
  train_X, train_y = X[train_indices], y[train_indices]
  test_X, test_y = X[test_indices], y[test_indices]

  return train_X, train_y, test_X, test_y

def mean_squared_error(y_true, y_pred):
  # Calculate squared errors
  squared_errors = [(y_true[i] - y_pred[i]) ** 2 for i in range(len(y_true))]

  # Calculate mean squared error
  mse = sum(squared_errors) / len(y_true)

  return mse

def r_squared(y_true, y_pred):
  # Calculate the mean of the true values
  # 실제 값의 평균을 계산합니다.
  mean_y_true = sum(y_true) / len(y_true)

  # Calculate the total sum of squares (TSS)
  # 총 제곱합 (TSS)을 계산합니다.
  tss = sum([(y - mean_y_true) ** 2 for y in y_true])

  # Calculate the residual sum of squares (RSS)
  # 잔차 제곱합 (RSS)을 계산합니다.
  rss = sum([(y_true[i] - y_pred[i]) ** 2 for i in range(len(y_true))])

  # Calculate R-squared
  # R-제곱을 계산합니다.
  r2 = 1 - (rss / tss)

  return r2

def accuracy(y_true, y_pred):
  return (y_true == y_pred).sum() / len(y_true)

## 1. Data1. diabetes.csv
### 1.0 GetData

In [31]:
diabetes = pd.read_csv("/Users/sunukkim/Desktop/Workspace/ml_dl_2024_spring/lab/data/diabetes.csv")

diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [32]:
train_data, test_data = train_test_split_df(diabetes)

X_train, y_train = np.array(train_data.drop("Outcome", axis = 1)), np.array(train_data.Outcome)
X_test, y_test = np.array(test_data.drop("Outcome", axis = 1)), np.array(test_data.Outcome)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(615, 8) (615,)
(153, 8) (153,)


In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
X_test

array([[ 1.24584229,  1.86101371,  0.12768718, ...,  0.08112338,
         0.7718684 ,  0.24991699],
       [-0.8673772 , -0.05552242,  0.9800901 , ...,  1.77690186,
         0.12391521, -0.60873636],
       [ 0.3401768 , -0.3749451 ,  0.28751273, ...,  0.52525584,
         0.24311414,  2.31068505],
       ...,
       [-0.2636002 ,  1.19022607,  0.34078791, ..., -0.06692077,
         1.17531095, -0.43700569],
       [-0.5654887 , -0.2791183 ,  0.28751273, ...,  0.48488016,
        -0.97332628, -1.03806304],
       [-0.8673772 , -0.43882964,  0.12768718, ..., -0.17458925,
         1.08361946, -0.78046704]])

In [35]:
y_train.sum()

212

In [36]:
X_train.shape

(615, 8)

### 1.1 SVM Train & Performance measure

In [37]:
svc = SVC(kernel='linear', C = 1)
svc.fit(X_train, y_train)

In [38]:
y_pred = svc.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [39]:
y_test

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0])

||Predicted: No|Predicted: Yes|
|---|---|---|
|**Actual: No**|TP|FN|
|**Actual: Yes**|FP|TN|

- TP(True Positive): 긍정예측을 성공
- FN(False Negative): 부정예측을 실패
- FP(False Positive): 긍정예측을 실패
- TN(True Negative): 부정예측을 성공

In [40]:
metrics.confusion_matrix(y_test, y_pred)

array([[87, 10],
       [20, 36]])

- Accuracy: (TP + TN) / (TP + TN + FP + FN)
> 전체 예측한 것 중 올바른 예측을 얼마나 했는지

- recall: TP / (TP + FN)
> 실제로 긍정인 것 중 긍정으로 예측한 비율

- precision: TP / (TP + FP)
> 긍정으로 예측한 것 중 실제로 긍정인 비율

In [41]:
print('Accuracy Score: ')
print(metrics.accuracy_score(y_test, y_pred))
print('Recall Score: ')
print(metrics.recall_score(y_test, y_pred))
print('Precision Score: ')
print(metrics.precision_score(y_test, y_pred))

Accuracy Score: 
0.803921568627451
Recall Score: 
0.6428571428571429
Precision Score: 
0.782608695652174


### 1.3 Logistic Regression

In [42]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

In [43]:
y_pred = lr_clf.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0])

In [44]:
y_test

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0])

In [45]:
metrics.confusion_matrix(y_test, y_pred)

array([[85, 12],
       [20, 36]])

In [46]:
print('Accuracy Score: ')
print(metrics.accuracy_score(y_test, y_pred))
print('Recall Score: ')
print(metrics.recall_score(y_test, y_pred))
print('Precision Score: ')
print(metrics.precision_score(y_test, y_pred))

Accuracy Score: 
0.7908496732026143
Recall Score: 
0.6428571428571429
Precision Score: 
0.75


## 2. Data2. credit_score.csv

### 2.0 GetData

In [47]:
credit_score = pd.read_csv("/Users/sunukkim/Desktop/Workspace/ml_dl_2024_spring/lab/data/credit_score.csv")

credit_score.head()

Unnamed: 0,CUST_ID,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,...,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE,DEFAULT
0,C02COQEVYU,33269,0,532304,0.0,16.0,1.2,1889,945,0.5003,...,0.0,0.0625,High,1,0,0,0,0,444,1
1,C02OZKC0ZF,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,...,0.7692,0.2222,No,1,0,0,1,0,625,0
2,C03FHP2D0A,30917,21642,534864,0.7,17.3,24.7142,1157,860,0.7433,...,1.4286,0.0578,High,1,0,0,1,0,469,1
3,C03PVPPHOY,80657,64526,629125,0.8,7.8,9.7499,6857,3686,0.5376,...,1.25,0.1282,High,1,0,0,1,0,559,0
4,C04J69MUX0,149971,1172498,2399531,7.8182,16.0,2.0465,1978,322,0.1628,...,0.1163,0.0568,High,1,1,1,1,1,473,0


In [48]:
credit_score['CAT_GAMBLING'] = credit_score.CAT_GAMBLING.map({'No':0, 'High':1})
credit_score = credit_score.drop("CUST_ID", axis = 1)

In [49]:
credit_score.head()

Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE,DEFAULT
0,33269,0,532304,0.0,16.0,1.2,1889,945,0.5003,0.0568,...,0.0,0.0625,1.0,1,0,0,0,0,444,1
1,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,0.0754,...,0.7692,0.2222,0.0,1,0,0,1,0,625,0
2,30917,21642,534864,0.7,17.3,24.7142,1157,860,0.7433,0.0374,...,1.4286,0.0578,1.0,1,0,0,1,0,469,1
3,80657,64526,629125,0.8,7.8,9.7499,6857,3686,0.5376,0.085,...,1.25,0.1282,1.0,1,0,0,1,0,559,0
4,149971,1172498,2399531,7.8182,16.0,2.0465,1978,322,0.1628,0.0132,...,0.1163,0.0568,1.0,1,1,1,1,1,473,0


In [50]:
train_data, test_data = train_test_split_df(credit_score)

X_train, y_train = np.array(train_data.drop("DEFAULT", axis = 1)), np.array(train_data.DEFAULT)
X_test, y_test = np.array(test_data.drop("DEFAULT", axis = 1)), np.array(test_data.DEFAULT)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(800, 85) (800,)
(200, 85) (200,)


In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

imputer = SimpleImputer(strategy = 'mean') 

In [59]:
svc = SVC(kernel='linear', C = 1)

pipeline_svm = Pipeline([
    ('imputer', imputer),
    ('svc', svc)
])

pipeline_svm.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_test)
y_pred

In [None]:
metrics.confusion_matrix(y_test, y_pred)

In [None]:
print('Accuracy Score: ')
print(metrics.accuracy_score(y_test, y_pred))
print('Recall Score: ')
print(metrics.recall_score(y_test, y_pred))
print('Precision Score: ')
print(metrics.precision_score(y_test, y_pred))

In [53]:
logistic = LogisticRegression()

pipeline = Pipeline([
    ('imputer', imputer),
    ('logistic', logistic)
])

pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
y_pred = pipeline.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [56]:
metrics.confusion_matrix(y_test, y_pred)

array([[136,   6],
       [ 50,   8]])

In [57]:
print('Accuracy Score: ')
print(metrics.accuracy_score(y_test, y_pred))
print('Recall Score: ')
print(metrics.recall_score(y_test, y_pred))
print('Precision Score: ')
print(metrics.precision_score(y_test, y_pred))

Accuracy Score: 
0.72
Recall Score: 
0.13793103448275862
Precision Score: 
0.5714285714285714
