### 트위터 유저와 mbti
- datasets: mbti_labels.csv, user_info.csv
- machine learning method: supervised learning

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Relative locations of the two raw input tables.
user_path = './data/user_info.csv'
mbti_path = './data/mbti_labels.csv'

In [3]:
# Read both CSV files into DataFrames using pandas' default parsing options.
user_df = pd.read_csv(user_path)
mbti_df = pd.read_csv(mbti_path)

In [None]:
# Column dtypes and non-null counts of the user table.
user_df.info()

In [None]:
# First rows of the user table.
user_df.head()

In [None]:
# On a DataFrame, value_counts() tallies identical full rows rather than one column.
user_df.value_counts()

In [None]:
# Number of missing values per column.
user_df.isna().sum()

In [None]:
# Same structural checks for the label table.
mbti_df.info()

In [None]:
mbti_df.head()

In [None]:
mbti_df.isna().sum()

In [None]:
# Number of rows per value of the 'mbti_personality' column.
mbti_df['mbti_personality'].value_counts()

In [None]:
user_df.columns

In [None]:
# Both tables expose an 'id' column; display each one for a visual comparison.
user_df['id']

In [None]:
mbti_df['id']

In [16]:
# Both prepared CSV files carry an 'Unnamed: 0' column, which is discarded right after loading.
feature_df = pd.read_csv('./data/feature.csv', encoding='utf-8').drop(columns=['Unnamed: 0'])

# From target.csv only the 'mbti_personality' column is kept, as a Series.
label_sr = (
    pd.read_csv('./data/target.csv', encoding='utf-8')
    .drop(columns=['Unnamed: 0'])
    ['mbti_personality']
)

In [None]:
# First rows of the prepared feature table.
feature_df.head()

In [None]:
feature_df.info()

In [None]:
# Inspect the distinct values before casting the column in the next cell.
feature_df['number_of_tweets_scraped'].unique()

In [None]:
# Cast the tweet-count column to int64, then re-check the dtypes.
feature_df = feature_df.astype({'number_of_tweets_scraped':'int64'})
feature_df.info()

In [None]:
# DataFrame.hist() creates its own figure, so a preceding plt.figure(...) call only
# produces an extra empty canvas (and a 500000 x 500000 inch one cannot be rendered).
# The size is therefore handed to hist() directly, using a size that fits on screen.
feature_df.hist(figsize=(12, 12))
plt.tight_layout()  # keep the per-column subplot titles from overlapping
plt.show()

In [None]:
# Peek at the first labels loaded from target.csv.
label_sr.head()

In [None]:
# Frame used only for inspecting correlations: a copy of the features plus the label,
# so feature_df itself never receives the helper columns.
data_df = feature_df.copy()
data_df['mbti'] = label_sr

# Label-encode the MBTI strings; codes follow the order of first appearance.
names = data_df['mbti'].unique().tolist()
mapping = dict(zip(names, range(len(names))))
data_df['mbti_code'] = data_df['mbti'].map(mapping)

data_df.head()

In [24]:
import seaborn as sns

In [None]:
# Pairwise correlations of the numeric columns only (the string 'mbti' column is skipped).
corr_mbti = data_df.corr(numeric_only=True)

# Draw on an explicitly created axes instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_mbti, annot=True, cmap='BuPu', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('mbti correlation')
plt.show()

In [None]:
# List the raw column names to choose the positional slice used in the next cell.
user_df.columns

In [None]:
# Take the columns from position 7 onward of the raw user table. The explicit .copy()
# makes data_df2 an independent frame, so the column assignments below neither raise
# pandas' SettingWithCopyWarning nor depend on whether iloc returned a view.
data_df2 = user_df.iloc[:, 7:].copy()
# NOTE(review): this assignment aligns on the index, i.e. it assumes user_df rows are in
# the same order as target.csv — confirm.
data_df2['mbti'] = label_sr

# Label-encode the MBTI strings; codes follow the order of first appearance.
names = data_df2['mbti'].unique().tolist()
mapping = {name:idx for idx, name in enumerate(names)}
data_df2['mbti_code'] = data_df2['mbti'].map(mapping)

data_df2.head()

In [None]:
# Pairwise correlations of the numeric columns of the raw-user-table variant.
corr_mbti2 = data_df2.corr(numeric_only=True)

# Larger canvas than the first heatmap; the colour bar is switched off.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_mbti2, annot=True, cmap='BuPu', fmt='.2f', linewidths=0.5, cbar=False, ax=ax)
ax.set_title('mbti correlation')
plt.show()

- 상관관계가 높은 칼럼이 없음....

- 분류모델을 뭘 쓸까 고민하다가... 한번쯤 해보고 싶었던 Support Vector Machine으로 Classification을 해보기로 결정
- feature는 상관관계가 높은 것이 없어서 다른 조원들과 동일하게 total_~~ 를 사용.
- label_sr은 data_df의 'mbti_code'를 사용
- 
- 
- goal: mbti classification
- learning method: supervised learning, classification
- learning algorithm: support vector machine
- feature: feature.csv
- label: label encoded target.csv
- scale: undetermined
- scaler algorithm: standard, quantile

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Integer-encoded 16-type label built earlier as data_df['mbti_code'].
label_sr2 = data_df['mbti_code']

# train : test = 8 : 2
# random_state: 38
# stratify: label_sr2

X_train, X_test, y_train, y_test = train_test_split(feature_df, label_sr2, test_size=0.2, stratify=label_sr2, random_state=38)

# Report the shape and dimensionality of every partition plus the realised split ratio.
print(f"X train: {X_train.shape}, {X_train.ndim}D")
print(f"y train: {y_train.shape}, {y_train.ndim}D\n")
print(f"X test: {X_test.shape}, {X_test.ndim}D")
print(f"y test: {y_test.shape}, {y_test.ndim}D\n")
print(f"train test ratio: {len(X_train)/len(feature_df)*100:.2f} %, {len(X_test)/len(feature_df)*100:.2f} %")

In [None]:
# model instance
# uses the default parameters; trained on the unscaled features
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [None]:
# model attributes
# svm_coef = svm_model.coef_    # only available when kernel='linear'
svm_dual_coef = svm_model.dual_coef_
svm_bais = svm_model.intercept_
svm_sv = svm_model.support_vectors_

print(f"dual coef:\n{svm_dual_coef}")
print(f"bais:\n{svm_bais}")
print(f"support vector:\n{svm_sv}")

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [34]:
# Accuracy on the training and held-out partitions (score() is mean accuracy for classifiers).
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)

svm_pred = svm_model.predict(X_test)

# accuracy_score recomputes the test accuracy from the predictions; the confusion matrix
# and the per-class report are built from the same predictions.
svm_acc = accuracy_score(y_test, svm_pred)
svm_mat = confusion_matrix(y_test, svm_pred)
# zero_division=0 reports 0 instead of warning for classes that are never predicted.
svm_report = classification_report(y_test, svm_pred, zero_division=0)

In [None]:
print(f"normal svm train score: {svm_train_score*100:.2f} %")
print(f"normal svm test score: {svm_test_score*100:.2f} %\n")

# The 16-class confusion matrix is computed above but left unprinted.
# print(f"normal svm confusion matrix:\n{svm_mat}\n")

print(f"normal svm accuracy: {svm_acc*100:.2f} %")
print(f"normal svm classification report:\n{svm_report}")


- mbti 16개를 전부 분류를 하는 것은 무리라고 판단
- 4개로 E/I, N/S, T/F, P/J로 각각 이진 분류를 하는 것을 목표로!!

In [None]:
# Load the per-axis labels; the following cell reads its 'ie', 'ns', 'ft' and 'jp' columns.
label_df = pd.read_csv('./data/target_2.csv', encoding='utf-8')
label_df.head()

In [None]:
# Frame used for inspecting correlations: a copy of the features plus the four MBTI axes.
data_df = feature_df.copy()
# target_2.csv spells three of the axes in the opposite letter order (ie / ft / jp).
for axis_col, source_col in (('ei', 'ie'), ('ns', 'ns'), ('tf', 'ft'), ('pj', 'jp')):
    data_df[axis_col] = label_df[source_col]

# Add an integer-coded twin of every axis; codes follow the order of first appearance.
for mbti in ('ei', 'ns', 'tf', 'pj'):
    names = data_df[mbti].unique().tolist()
    mapping = dict(zip(names, range(len(names))))
    data_df[mbti+'_code'] = data_df[mbti].map(mapping)

data_df['ei'].value_counts()

In [None]:
# Pairwise correlations of the numeric columns, now including the per-axis label columns.
corr_mbti = data_df.corr(numeric_only=True)

# Draw on an explicitly created axes; the colour bar is switched off.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_mbti, annot=True, cmap='BuPu', fmt='.2f', linewidths=0.5, cbar=False, ax=ax)
ax.set_title('mbti correlation')
plt.show()

- 여전히 상관관계는 금쪽이였다....

In [None]:
# Integer-coded variants, kept for reference but not used:
# label_ei = data_df['ei_code']
# label_ns = data_df['ns_code']
# label_tf = data_df['tf_code']
# label_pj = data_df['pj_code']

# One target Series per MBTI axis, taken from the raw label columns.
label_ei = data_df['ei']
label_ns = data_df['ns']
label_tf = data_df['tf']
label_pj = data_df['pj']


# train : test = 8 : 2
# random_state: 38
# stratify: label_...

# Each split stratifies on its own label, so the four splits may select different rows.
ei_X_train, ei_X_test, ei_y_train, ei_y_test = train_test_split(feature_df, label_ei, test_size=0.2, stratify=label_ei, random_state=38)
ns_X_train, ns_X_test, ns_y_train, ns_y_test = train_test_split(feature_df, label_ns, test_size=0.2, stratify=label_ns, random_state=38)
tf_X_train, tf_X_test, tf_y_train, tf_y_test = train_test_split(feature_df, label_tf, test_size=0.2, stratify=label_tf, random_state=38)
pj_X_train, pj_X_test, pj_y_train, pj_y_test = train_test_split(feature_df, label_pj, test_size=0.2, stratify=label_pj, random_state=38)

# Shapes are reported for the E/I split only.
print('ei 하나만')
print(f"EI X train: {ei_X_train.shape}, {ei_X_train.ndim}D")
print(f"EI y train: {ei_y_train.shape}, {ei_y_train.ndim}D\n")
print(f"EI X test: {ei_X_test.shape}, {ei_X_test.ndim}D")
print(f"EI y test: {ei_y_test.shape}, {ei_y_test.ndim}D\n")
print(f"EI train test ratio: {len(ei_X_train)/len(feature_df)*100:.2f} %, {len(ei_X_test)/len(feature_df)*100:.2f} %")

In [None]:
# model instance for the E/I axis
# uses the default parameters; trained on the unscaled features
ein_svm_model = SVC()
ein_svm_model.fit(ei_X_train, ei_y_train)

In [None]:
# model instance for the N/S axis
# uses the default parameters
nsn_svm_model = SVC()
nsn_svm_model.fit(ns_X_train, ns_y_train)

In [None]:
# model instance for the T/F axis
# uses the default parameters
tfn_svm_model = SVC()
tfn_svm_model.fit(tf_X_train, tf_y_train)

In [None]:
# model instance for the P/J axis
# uses the default parameters
pjn_svm_model = SVC()
pjn_svm_model.fit(pj_X_train, pj_y_train)

In [None]:
# model attributes
# ein_svm_coef = ein_svm_model.coef_    # only available when kernel='linear'
ein_svm_dual_coef = ein_svm_model.dual_coef_
ein_svm_bais = ein_svm_model.intercept_
ein_svm_sv = ein_svm_model.support_vectors_

nsn_svm_dual_coef = nsn_svm_model.dual_coef_
nsn_svm_bais = nsn_svm_model.intercept_
nsn_svm_sv = nsn_svm_model.support_vectors_

tfn_svm_dual_coef = tfn_svm_model.dual_coef_
tfn_svm_bais = tfn_svm_model.intercept_
tfn_svm_sv = tfn_svm_model.support_vectors_

pjn_svm_dual_coef = pjn_svm_model.dual_coef_
pjn_svm_bais = pjn_svm_model.intercept_
pjn_svm_sv = pjn_svm_model.support_vectors_

# Only the E/I model's attributes are printed; the others are just collected.
print(f"ei dual coef:\n{ein_svm_dual_coef}")
print(f"ei bais:\n{ein_svm_bais}")
print(f"ei support vector:\n{ein_svm_sv}")

In [84]:
def save_model(model,save_file):
    """Pickle ``model`` to ``save_file``, creating the parent directory if needed.

    Parameters
    ----------
    model : object
        Any picklable object; this notebook passes fitted scalers and SVC models.
    save_file : str or path-like
        Destination path. An existing file at that path is overwritten.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened for writing.
    pickle.PicklingError
        If ``model`` cannot be pickled.
    """
    # Create missing folders (e.g. ./model, ./scaler) up front so that open() does not
    # fail with FileNotFoundError when the notebook runs in a fresh checkout.
    parent_dir = os.path.dirname(save_file)
    if parent_dir:  # a bare file name has no directory part, and makedirs('') would raise
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_file, 'wb') as f:
        pickle.dump(model, f)

In [46]:
# E/I baseline: train/test accuracy, then predictions, confusion matrix and report on the test rows.
ein_svm_train_score = ein_svm_model.score(ei_X_train, ei_y_train)
ein_svm_test_score = ein_svm_model.score(ei_X_test, ei_y_test)

ein_svm_pred = ein_svm_model.predict(ei_X_test)

ein_svm_acc = accuracy_score(ei_y_test, ein_svm_pred)
ein_svm_mat = confusion_matrix(ei_y_test, ein_svm_pred)
ein_svm_report = classification_report(ei_y_test, ein_svm_pred, zero_division=0)

In [47]:
# N/S baseline: same metrics.
nsn_svm_train_score = nsn_svm_model.score(ns_X_train, ns_y_train)
nsn_svm_test_score = nsn_svm_model.score(ns_X_test, ns_y_test)

nsn_svm_pred = nsn_svm_model.predict(ns_X_test)

nsn_svm_acc = accuracy_score(ns_y_test, nsn_svm_pred)
nsn_svm_mat = confusion_matrix(ns_y_test, nsn_svm_pred)
nsn_svm_report = classification_report(ns_y_test, nsn_svm_pred, zero_division=0)

In [48]:
# T/F baseline: same metrics.
tfn_svm_train_score = tfn_svm_model.score(tf_X_train, tf_y_train)
tfn_svm_test_score = tfn_svm_model.score(tf_X_test, tf_y_test)

tfn_svm_pred = tfn_svm_model.predict(tf_X_test)

tfn_svm_acc = accuracy_score(tf_y_test, tfn_svm_pred)
tfn_svm_mat = confusion_matrix(tf_y_test, tfn_svm_pred)
tfn_svm_report = classification_report(tf_y_test, tfn_svm_pred, zero_division=0)

In [49]:
# P/J baseline: same metrics.
pjn_svm_train_score = pjn_svm_model.score(pj_X_train, pj_y_train)
pjn_svm_test_score = pjn_svm_model.score(pj_X_test, pj_y_test)

pjn_svm_pred = pjn_svm_model.predict(pj_X_test)

pjn_svm_acc = accuracy_score(pj_y_test, pjn_svm_pred)
pjn_svm_mat = confusion_matrix(pj_y_test, pjn_svm_pred)
pjn_svm_report = classification_report(pj_y_test, pjn_svm_pred, zero_division=0)

In [None]:
print(f"normal E&I svm train score: {ein_svm_train_score*100:.2f} %")
print(f"normal E&I svm test score: {ein_svm_test_score*100:.2f} %\n")

# NOTE(review): the class legend is hard-coded; the models are trained on the raw
# data_df['ei'] values, so confirm those really are 0/1 in this order.
print("E:0 I:1")
print(f"normal E&I svm confusion matrix:\n{ein_svm_mat}\n")

print(f"normal E&I svm accuracy: {ein_svm_acc*100:.2f} %")
print(f"normal E&I svm classification report:\n{ein_svm_report}")


In [None]:
print(f"normal N&S svm train score: {nsn_svm_train_score*100:.2f} %")
print(f"normal N&S svm test score: {nsn_svm_test_score*100:.2f} %\n")

# Hard-coded class legend — see the review note on the E/I cell.
print("N:0 S:1")
print(f"normal N&S svm confusion matrix:\n{nsn_svm_mat}\n")

print(f"normal N&S svm accuracy: {nsn_svm_acc*100:.2f} %")
print(f"normal N&S svm classification report:\n{nsn_svm_report}")


In [None]:
print(f"normal T&F svm train score: {tfn_svm_train_score*100:.2f} %")
print(f"normal T&F svm test score: {tfn_svm_test_score*100:.2f} %\n")

# Hard-coded class legend (the source column is named 'ft').
print("F:0 T:1")
print(f"normal T&F svm confusion matrix:\n{tfn_svm_mat}\n")

print(f"normal T&F svm accuracy: {tfn_svm_acc*100:.2f} %")
print(f"normal T&F svm classification report:\n{tfn_svm_report}")


In [None]:
print(f"normal P&J svm train score: {pjn_svm_train_score*100:.2f} %")
print(f"normal P&J svm test score: {pjn_svm_test_score*100:.2f} %\n")

# Hard-coded class legend (the source column is named 'jp').
print("J:0 p:1")
print(f"normal P&J svm confusion matrix:\n{pjn_svm_mat}\n")

print(f"normal P&J svm accuracy: {pjn_svm_acc*100:.2f} %")
print(f"normal P&J svm classification report:\n{pjn_svm_report}")


- 순서대로 55, 75, 55, 55 정도로 나옴.
- params을 어떻게 건드릴까?

- scaling을 해달라고 해서 standard만 일단 ㄱㄱ

In [54]:
from sklearn.preprocessing import StandardScaler

In [55]:
ei_sd_scaler = StandardScaler()
ei_sd_scaler.fit(ei_X_train, ei_y_train)

ei_sd_X_train = ei_sd_scaler.transform(ei_X_train)
ei_sd_X_test = ei_sd_scaler.transform(ei_X_test)

In [56]:
ns_sd_scaler = StandardScaler()
ns_sd_scaler.fit(ns_X_train, ns_y_train)

ns_sd_X_train = ns_sd_scaler.transform(ns_X_train)
ns_sd_X_test = ns_sd_scaler.transform(ns_X_test)

In [57]:
tf_sd_scaler = StandardScaler()
tf_sd_scaler.fit(tf_X_train, tf_y_train)

tf_sd_X_train = tf_sd_scaler.transform(tf_X_train)
tf_sd_X_test = tf_sd_scaler.transform(tf_X_test)

In [58]:
pj_sd_scaler = StandardScaler()
pj_sd_scaler.fit(pj_X_train, pj_y_train)

pj_sd_X_train = pj_sd_scaler.transform(pj_X_train)
pj_sd_X_test = pj_sd_scaler.transform(pj_X_test)

In [59]:
from sklearn.model_selection import GridSearchCV

In [60]:
# Candidates considered but left out of this first grid:
# 'kernel':['poly', 'rbf', 'sigmoid']
# 'max_iter':[-1, 0,  1]

# NOTE(review): SVC() below keeps its default kernel. Per the scikit-learn docs 'degree'
# is only used by the 'poly' kernel and 'decision_function_shape' is ignored for binary
# classification, so presumably only 'C' changes the fitted models in this grid — confirm.
params = {'C':[0.01, 0.5, 0.1, 1.0, 2.5],'degree':[3, 4, 5, 6, 7, 8],
          'decision_function_shape':['ovo', 'ovr']}

In [61]:
# cv = 5
# One independent search object per MBTI axis; refit=True retrains the best setting on
# the full training data once the search has finished.
ei_grid_model = GridSearchCV(SVC(), param_grid=params, cv=5, refit=True, return_train_score=True, verbose=2)
ns_grid_model = GridSearchCV(SVC(), param_grid=params, cv=5, refit=True, return_train_score=True, verbose=2)
tf_grid_model = GridSearchCV(SVC(), param_grid=params, cv=5, refit=True, return_train_score=True, verbose=2)
pj_grid_model = GridSearchCV(SVC(), param_grid=params, cv=5, refit=True, return_train_score=True, verbose=2)

In [None]:
# Run each search on the standardized training features of its own split.
ei_grid_model.fit(ei_sd_X_train, ei_y_train)

In [None]:
ns_grid_model.fit(ns_sd_X_train, ns_y_train)

In [None]:
tf_grid_model.fit(tf_sd_X_train, tf_y_train)

In [None]:
pj_grid_model.fit(pj_sd_X_train, pj_y_train)

In [None]:
# GridSearchCV attributes
# ei_grid_model

best_ei_svm_model = ei_grid_model.best_estimator_
best_ei_param = ei_grid_model.best_params_
ei_svm_result = ei_grid_model.cv_results_

print(f"Best E&I SVM model: {best_ei_svm_model}")
print(f"Best E&I SVM Params: {best_ei_param}")

# Show the ten best-ranked parameter settings with their mean train / validation scores.
ei_svm_result_df = pd.DataFrame(ei_svm_result)
ei_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# ns_grid_model

best_ns_svm_model = ns_grid_model.best_estimator_
best_ns_param = ns_grid_model.best_params_
ns_svm_result = ns_grid_model.cv_results_

print(f"Best N&S SVM model: {best_ns_svm_model}")
print(f"Best N&S SVM Params: {best_ns_param}")

ns_svm_result_df = pd.DataFrame(ns_svm_result)
ns_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# tf_grid_model

best_tf_svm_model = tf_grid_model.best_estimator_
best_tf_param = tf_grid_model.best_params_
tf_svm_result = tf_grid_model.cv_results_

print(f"Best T&F SVM model: {best_tf_svm_model}")
print(f"Best T&F SVM Params: {best_tf_param}")

tf_svm_result_df = pd.DataFrame(tf_svm_result)
tf_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# pj_grid_model

best_pj_svm_model = pj_grid_model.best_estimator_
best_pj_param = pj_grid_model.best_params_
pj_svm_result = pj_grid_model.cv_results_

print(f"Best P&J SVM model: {best_pj_svm_model}")
print(f"Best P&J SVM Params: {best_pj_param}")

pj_svm_result_df = pd.DataFrame(pj_svm_result)
pj_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [70]:
# E/I: evaluate the refit best estimator of the first grid on the standardized partitions.
best_ei_svm_train_score = best_ei_svm_model.score(ei_sd_X_train, ei_y_train)
best_ei_svm_test_score = best_ei_svm_model.score(ei_sd_X_test, ei_y_test)

best_ei_svm_pred = best_ei_svm_model.predict(ei_sd_X_test)

best_ei_svm_acc = accuracy_score(ei_y_test, best_ei_svm_pred)
best_ei_svm_mat = confusion_matrix(ei_y_test, best_ei_svm_pred)
best_ei_svm_report = classification_report(ei_y_test, best_ei_svm_pred, zero_division=0)

In [71]:
# N/S: same metrics.
best_ns_svm_train_score = best_ns_svm_model.score(ns_sd_X_train, ns_y_train)
best_ns_svm_test_score = best_ns_svm_model.score(ns_sd_X_test, ns_y_test)

best_ns_svm_pred = best_ns_svm_model.predict(ns_sd_X_test)

best_ns_svm_acc = accuracy_score(ns_y_test, best_ns_svm_pred)
best_ns_svm_mat = confusion_matrix(ns_y_test, best_ns_svm_pred)
best_ns_svm_report = classification_report(ns_y_test, best_ns_svm_pred, zero_division=0)

In [72]:
# T/F: same metrics.
best_tf_svm_train_score = best_tf_svm_model.score(tf_sd_X_train, tf_y_train)
best_tf_svm_test_score = best_tf_svm_model.score(tf_sd_X_test, tf_y_test)

best_tf_svm_pred = best_tf_svm_model.predict(tf_sd_X_test)

best_tf_svm_acc = accuracy_score(tf_y_test, best_tf_svm_pred)
best_tf_svm_mat = confusion_matrix(tf_y_test, best_tf_svm_pred)
best_tf_svm_report = classification_report(tf_y_test, best_tf_svm_pred, zero_division=0)

In [73]:
# P/J: same metrics.
best_pj_svm_train_score = best_pj_svm_model.score(pj_sd_X_train, pj_y_train)
best_pj_svm_test_score = best_pj_svm_model.score(pj_sd_X_test, pj_y_test)

best_pj_svm_pred = best_pj_svm_model.predict(pj_sd_X_test)

best_pj_svm_acc = accuracy_score(pj_y_test, best_pj_svm_pred)
best_pj_svm_mat = confusion_matrix(pj_y_test, best_pj_svm_pred)
best_pj_svm_report = classification_report(pj_y_test, best_pj_svm_pred, zero_division=0)

In [None]:
# Print the first-grid results for the E/I axis.
print(f"Best E&I svm train score: {best_ei_svm_train_score*100:.2f} %")
print(f"Best E&I svm test score: {best_ei_svm_test_score*100:.2f} %\n")

print(f"Best E&I svm confusion matrix:\n{best_ei_svm_mat}\n")

print(f"Best E&I svm accuracy: {best_ei_svm_acc*100:.2f} %")
print(f"Best E&I svm classification report:\n{best_ei_svm_report}")

In [None]:
# Print the first-grid results for the N/S axis.
print(f"Best N&S svm train score: {best_ns_svm_train_score*100:.2f} %")
print(f"Best N&S svm test score: {best_ns_svm_test_score*100:.2f} %\n")

print(f"Best N&S svm confusion matrix:\n{best_ns_svm_mat}\n")

print(f"Best N&S svm accuracy: {best_ns_svm_acc*100:.2f} %")
print(f"Best N&S svm classification report:\n{best_ns_svm_report}")

In [None]:
# Print the first-grid results for the T/F axis.
print(f"Best T&F svm train score: {best_tf_svm_train_score*100:.2f} %")
print(f"Best T&F svm test score: {best_tf_svm_test_score*100:.2f} %\n")

print(f"Best T&F svm confusion matrix:\n{best_tf_svm_mat}\n")

print(f"Best T&F svm accuracy: {best_tf_svm_acc*100:.2f} %")
print(f"Best T&F svm classification report:\n{best_tf_svm_report}")

In [None]:
# Print the first-grid results for the P/J axis.
print(f"Best P&J svm train score: {best_pj_svm_train_score*100:.2f} %")
print(f"Best P&J svm test score: {best_pj_svm_test_score*100:.2f} %\n")

print(f"Best P&J svm confusion matrix:\n{best_pj_svm_mat}\n")

print(f"Best P&J svm accuracy: {best_pj_svm_acc*100:.2f} %")
print(f"Best P&J svm classification report:\n{best_pj_svm_report}")

- 오래 걸려서 안했던 kernel을 변경 시켜야 할듯... 튜닝의 의미가 없어졌음...
- 반전으로 kernel은 오래 걸리는 것이 아니었음 !!!

In [78]:
# Notes carried over from the first grid:
# 'kernel':['poly', 'rbf', 'sigmoid']
# 'max_iter':[-1, 0,  1]
# 'degree':[3, 4, 5, 6, 7, 8] --> 3
# 'decision_function_shape':['ovo', 'ovr'] --> ovo

# Second grid: the kernel is now searched as well; 'degree' is no longer part of the grid.
params = {'C':[0.01, 0.05, 0.1, 0.5, 1.0], 'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
          'decision_function_shape':['ovo', 'ovr']}

In [79]:
# cv: 10
# random_state: 38

# One independent 10-fold search per MBTI axis over the kernel grid.
ei_kn_model = GridSearchCV(SVC(random_state=38), param_grid=params, cv=10, refit=True, return_train_score=True, verbose=2)
ns_kn_model = GridSearchCV(SVC(random_state=38), param_grid=params, cv=10, refit=True, return_train_score=True, verbose=2)
tf_kn_model = GridSearchCV(SVC(random_state=38), param_grid=params, cv=10, refit=True, return_train_score=True, verbose=2)
pj_kn_model = GridSearchCV(SVC(random_state=38), param_grid=params, cv=10, refit=True, return_train_score=True, verbose=2)

In [None]:
# Run each kernel search on the standardized training features of its own split.
ei_kn_model.fit(ei_sd_X_train, ei_y_train)

In [None]:
ns_kn_model.fit(ns_sd_X_train, ns_y_train)

In [None]:
tf_kn_model.fit(tf_sd_X_train, tf_y_train)

In [None]:
pj_kn_model.fit(pj_sd_X_train, pj_y_train)

In [None]:
# GridSearchCV attributes
# ei_kn_model

best_kn_ei_svm_model = ei_kn_model.best_estimator_
best_kn_ei_param = ei_kn_model.best_params_
ei_kn_svm_result = ei_kn_model.cv_results_

print(f"Best E&I SVM model: {best_kn_ei_svm_model}")
print(f"Best E&I SVM Params: {best_kn_ei_param}")

# Show the ten best-ranked parameter settings with their mean train / validation scores.
ei_kn_svm_result_df = pd.DataFrame(ei_kn_svm_result)
ei_kn_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# ns_kn_model

best_kn_ns_svm_model = ns_kn_model.best_estimator_
best_kn_ns_param = ns_kn_model.best_params_
ns_kn_svm_result = ns_kn_model.cv_results_

print(f"Best N&S SVM model: {best_kn_ns_svm_model}")
print(f"Best N&S SVM Params: {best_kn_ns_param}")

ns_kn_svm_result_df = pd.DataFrame(ns_kn_svm_result)
ns_kn_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# tf_kn_model

best_kn_tf_svm_model = tf_kn_model.best_estimator_
best_kn_tf_param = tf_kn_model.best_params_
tf_kn_svm_result = tf_kn_model.cv_results_

print(f"Best T&F SVM model: {best_kn_tf_svm_model}")
print(f"Best T&F SVM Params: {best_kn_tf_param}")

tf_kn_svm_result_df = pd.DataFrame(tf_kn_svm_result)
tf_kn_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [None]:
# GridSearchCV attributes
# pj_kn_model

best_kn_pj_svm_model = pj_kn_model.best_estimator_
best_kn_pj_param = pj_kn_model.best_params_
pj_kn_svm_result = pj_kn_model.cv_results_

print(f"Best P&J SVM model: {best_kn_pj_svm_model}")
print(f"Best P&J SVM Params: {best_kn_pj_param}")

pj_kn_svm_result_df = pd.DataFrame(pj_kn_svm_result)
pj_kn_svm_result_df[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head(10)

In [90]:
# Accuracy of the kernel-grid winner for E/I on the standardized train / test partitions.
best_kn_ei_svm_train_score = best_kn_ei_svm_model.score(ei_sd_X_train, ei_y_train)
best_kn_ei_svm_test_score = best_kn_ei_svm_model.score(ei_sd_X_test, ei_y_test)

# Predict with the kernel-grid model itself. Using best_ei_svm_model (the winner of the
# first grid) here would make the accuracy, confusion matrix and report below describe
# a different model than the scores above and than the model that gets saved.
best_kn_ei_svm_pred = best_kn_ei_svm_model.predict(ei_sd_X_test)

best_kn_ei_svm_acc = accuracy_score(ei_y_test, best_kn_ei_svm_pred)
best_kn_ei_svm_mat = confusion_matrix(ei_y_test, best_kn_ei_svm_pred)
# zero_division=0 reports 0 instead of warning for classes that are never predicted.
best_kn_ei_svm_report = classification_report(ei_y_test, best_kn_ei_svm_pred, zero_division=0)

In [91]:
# Accuracy of the kernel-grid winner for N/S on the standardized train / test partitions.
best_kn_ns_svm_train_score = best_kn_ns_svm_model.score(ns_sd_X_train, ns_y_train)
best_kn_ns_svm_test_score = best_kn_ns_svm_model.score(ns_sd_X_test, ns_y_test)

best_kn_ns_svm_pred = best_kn_ns_svm_model.predict(ns_sd_X_test)

best_kn_ns_svm_acc = accuracy_score(ns_y_test, best_kn_ns_svm_pred)
best_kn_ns_svm_mat = confusion_matrix(ns_y_test, best_kn_ns_svm_pred)
# The report must be built from this cell's predictions (best_kn_ns_svm_pred); passing
# best_ns_svm_pred would report the first grid's model under the kernel-grid name.
best_kn_ns_svm_report = classification_report(ns_y_test, best_kn_ns_svm_pred, zero_division=0)

In [92]:
# Accuracy of the kernel-grid winner for T/F on the standardized train / test partitions.
best_kn_tf_svm_train_score = best_kn_tf_svm_model.score(tf_sd_X_train, tf_y_train)
best_kn_tf_svm_test_score = best_kn_tf_svm_model.score(tf_sd_X_test, tf_y_test)

# Predict with the kernel-grid model itself. Using best_tf_svm_model (the winner of the
# first grid) here would make the metrics below describe a different model than the
# scores above and than the model that gets saved.
best_kn_tf_svm_pred = best_kn_tf_svm_model.predict(tf_sd_X_test)

best_kn_tf_svm_acc = accuracy_score(tf_y_test, best_kn_tf_svm_pred)
best_kn_tf_svm_mat = confusion_matrix(tf_y_test, best_kn_tf_svm_pred)
# zero_division=0 reports 0 instead of warning for classes that are never predicted.
best_kn_tf_svm_report = classification_report(tf_y_test, best_kn_tf_svm_pred, zero_division=0)

In [93]:
# Accuracy of the kernel-grid winner for P/J on the standardized train / test partitions.
best_kn_pj_svm_train_score = best_kn_pj_svm_model.score(pj_sd_X_train, pj_y_train)
best_kn_pj_svm_test_score = best_kn_pj_svm_model.score(pj_sd_X_test, pj_y_test)

# Predict with the kernel-grid model itself. Using best_pj_svm_model (the winner of the
# first grid) here would make the metrics below describe a different model than the
# scores above and than the model that gets saved.
best_kn_pj_svm_pred = best_kn_pj_svm_model.predict(pj_sd_X_test)

best_kn_pj_svm_acc = accuracy_score(pj_y_test, best_kn_pj_svm_pred)
best_kn_pj_svm_mat = confusion_matrix(pj_y_test, best_kn_pj_svm_pred)
# zero_division=0 reports 0 instead of warning for classes that are never predicted.
best_kn_pj_svm_report = classification_report(pj_y_test, best_kn_pj_svm_pred, zero_division=0)

In [None]:
# Print the kernel-grid results for the E/I axis.
print(f"Best kernel E&I svm train score: {best_kn_ei_svm_train_score*100:.2f} %")
print(f"Best kernel E&I svm test score: {best_kn_ei_svm_test_score*100:.2f} %\n")

print(f"Best kernel E&I svm confusion matrix:\n{best_kn_ei_svm_mat}\n")

print(f"Best kernel E&I svm accuracy: {best_kn_ei_svm_acc*100:.2f} %")
print(f"Best kernel E&I svm classification report:\n{best_kn_ei_svm_report}")

In [None]:
# Print the kernel-grid results for the N/S axis.
print(f"Best kernel N&S svm train score: {best_kn_ns_svm_train_score*100:.2f} %")
print(f"Best kernel N&S svm test score: {best_kn_ns_svm_test_score*100:.2f} %\n")

print(f"Best kernel N&S svm confusion matrix:\n{best_kn_ns_svm_mat}\n")

print(f"Best kernel N&S svm accuracy: {best_kn_ns_svm_acc*100:.2f} %")
print(f"Best kernel N&S svm classification report:\n{best_kn_ns_svm_report}")

In [None]:
# Print the kernel-grid results for the T/F axis.
print(f"Best kernel T&F svm train score: {best_kn_tf_svm_train_score*100:.2f} %")
print(f"Best kernel T&F svm test score: {best_kn_tf_svm_test_score*100:.2f} %\n")

print(f"Best kernel T&F svm confusion matrix:\n{best_kn_tf_svm_mat}\n")

print(f"Best kernel T&F svm accuracy: {best_kn_tf_svm_acc*100:.2f} %")
print(f"Best kernel T&F svm classification report:\n{best_kn_tf_svm_report}")

In [None]:
# Print the kernel-grid results for the P/J axis.
print(f"Best kernel P&J svm train score: {best_kn_pj_svm_train_score*100:.2f} %")
print(f"Best kernel P&J svm test score: {best_kn_pj_svm_test_score*100:.2f} %\n")

print(f"Best kernel P&J svm confusion matrix:\n{best_kn_pj_svm_mat}\n")

print(f"Best kernel P&J svm accuracy: {best_kn_pj_svm_acc*100:.2f} %")
print(f"Best kernel P&J svm classification report:\n{best_kn_pj_svm_report}")

In [110]:
# Pickle the four fitted scalers (the target paths carry no file extension).
save_model(ei_sd_scaler,'./scaler/ei_sd_scaler')
save_model(ns_sd_scaler,'./scaler/ns_sd_scaler')
save_model(tf_sd_scaler,'./scaler/tf_sd_scaler')
save_model(pj_sd_scaler,'./scaler/pj_sd_scaler')

In [111]:
# Pickle the kernel-grid best estimators, one per MBTI axis.
save_model(best_kn_ei_svm_model,'./model/ei_model')
save_model(best_kn_ns_svm_model,'./model/ns_model')
save_model(best_kn_tf_svm_model,'./model/tf_model')
save_model(best_kn_pj_svm_model,'./model/pj_model')

- threshold를 바꿔보자

In [98]:
from sklearn.preprocessing import Binarizer

In [99]:
def get_eval_by_threshold(model, y_test, pred_prob, thresholds):
    """Print accuracy, confusion matrix and classification report for each cut-off.

    Parameters
    ----------
    model
        Only interpolated into the printed headings (its repr is shown).
    y_test
        True labels that the binarized predictions are compared against.
    pred_prob
        2-D array of scores to binarize; callers in this notebook pass one
        probability column reshaped to (-1, 1).
    thresholds
        Iterable of cut-off values, evaluated in order.

    Returns
    -------
    None
        All results are written to stdout.
    """
    for cutoff in thresholds:
        # Binarizer is stateless: values strictly greater than the cut-off become 1, all others 0.
        custom_pred = Binarizer(threshold=cutoff).fit_transform(pred_prob)

        acc = accuracy_score(y_test, custom_pred)
        mat = confusion_matrix(y_test, custom_pred)
        report = classification_report(y_test, custom_pred, zero_division=0)

        print(f"threshold: {cutoff}")
        print(f"{model} confusion matrix:\n{mat}\n")
        print(f"{model} accuracy: {acc*100:.2f} %")
        print(f"{model} classification report:\n{report}\n\n")

In [100]:
# This cell and the following ones repeat the label selection, split and scaling done
# earlier in the notebook, with the same arguments and random_state.
# Integer-coded variants, kept for reference but not used:
# label_ei = data_df['ei_code']
# label_ns = data_df['ns_code']
# label_tf = data_df['tf_code']
# label_pj = data_df['pj_code']

label_ei = data_df['ei']
label_ns = data_df['ns']
label_tf = data_df['tf']
label_pj = data_df['pj']

In [101]:
# 80/20 stratified split per axis, random_state=38.
ei_X_train, ei_X_test, ei_y_train, ei_y_test = train_test_split(feature_df, label_ei, test_size=0.2, stratify=label_ei, random_state=38)
ns_X_train, ns_X_test, ns_y_train, ns_y_test = train_test_split(feature_df, label_ns, test_size=0.2, stratify=label_ns, random_state=38)
tf_X_train, tf_X_test, tf_y_train, tf_y_test = train_test_split(feature_df, label_tf, test_size=0.2, stratify=label_tf, random_state=38)
pj_X_train, pj_X_test, pj_y_train, pj_y_test = train_test_split(feature_df, label_pj, test_size=0.2, stratify=label_pj, random_state=38)

In [102]:
# Fit each scaler on the training rows only, then transform both partitions.
ei_sd_scaler = StandardScaler()
ei_sd_scaler.fit(ei_X_train, ei_y_train)

ei_sd_X_train = ei_sd_scaler.transform(ei_X_train)
ei_sd_X_test = ei_sd_scaler.transform(ei_X_test)

In [103]:
ns_sd_scaler = StandardScaler()
ns_sd_scaler.fit(ns_X_train, ns_y_train)

ns_sd_X_train = ns_sd_scaler.transform(ns_X_train)
ns_sd_X_test = ns_sd_scaler.transform(ns_X_test)

In [104]:
tf_sd_scaler = StandardScaler()
tf_sd_scaler.fit(tf_X_train, tf_y_train)

tf_sd_X_train = tf_sd_scaler.transform(tf_X_train)
tf_sd_X_test = tf_sd_scaler.transform(tf_X_test)

In [105]:
pj_sd_scaler = StandardScaler()
pj_sd_scaler.fit(pj_X_train, pj_y_train)

pj_sd_X_train = pj_sd_scaler.transform(pj_X_train)
pj_sd_X_test = pj_sd_scaler.transform(pj_X_test)

In [None]:
# E/I model with hand-set hyper-parameters; probability=True is what makes
# predict_proba available for the threshold sweep further down.
ei_model = SVC(C=1.0, decision_function_shape='ovo', probability=True, kernel='rbf')
ei_model.fit(ei_sd_X_train, ei_y_train)

In [None]:
# Score against the E/I labels of the same split the features come from. The bare
# y_train / y_test names belong to the earlier 16-type split (different labels and a
# different stratification), so scoring against them yields a meaningless number.
ei_train_score = ei_model.score(ei_sd_X_train, ei_y_train)
ei_test_score = ei_model.score(ei_sd_X_test, ei_y_test)

print(f"ei model tarin score: {ei_train_score*100:.2f} %")
print(f"ei model test score: {ei_test_score*100:.2f} %")

In [None]:
# Class-probability estimates for the test rows; column 1 belongs to the second entry
# of ei_model.classes_ and is reshaped to one column for the helper.
ei_proba = ei_model.predict_proba(ei_sd_X_test)
# Cut-offs 0.1, 0.2, ..., 1.0.
thresholds = [x/10 for x in range(1, 11)]
get_eval_by_threshold(ei_model, ei_y_test, ei_proba[:,1].reshape(-1,1), thresholds)

In [None]:
# NOTE(review): these cut-offs are the integers 1..10 while predict_proba values lie in
# [0, 1]; no value is strictly greater than such a cut-off, so every row is binarized
# to 0 — confirm this sweep is intended.
thresholds = [x for x in range(1, 11)]
get_eval_by_threshold(ei_model, ei_y_test, ei_proba[:,1].reshape(-1,1), thresholds)

In [None]:
# N/S model with hand-set hyper-parameters; probability=True is what makes
# predict_proba available for the threshold sweep in the next cell.
ns_model = SVC(C=0.05, decision_function_shape='ovo', probability=True, kernel='poly')
ns_model.fit(ns_sd_X_train, ns_y_train)

# Score against the N/S labels of the same split the features come from. The bare
# y_train / y_test names belong to the earlier 16-type split (different labels and a
# different stratification), so scoring against them yields a meaningless number.
ns_train_score = ns_model.score(ns_sd_X_train, ns_y_train)
ns_test_score = ns_model.score(ns_sd_X_test, ns_y_test)

print(f"ns model tarin score: {ns_train_score*100:.2f} %")
print(f"ns model test score: {ns_test_score*100:.2f} %")

In [None]:
# Class-probability estimates for the N/S test rows; column 1 belongs to the second
# entry of ns_model.classes_.
ns_proba = ns_model.predict_proba(ns_sd_X_test)
# Cut-offs 0.1, 0.2, ..., 1.0.
thresholds = [x/10 for x in range(1, 11)]
# NOTE(review): integer cut-offs 1..10 are never exceeded by a probability in [0, 1],
# so the second sweep binarizes every row to 0 — confirm it is intended.
thresholds2 = [x for x in range(1, 11)]

get_eval_by_threshold(ns_model, ns_y_test, ns_proba[:,1].reshape(-1,1), thresholds)
get_eval_by_threshold(ns_model, ns_y_test, ns_proba[:,1].reshape(-1,1), thresholds2)