In [478]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import torch.optim as optim
from embedding import Embedding
from collections import Counter
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from embedding import Embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_validate, ShuffleSplit
from sklearn.metrics import accuracy_score



In [479]:
data = pd.read_csv('./real_comment1.csv')

In [480]:
def seed_everything(seed):
    np.random.seed(seed)

seed_everything(42)  # Seed 고정


In [481]:
data = data.dropna()
labels = []

In [482]:
for i in data['mbti']:
    if "E" in str(i):
        labels.append(0)
    else:
        labels.append(1)
        
labels = pd.DataFrame(labels)
data = pd.concat([data, labels], axis=1)
data.rename(columns={ 0: 'labels'}, inplace=True)

In [483]:
data = data.dropna()
data = data.sample(frac=1).reset_index(drop=True)
x_train = data['comments']
y_train = data['labels']

In [484]:
from imblearn.under_sampling import RandomUnderSampler
x_train_reshaped = np.array(x_train).reshape(-1, 1)
x_train_sample, y_train_sample = RandomUnderSampler(random_state=2021).fit_resample(x_train_reshaped, y_train)

combined = list(zip(x_train_sample, y_train_sample))
np.random.shuffle(combined)
x_train_sample, y_train_sample = zip(*combined)
x_train_sample = np.array(x_train_sample)
y_train_sample = pd.DataFrame({'y': y_train_sample})


In [485]:
x_train_sample = x_train_sample.tolist()
x_train_sample = [' '.join(sentences) for sentences in x_train_sample]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(x_train_sample)

In [486]:
# 데이터의 전체 길이
total_length = len(x_train_sample)
# 80% 비율 계산
train_length = int(total_length * 0.8)

# 데이터 분할
x_train = tfidf_matrix[:train_length]
y_train = y_train_sample[:train_length]
x_temp = tfidf_matrix[train_length:]
y_temp = y_train_sample[train_length:]

# val 데이터의 전체 길이
val_length = total_length - train_length
# 50% 비율 계산
val_length = int(val_length * 0.5)



In [487]:
x_val = x_temp[val_length:]
x_test = x_temp[:val_length]
y_val = y_temp[val_length:]
y_test = y_temp[val_length:]

In [488]:
y_train

Unnamed: 0,y
0,1.0
1,1.0
2,1.0
3,1.0
4,0.0
...,...
31711,1.0
31712,1.0
31713,1.0
31714,1.0


In [489]:
y_val.value_counts()

y  
1.0    2001
0.0    1964
Name: count, dtype: int64

In [490]:
y_test.value_counts()

y  
1.0    2001
0.0    1964
Name: count, dtype: int64

In [491]:
from catboost import CatBoostClassifier

best_score = 0
best_model = None

for i in range(10):
    # CatBoost 모델 생성
    catboost_model = CatBoostClassifier(iterations=1000, early_stopping_rounds=10)
    
    # 모델 학습
    catboost_model.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=False)
    
    # Validation 데이터에 대한 정확도 계산
    catboost_preds = catboost_model.predict(x_val)
    catboost_score = accuracy_score(y_val, catboost_preds)
    print(catboost_preds)
    print(catboost_score)
    # 현재 모델의 성능이 이전 최고 성능보다 좋은지 확인
    if catboost_score > best_score:
        best_score = catboost_score
        best_model = catboost_model

# 최고 성능의 모델 출력
print(f'Best CatBoost Validation Score: {best_score}')
print(f'Best CatBoost Model: {best_model}')


# 테스트 데이터 예측
test_preds = best_model.predict(x_test)
accuracy = np.sum(test_preds == y_test) / len(y_test)


[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435
[0. 0. 0. ... 0. 0. 0.]
0.5008827238335435


In [None]:
print(accuracy)

In [None]:
print(x_train)

  (0, 9968)	0.38227529263957843
  (0, 13610)	0.3556062014908482
  (0, 11828)	0.32605802536779904
  (0, 1780)	0.5902234640669622
  (0, 2714)	0.5222377395943555
  (1, 7268)	0.21940078434757887
  (1, 6119)	0.35121748551629267
  (1, 1780)	0.6816883502576078
  (1, 2714)	0.603167113508637
  (2, 14404)	0.3667768250864863
  (2, 1097)	0.5006474126252586
  (2, 3213)	0.37327954112365785
  (2, 13021)	0.5642020708594914
  (2, 15598)	0.23292844007159214
  (2, 598)	0.3207953834351406
  (3, 13859)	0.8248173890812652
  (3, 14404)	0.5653992170751254
  (4, 5487)	1.0
  (5, 5361)	0.3244489260161152
  (5, 12270)	0.3504731474318233
  (5, 12581)	0.3933311318440236
  (5, 1067)	0.4309661425449059
  (5, 8523)	0.44719475766119954
  (5, 5487)	0.44035933099562957
  (5, 7268)	0.19380603799651552
  :	:
  (41241, 11328)	0.19365482851697285
  (41241, 142)	0.17351996182154605
  (41241, 9547)	0.1596945693056709
  (41241, 4725)	0.1539325524665391
  (41241, 7541)	0.14407758913052798
  (41241, 287)	0.1168400567051478
  (412

In [None]:
models = cross_validate(CatBoostClassifier(verbose=False, random_state=42),
                        x_train, y_train, 
                        cv=5, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(y_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nCatBoost CV scores: ", np.sqrt(-1*scores))
print("CatBoost CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from datetime import datetime
from math import sqrt

def seed_everything(seed):
    np.random.seed(seed)

seed_everything(42)  # Seed 고정

# 데이터 불러오기
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")


# 학습에 사용하지 않을 변수들을 제거합니다
x_train = train_df.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
y_train = train_df['price(원/kg)']

x_test = test_df.drop(columns=['ID', 'timestamp'])

# 질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    x_train[i] = le.fit_transform(x_train[i])
    x_test[i] = le.transform(x_test[i])  # test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

# 데이터 분할
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# RandomForest 모델 학습 및 예측
rf_model = RandomForestRegressor()
rf_model.fit(x_train, y_train)
rf_preds = rf_model.predict(x_val)

rf_rmse = sqrt(mean_squared_error(y_val, rf_preds))
print(f'RandomForest Validation RMSE: {rf_rmse}')

# Ridge 모델 학습 및 예측
ridge_model = Ridge()
ridge_model.fit(x_train, y_train)
ridge_preds = ridge_model.predict(x_val)

ridge_rmse = sqrt(mean_squared_error(y_val, ridge_preds))
print(f'Ridge Validation RMSE: {ridge_rmse}')

# CatBoost 모델 학습 및 예측
catboost_model = CatBoostRegressor()
catboost_model.fit(x_train, y_train)
catboost_preds = catboost_model.predict(x_val)

catboost_rmse = sqrt(mean_squared_error(y_val, catboost_preds))
print(f'CatBoost Validation RMSE: {catboost_rmse}')

# 가장 좋은 성능을 보인 모델 선택
best_model = rf_model
best_rmse = rf_rmse

if ridge_rmse < best_rmse:
    best_model = ridge_model
    best_rmse = ridge_rmse

if catboost_rmse < best_rmse:
    best_model = catboost_model
    best_rmse = catboost_rmse

# 테스트 데이터 예측
test_preds = best_model.predict(x_test)

# 결과를 sample_submission 양식에 맞게 저장
submission['answer'] = test_preds

# 파일 저장
time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
post_name = f"./submission_{time}.csv"
submission.to_csv(post_name, index=False, encoding="utf-8")

FileNotFoundError: [Errno 2] No such file or directory: './data/train.csv'