### Naive Bayes Classifier Task
### 문장에서 느껴지는 감정 예측
##### 다중 분류(Multiclass Classification)
- 비대면 심리 상담사로서 메세지를 전달한 환자에 대한 감정 데이터를 수집했다.
- 각 메세지 별로 감정이 표시되어 있다.
- 미래에 동일한 메세지를 보내는 환자에게 어떤 심리 치료가 적합할 수 있는지 알아보기 위한 모델을 구축한다.

##### 🚩제시된 feature에 알맞은 target이 나올 수 있게 훈련한다.  
- 'Sweat deer': love  
- 'The moment I saw her, I realized something was wrong.': sadness

In [1]:
import pandas as pd
ms_df = pd.read_csv('./datasets/feeling.csv',sep=';')
ms_df
ms_df["feeling"].value_counts()

feeling
joy         6066
sadness     5216
anger       2434
fear        2149
love        1482
surprise     653
Name: count, dtype: int64

In [2]:
ms_df.describe().T
ms_df["feeling"].value_counts()

feeling
joy         6066
sadness     5216
anger       2434
fear        2149
love        1482
surprise     653
Name: count, dtype: int64

In [3]:
ms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  18000 non-null  object
 1   feeling  18000 non-null  object
dtypes: object(2)
memory usage: 281.4+ KB


In [4]:
from sklearn.preprocessing import LabelEncoder
feeling_encoder = LabelEncoder()
feeling_encoded= feeling_encoder.fit_transform(ms_df["feeling"])
ms_df["feeling"] = feeling_encoded
ms_df
feeling_encoder.classes_

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'],
      dtype=object)

In [5]:
from sklearn.model_selection import train_test_split

#샘플링
samples= []
for i in range(6):
    
    sample = ms_df[ms_df["feeling"]==i].sample(653)
    samples.append(sample)

sample_df = pd.concat(samples)
display(sample_df)

features = sample_df["message"]
targets = sample_df["feeling"]


X_train,X_test,y_train,y_test = train_test_split(features,targets,stratify=targets,random_state=124,test_size=0.1)


Unnamed: 0,message,feeling
3293,i feel that it is dangerous to portray angels ...,0
17283,when reading a newspaper story of a man who ha...,0
15133,i feel like it but i cant i cant give in i am ...,0
7941,i feel so impatient and sometimes i feel thank...,0
7231,i guess you could say i am teeter totering rig...,0
...,...,...
11968,i feel like itd be strange at the least and po...,5
8803,i started feeling dazed,5
4991,i also feel amazed happy fortunate and extreme...,5
324,i saw him on galaxies magazine i feel curious ...,5


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

countVectorizer = CountVectorizer()
x_train_vectorized = countVectorizer.fit_transform(X_train)

x_train_vectorized


navie_bayes_pipeline = Pipeline([('count_vectorizer', CountVectorizer()), ('naive_bayes', MultinomialNB())])
navie_bayes_pipeline.fit(X_train, y_train)

In [7]:
y_pred = navie_bayes_pipeline.predict(X_test)
print(X_test.index)
ms_df_test = sample_df.loc[X_test.index]
ms_df_test["y_pred"] = y_pred
ms_df_test

navie_bayes_pipeline.score(X_test, y_test)

Index([ 6605, 13666,  9023, 10320, 13500, 13842, 13584,  4895,  4781,  7786,
       ...
        7681,   583,  6807, 11435, 10708,  7980,  8541,  6900,  1584,  6724],
      dtype='int64', length=392)


0.7346938775510204

In [8]:
navie_bayes_pipeline.predict_proba(X_test)

array([[0.75853456, 0.02553766, 0.06302847, 0.06647645, 0.05252241,
        0.03390046],
       [0.05122867, 0.01233767, 0.2393093 , 0.61077527, 0.00513439,
        0.0812147 ],
       [0.03155106, 0.05604885, 0.15265582, 0.00777135, 0.35992419,
        0.39204874],
       ...,
       [0.00694071, 0.00543784, 0.01330527, 0.97049469, 0.00199376,
        0.00182772],
       [0.91855715, 0.0290406 , 0.00930506, 0.0179188 , 0.01886226,
        0.00631612],
       [0.08985015, 0.08787297, 0.25680471, 0.07949003, 0.04665726,
        0.43932489]])

In [9]:
# from sklearn.metrics import accuracy_score, precision_score , recall_score , confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score
# import matplotlib.pyplot as plt
# # 타겟 데이터와 예측 객체를 전달받는다.
# def get_evaluation(y_test, prediction, classifier=None, X_test=None):
# #     오차 행렬
#     confusion = confusion_matrix(y_test, prediction)
# #     정확도
#     accuracy = accuracy_score(y_test , prediction,average='macro')
# #     정밀도
#     precision = precision_score(y_test , prediction,average='macro')
# #     재현율
#     recall = recall_score(y_test , prediction,average='macro')
# #     F1 score
#     f1 = f1_score(y_test, prediction,average='macro')
# #     ROC-AUC
#     roc_auc = roc_auc_score(y_test, prediction,average='macro')

#     print('오차 행렬')
#     print(confusion)
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}, AUC:{4:.4f}'.format(accuracy , precision ,recall, f1, roc_auc))
#     print("#" * 75)
    
#     if classifier is not None and  X_test is not None:
#         fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8,4))
#         titles_options = [("Confusion matrix", None), ("Normalized confusion matrix", "true")]

#         for (title, normalize), ax in zip(titles_options, axes.flatten()):
#             disp = ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test, ax=ax, cmap=plt.cm.Blues, normalize=normalize)
#             disp.ax_.set_title(title)
#         plt.show()

In [10]:
# get_evaluation(y_test, y_pred, navie_bayes_pipeline, X_test)

In [46]:

print(feeling_encoder.classes_)
# navie_bayes_pipeline.predict(["Sweat deer","The moment I saw her, I realized something was wrong."])
# navie_bayes_pipeline.predict(["The moment I saw her, I realized something was wrong."])
navie_bayes_pipeline.predict(["i love you","oops","oh my god","really?","are you serious?","i think you are wrong","i'm so gloomy",
                              "i got what i want","coffee is good","i'm good","good to see you","crazy acade love sad angry"])

['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


array([3, 0, 3, 5, 2, 3, 4, 0, 2, 2, 2, 4])