### 1. data_preprocessing

In [1]:
import pandas as pd
from konlpy.tag import Okt

In [2]:
okt = Okt()

In [3]:
def txt_to_csv(filepath):
    df = pd.DataFrame(columns=["sentence", "emotion"])
    with open(filepath, 'r') as f:
        s = f.readlines()
        for i, data in enumerate(s):
            df.loc[i] = [data.split(";")[0], data.split(";")[1].replace("\n", "")]
    return df
            

In [4]:
train_data = txt_to_csv("./datasets/train.txt")
test_data = txt_to_csv("./datasets/test.txt")
val_data = txt_to_csv("./datasets/val.txt")

In [5]:
data = pd.concat([train_data, test_data, val_data])
data.head()

Unnamed: 0,sentence,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
def emotion_to_num(x):
    if x == "anger": return 0
    elif x == "fear": return 1
    elif x == "joy": return 2
    elif x == "love": return 3
    elif x == "sadness": return 4
    elif x == "surprise": return 5
    
data["emotion"] = data["emotion"].apply(lambda x: emotion_to_num(x))
data.head()

Unnamed: 0,sentence,emotion
0,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned...,4
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,0


In [7]:
data.to_csv("./datasets/data.csv")

### 2. Main(fit, protect)

In [8]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("./datasets/data.csv", index_col=0)
data.head()

features = data["sentence"]
label = data["emotion"]
train_x, test_x, train_y, test_y = train_test_split(features, label, test_size=0.2, random_state=0)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(tokenizer = okt.morphs, ngram_range=(1, 2), min_df=3, max_df=0.9)
tfv.fit(train_x)
tfv_train_x = tfv.transform(train_x)
tfv_train_x



<16000x19806 sparse matrix of type '<class 'numpy.float64'>'
	with 426702 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

clf = LogisticRegression(random_state=0)
params = {"C": [1, 3, 5, 7, 9]}
grid_cv = GridSearchCV(clf, param_grid=params, cv = 3, scoring="accuracy", verbose=1)
grid_cv.fit(tfv_train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/pre

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=3, estimator=LogisticRegression(random_state=0),
             param_grid={'C': [1, 3, 5, 7, 9]}, scoring='accuracy', verbose=1)

In [15]:
grid_cv.best_params_

{'C': 9}

In [16]:
grid_cv.best_score_

0.8473127175435341

In [17]:
tfv_test_x = tfv.transform(test_x)
grid_cv.best_estimator_.score(tfv_test_x, test_y)

0.8715

In [29]:
def num_to_emotion(x):
    if x == 0: return "anger"
    elif x == 1: return "fear"
    elif x == 2: return "joy"
    elif x == 3: return "love"
    elif x == 4: return "sadness"
    elif x == 5: return "surprise"
    
a = ["oh my... oh no"]
sentence = tfv.transform(a)
emotion = num_to_emotion(grid_cv.best_estimator_.predict(sentence)[0])
emotion

'sadness'