In [21]:
import math, sys
from konlpy.tag import Twitter
import json

class BayesianFilter:
    def __init__(self):
        self.words = set()
        self.word_dict = {}
        self.category_dict = {}
        
    def split(self, text):
        results = []
        twitter = Twitter()
        malist = twitter.pos(text, norm=True, stem=True)
        for word in malist:
            if not word[1] in ["Josa", "Eomi", "Punctuation", "Foreign", "Number", "Alpha"]:
                results.append(word[0])
        return results
    
    def inc_word(self, word, category):
        if not category in self.word_dict:
            self.word_dict[category] = {}
        if not word in self.word_dict[category]:
            self.word_dict[category][word] = 0
        self.word_dict[category][word] += 1
        self.words.add(word)
    
    def inc_category(self, category):
        if not category in self.category_dict:
            self.category_dict[category] = 0
        self.category_dict[category] += 1
        
    def fit(self, text, category):
        word_list = self.split(text)
        for word in word_list:
            self.inc_word(word, category)
        self.inc_category(category)
        
    def score(self, words, category):
        score = math.log(self.category_prob(category))
        for word in words:
            score += math.log(self.word_prob(word, category))
        return score
    
    def predict(self, text):
        best_category = None
        max_score = -sys.maxsize
        words = self.split(text)
        score_list = []
        for category in self.category_dict.keys():
            score = self.score(words, category)
            score_list.append((category, score))
            if score > max_score:
                max_score = score
                best_category = category
        return best_category, score_list
    
    def get_word_count(self, word, category):
        if word in self.word_dict[category]:
            return self.word_dict[category][word]
        else:
            return 0
        
    def category_prob(self, category):
        sum_categories = sum(self.category_dict.values())
        category_v = self.category_dict[category]
        return category_v / sum_categories
    
    def word_prob(self, word, category):
        n = self.get_word_count(word, category) + 1
        d = sum(self.word_dict[category].values()) + len(self.words)
        return n /d

In [22]:
# 데이터 정규화하기

with open('익스포트_100.csv') as file:
    csv_data = []
    for line in file.readlines():
        csv_data.append(line.split(','))

regular_data = [["" for x in range(2)] for y in range(len(csv_data))]
regular_label = [["" for x in range(2)] for y in range(len(csv_data))]
print(len(csv_data))

for i in range(1, len(csv_data)):
    regular_data[i][0] = csv_data[i][2] #mpname
    regular_data[i][0] += csv_data[i][3] #mpstand
    regular_data[i][1] = csv_data[i][3] #mpstand
    
    regular_label[i][0] = csv_data[i][6] #pname
    regular_label[i][0] += csv_data[i][10] #pmake
    regular_label[i][1] = csv_data[i][10] #pmake



46134


In [23]:
train_length = len(csv_data) -12;
test_length = 12;
bf = BayesianFilter()
bf_make = BayesianFilter()

In [24]:
for i in range(1, train_length):
    bf.fit(regular_data[i][0], regular_label[i][0]);
    bf_make.fit(regular_data[i][1], regular_label[i][1]);

In [34]:
for i in range(train_length + 1, train_length + test_length):
    pre, scorelist = bf.predict(regular_data[i][0])
    pre_make, scorelist_make = bf_make.predict(regular_data[i][1])
    posibility = scorelist[0][1] 
    position = 0
    posibility_make = scorelist_make[0][1] 
    position_make = 0
    
    print("입력값 = ", regular_data[i][0])
    print("추측 결과 = ", pre)
    print("실제 결과 = ", regular_label[i][0])
    
    for j in range(1, len(scorelist)):
        if (posibility < scorelist[j][1]):
            posibility = scorelist[j][1]
            position = j
    
    print("입력 메이커 = ", regular_data[i][1])
    print("추측 메이커 = ", pre_make)
    print("실제 메이커 = ", regular_label[i][1])
    for j in range(1, len(scorelist_make)):
        if (posibility_make < scorelist_make[j][1]):
            posibility_make = scorelist_make[j][1]
            position_make = j
    
    print("결과 = ", scorelist[position], scorelist_make[position_make])

    print("\n")
    

입력값 =  옥수수통조림/가당2.10kg 스위트콘/ 2.1kg/캔/ 원산지 및 유통기한 표시/오뚜기/이츠웰
추측 결과 =  스위트콘오뚜기
실제 결과 =  스위트콘오뚜기
입력 메이커 =  2.10kg 스위트콘/ 2.1kg/캔/ 원산지 및 유통기한 표시/오뚜기/이츠웰
추측 메이커 =  오뚜기
실제 메이커 =  오뚜기
결과 =  ('스위트콘오뚜기', -58.375670235701406) ('오뚜기', -47.88746868931776)


입력값 =  맛술(미림)/맛술(미림)1.80l 미향/ 오뚜기/ 1.8L/통/ 유통기한표시
추측 결과 =  미향오뚜기
실제 결과 =  미향오뚜기
입력 메이커 =  1.80l 미향/ 오뚜기/ 1.8L/통/ 유통기한표시
추측 메이커 =  오뚜기
실제 메이커 =  오뚜기
결과 =  ('미향오뚜기', -43.30090041322326) ('오뚜기', -32.69292043593638)


입력값 =  팽이버섯 / 생것(친환경농산물) 친환경/ 무농약/국산/상품/갓이피어있지않고탄력이있는것/진공포장
추측 결과 =  친환경팽이버섯
실제 결과 =  친환경팽이버섯
입력 메이커 =  (친환경농산물) 친환경/ 무농약/국산/상품/갓이피어있지않고탄력이있는것/진공포장
추측 메이커 =  
실제 메이커 =  
결과 =  ('친환경팽이버섯', -83.5189793228511) ('', -88.49433659143854)


입력값 =  피망 / 녹색과(일반농산물) 국산/청피망/상품/윤택이나고단단한것
추측 결과 =  청피망
실제 결과 =  청피망
입력 메이커 =  (일반농산물) 국산/청피망/상품/윤택이나고단단한것
추측 메이커 =  
실제 메이커 =  
결과 =  ('청피망', -47.7492074239179) ('', -48.443148778886176)


입력값 =  표고버섯 / 참나무/생것(생것)(일반농산물) 국산/상품/갓이둥그렇고탄력이있는것/신선
추측 결과 =  표고버섯
실제 결과 =  표고버섯
입력 메이커 =  (일반농산물) 국산/상품/갓이둥그렇고탄력이있는

In [35]:
data = input()
data_make = input()

pre, scorelist = bf.predict(data)
pre_make, scorelist_make = bf_make.predict(data_make)
posibility = scorelist[0][1] 
position = 0
posibility_make = scorelist_make[0][1] 
position_make = 0
    
print("입력값 = ", data)
print("추측 결과 = ", pre)
    
for j in range(1, len(scorelist)):
    if (posibility < scorelist[j][1]):
        posibility = scorelist[j][1]
        position = j
    
print("입력 메이커 = ", data_make)
print("추측 메이커 = ", pre_make)
for j in range(1, len(scorelist_make)):
    if (posibility_make < scorelist_make[j][1]):
        posibility_make = scorelist_make[j][1]
        position_make = j
    
print("결과 = ", scorelist[position], scorelist_make[position_make])

사과
풀무원
입력값 =  사과
추측 결과 =  2배사과식초오뚜기
입력 메이커 =  풀무원
추측 메이커 =  씨제이
결과 =  ('2배사과식초오뚜기', -7.50621225916094) ('씨제이', -7.02418683436164)


In [9]:
'''
import pickle
bf_test = BayesianFilter()
bf_make_test = BayesianFilter()

with open('bf_word_dict.pickle', 'wb') as handle:
    pickle.dump(bf.word_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open('bf_category_dict.pickle', 'wb') as handle:
    pickle.dump(bf.category_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open('bf_make_word_dict.pickle', 'wb') as handle:
    pickle.dump(bf_make.word_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open('bf_make_category_dict.pickle', 'wb') as handle:
    pickle.dump(bf_make.category_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open('bf_word_dict.pickle', 'rb') as handle:
    bf_test.word_dict = pickle.load(handle)

with open('bf_category_dict.pickle', 'rb') as handle:
    bf_test.category_dict = pickle.load(handle)

with open('bf_make_word_dict.pickle', 'rb') as handle:
    bf_make_test.word_dict = pickle.load(handle)

with open('bf_make_category_dict.pickle', 'rb') as handle:
    bf_make_test.category_dict = pickle.load(handle)
'''