# 1. Import Libraries

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings raised
# by the libraries used below.
warnings.filterwarnings(action="ignore")

# Seed NumPy's global random number generator so that a fresh top-to-bottom run
# draws the same random numbers each time.
np.random.seed(seed=12345)



# 2. Read Data

In [2]:
# Read the news articles from an Excel workbook (the path is relative to the
# kernel's working directory).
data = pd.read_excel("news_dataset.xlsx")

# Display the loaded frame; the rendered output shows the title, body_text and
# labels columns.
data

Unnamed: 0,title,body_text,labels
0,"นักวิจัยหนุน ""แม้ว"" เปิด ""จีเอ็มโอ""",ประชาไท --- 23 ส.ค.2547 นักวิจัยฯ ชี้นโยบายจี...,environment
1,ภาคประชาชนต้านเปิดเสรีจีเอ็มโอ,ประชาไท- 23 ส.ค.2547 นักวิชาการ ภาคประชาชน จ...,environment
2,จุฬาฯ ห่วงจีเอ็มโอลามข้าวไทย,นโยบายที่อนุญาตให้ปลูกร่วมกับพืชอื่นได้นั้นถื...,environment
3,ฟองสบู่การเมืองแตก ทักษิณหมดกึ๋น ชนชั้นกลางหมด...,ประชาไท -- 23 ส.ค. 47 ขาประจำทักษิณ ฟันธง ฟอง...,politics
4,กอต.เสนอเลิกถนนคลองลาน-อุ้มผาง,ประชาไท-23 ส.ค.47 คณะกรรมการอนุรักษ์ ผืนป่าตะ...,environment
...,...,...,...
35315,เกษียร เตชะพีระ : ทำไมประเทศกูถึงมี “ลัทธิชังช...,\n\n“พูดให้ถึงที่สุด คนที่ถูกกล่าวหาว่าเป็นพวก...,politics
35316,EU แจงพร้อมร่วม หากไทยชวนมาสังเกตการณ์เลือกตั้...,สหภาพยุโรปยันไม่ได้เตรียมการลงพื้นที่สังเกตการ...,politics
35317,ใบตองแห้ง: Elite เสื่อมหนีตรวจสอบ,ดูเหมือนผมจะเป็นเสียงข้างน้อยในข้างน้อย ที่เห็...,politics
35318,ใบตองแห้ง: ตู่หน้าบางสังคมโง่,ลุงตู่ของคนชั้นกลางในเมืองเนี่ย เป็นคนมีเสน่ห์...,politics


# 3. Clean Data

## 3.1. Handle Missing Values

### 3.1.1. Check Missing Values

In [3]:
# Print per-column dtypes and non-null counts; a non-null count below the row
# count reveals missing values in that column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35320 entries, 0 to 35319
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      35320 non-null  object
 1   body_text  35318 non-null  object
 2   labels     35320 non-null  object
dtypes: object(3)
memory usage: 827.9+ KB


### 3.1.2. Remove Missing Values

In [4]:
# Drop, in place, every row that has a missing value in any column
# (the summary above reports 35318 of 35320 non-null body_text entries).
data.dropna(axis="index", how="any", inplace=True)

## 3.2. Handle Outliers

# 4. Split Dataset into Training & Test Sets

In [5]:
# Column holding the class label to predict.
target_name = "labels"

# Every other column of the frame is treated as an input feature.
feature_name = data.columns.drop(target_name).tolist()

In [6]:
# Separate the input columns (X) from the label column (y).
X, y = data[feature_name], data[target_name]

In [7]:
# Hold out 20% of the rows as the test set, shuffling before the split.
#
# `random_state` pins the shuffle to this cell instead of relying on whatever
# state the global NumPy generator happens to be in, so re-running the cell (or
# running cells out of order) always produces the same partition. The value
# matches the seed passed to np.random.seed at the top of the notebook.
# NOTE(review): with the same seed this is expected to reproduce the partition
# of a fresh top-to-bottom run of the original cell, provided nothing consumed
# the global generator beforehand -- confirm if exact continuity matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=12345
)

In [8]:
# Show which columns are used as input features.
feature_name

['title', 'body_text']

# 5. Data Preparation

## 5.1. Count Vectorization

In [9]:
# The Thai tokenizer imported below comes from the `pythainlp` package.
# Uncomment the next line to install it if it is missing:
# !pip install pythainlp

In [10]:
# Thai syllable tokenizer, passed to CountVectorizer as its tokenizer below
from pythainlp.tokenize import syllable_tokenize

### 5.1.1. title

#### 5.1.1.1. Training Set

In [11]:
# Raw training titles as a plain list of strings.
corpus_train = X_train["title"].tolist()

# Bag-of-syllables counter for titles: text is split with the Thai syllable
# tokenizer and the vocabulary is capped at 1000 entries.
title_vectorizer = CountVectorizer(tokenizer=syllable_tokenize, max_features=1000)

# Learn the vocabulary from the training titles only and count them in one
# pass; .toarray() converts the sparse count matrix to a dense array.
title_cnt_vec_train = title_vectorizer.fit_transform(corpus_train).toarray()

In [12]:
# One column name per vocabulary entry, prefixed so title counts can be told
# apart from other feature columns.
title_cnt_vec_feature_name = [
    f"cnt_title_{token}" for token in title_vectorizer.get_feature_names()
]

In [13]:
# Append the title count columns to the training frame ...
X_train[title_cnt_vec_feature_name] = title_cnt_vec_train
# ... then remove the raw title text, which the counts now replace.
X_train.drop(columns=["title"], inplace=True)

#### 5.1.1.2. Test Set

In [14]:
# Raw test titles as a plain list of strings.
corpus_test = X_test.loc[:, "title"].tolist()

# Count the test titles with the vocabulary already learned from the training
# titles (transform only, no refitting).
title_cnt_vec_test = title_vectorizer.transform(corpus_test).toarray()

In [15]:
# Append the title count columns to the test frame ...
X_test[title_cnt_vec_feature_name] = title_cnt_vec_test
# ... then remove the raw title text, which the counts now replace.
X_test.drop(columns=["title"], inplace=True)

### 5.1.2. body_text

#### 5.1.2.1. Training Set

In [None]:
# Raw training article bodies as a plain list of strings.
corpus_train = X_train["body_text"].tolist()

# Bag-of-syllables counter for article bodies: text is split with the Thai
# syllable tokenizer and the vocabulary is capped at 1000 entries.
text_vectorizer = CountVectorizer(tokenizer=syllable_tokenize, max_features=1000)

# Learn the vocabulary from the training bodies only and count them in one
# pass; .toarray() converts the sparse count matrix to a dense array.
text_cnt_vec_train = text_vectorizer.fit_transform(corpus_train).toarray()

In [None]:
# One column name per vocabulary entry, prefixed so body-text counts can be
# told apart from other feature columns.
text_cnt_vec_feature_name = [
    f"cnt_text_{token}" for token in text_vectorizer.get_feature_names()
]

In [None]:
# Append the body-text count columns to the training frame ...
X_train[text_cnt_vec_feature_name] = text_cnt_vec_train
# ... then remove the raw body text, which the counts now replace.
X_train.drop(columns=["body_text"], inplace=True)

#### 5.1.2.2. Test Set

In [None]:
# Raw test article bodies as a plain list of strings.
corpus_test = X_test.loc[:, "body_text"].tolist()

# Count the test bodies with the vocabulary already learned from the training
# bodies (transform only, no refitting).
text_cnt_vec_test = text_vectorizer.transform(corpus_test).toarray()

In [None]:
# Append the body-text count columns to the test frame ...
X_test[text_cnt_vec_feature_name] = text_cnt_vec_test
# ... then remove the raw body text, which the counts now replace.
X_test.drop(columns=["body_text"], inplace=True)

## ===== Ordinal Encoding & One Hot Encoding =====

In [None]:
# All numerical inputs are the count columns: title counts first, then
# body-text counts.
numerical_feature = [*title_cnt_vec_feature_name, *text_cnt_vec_feature_name]

# No categorical input columns are declared for this dataset.
categorical_feature = []

In [None]:
# List the distinct values of each categorical column (prints nothing while
# categorical_feature is empty).
for column in categorical_feature:
    levels = np.unique(X_train[column])
    print(f"{column} : {levels}")

In [None]:
# No categorical column is treated as ordinal ...
ordinal_feature = []
# ... so every categorical column is nominal (stored as an independent copy of
# the list).
nominal_feature = [*categorical_feature]

## 5.2. Ordinal Encoding

## 5.3. One Hot Encoding

### 5.3.1. Training Set

In [None]:
# Dense one-hot encoder; handle_unknown="ignore" is set so categories not seen
# during fitting do not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Learn the category levels from the nominal columns of the training set only.
one_hot_encoder.fit(X=X_train[nominal_feature])

In [None]:
# Build "<column>_<category>" names, one per encoder output column, walking
# the nominal columns in order and each column's learned categories in order.
one_hot_feature = [
    f"{column!s}_{category!s}"
    for column, categories in zip(nominal_feature, one_hot_encoder.categories_)
    for category in categories
]

In [None]:
# Append the one-hot columns for the training set ...
X_train[one_hot_feature] = one_hot_encoder.transform(X_train[nominal_feature])
# ... then remove the original nominal columns they replace.
X_train.drop(columns=nominal_feature, inplace=True)

### 5.3.2. Test Set

In [None]:
# Append the one-hot columns for the test set, using the encoder fitted on the
# training set ...
X_test[one_hot_feature] = one_hot_encoder.transform(X_test[nominal_feature])
# ... then remove the original nominal columns they replace.
X_test.drop(columns=nominal_feature, inplace=True)

## 5.4. Feature Scaling

### 5.4.1. Training Set

In [None]:
# Learn the scaling statistics from the training features only ...
scaler = StandardScaler().fit(X_train)
# ... and apply them to the same training features.
X_train_scaled = scaler.transform(X_train)

### 5.4.2. Test Set

In [None]:
# Scale the test features with the scaler fitted on the training set
# (transform only, so no test-set statistics are learned).
X_test_scaled = scaler.transform(X_test)

# 6. Model Creation

## 6.1. Setting Parameters

In [None]:
# Logistic regression with regularisation switched off.
# NOTE(review): newer scikit-learn releases spell this `penalty=None`; confirm
# against the installed version before upgrading.
clf = LogisticRegression(penalty="none")

## 6.2. Train Model

In [None]:
# Fit the classifier on the scaled training features and their labels.
clf.fit(X_train_scaled, y_train)

## 6.3. Model's Weight & Bias

In [None]:
# Learned feature weights of the fitted classifier.
clf.coef_

In [None]:
# Learned intercept (bias) of the fitted classifier.
clf.intercept_

# 7. Prediction

## 7.1. Training Set

In [None]:
# Predicted labels for the scaled training features.
y_pred_train = clf.predict(X_train_scaled)

## 7.2. Test Set

In [None]:
# Predicted labels for the scaled, held-out test features.
y_pred_test = clf.predict(X_test_scaled)

# 8. Model Evaluation

## 8.1. Training Set

### 8.1.1. Confusion Matrix

In [None]:
# Confusion matrix of the classifier on the training set, drawn on a 6x6 inch
# figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
plot_confusion_matrix(estimator=clf, X=X_train_scaled, y_true=y_train, ax=ax)

# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show()

### 8.1.2. Scoring

In [None]:
# Classification metrics on the training set, returned as a nested dict rather
# than a formatted string.
report = classification_report(
    y_true=y_train,
    y_pred=y_pred_train,
    output_dict=True,
)

In [None]:
# Overall accuracy taken from the report computed in the previous cell.
print(f"accuracy = {report['accuracy']}")

In [None]:
# Show the report as a table, transposed so each report entry becomes a row.
pd.DataFrame.from_dict(report).transpose()

## 8.2. Test Set

### 8.2.1. Confusion Matrix

In [None]:
# Confusion matrix of the classifier on the held-out test set, drawn on a 6x6
# inch figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))
plot_confusion_matrix(estimator=clf, X=X_test_scaled, y_true=y_test, ax=ax)

# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show()

### 8.2.2. Scoring

In [None]:
# Classification metrics on the held-out test set, returned as a nested dict.
# This rebinds `report`, replacing the training-set report.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred_test,
    output_dict=True,
)

In [None]:
# Overall accuracy taken from the report computed in the previous cell.
print(f"accuracy = {report['accuracy']}")

In [None]:
# Show the report as a table, transposed so each report entry becomes a row.
pd.DataFrame.from_dict(report).transpose()

# 9. Save Model

In [None]:
import pickle

In [None]:
# Persist the fitted classifier together with every preprocessing object and
# feature list needed to reproduce the pipeline at inference time.
#
# The tuple order is the on-disk contract: whoever loads the file must unpack
# the nine items in exactly this order.
#
# The file is opened in a `with` block so the handle is flushed and closed even
# if pickling fails part-way; the previous bare open(...) call never closed it.
#
# NOTE(review): the vectorizers were built with the pythainlp tokenizer, so
# pythainlp presumably has to be importable wherever this file is unpickled.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open("news_categorization_model.pickle", "wb") as model_file:
    pickle.dump(
        (
            clf,
            title_vectorizer,
            text_vectorizer,
            one_hot_encoder,
            scaler,
            feature_name,
            numerical_feature,
            ordinal_feature,
            nominal_feature,
        ),
        model_file,
    )