# 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import (
    OrdinalEncoder, 
    OneHotEncoder, 
    StandardScaler, 
    MinMaxScaler
)
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix, classification_report

import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell output tidy, but it also hides deprecation notices raised by the
# libraries used below.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so that repeated runs give the same
# results.
np.random.seed(12345)

# 2. Read Data

In [None]:
# Load the two source files (fake articles first, then true articles) into
# one DataFrame each.
fake_news_data, true_news_data = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/Fake.csv", "dataset/True.csv")
)

In [None]:
# Tag every row with the file it came from. The two frames are merged
# afterwards, and this "class" column is what the model learns to predict.
fake_news_data["class"] = "fake"
true_news_data["class"] = "true"

In [None]:
# Stack the fake and the true articles into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# file keeps its own 0-based row labels, so the low labels appear twice in
# the merged frame and any label-based selection becomes ambiguous.
data = pd.concat([fake_news_data, true_news_data], axis=0, ignore_index=True)

# The publication date is not used as a feature.
data.drop(["date"], axis=1, inplace=True)

# Display the merged frame as the cell output.
data

# 3. Clean Data

## 3.1. Handle Missing Values

### 3.1.1. Check Missing Values

In [None]:
# Print column dtypes and non-null counts; a non-null count lower than the
# number of entries means the column has missing values.
data.info()

### 3.1.2. Remove Missing Values (Listwise Deletion)

In [None]:
# Listwise deletion: discard, in place, every row that has a missing value in
# at least one column.
data.dropna(axis=0, inplace=True)

## 3.2. Handle Outliers

### 3.2.1. Check Outliers

In [None]:
# Per-column summary statistics, shown as the cell output and used here to
# look for anomalous values.
data.describe()

### 3.2.2. Remove Outliers

In [None]:
# No outliers to remove.

# 4. Split Dataset into Training & Test Sets

In [None]:
# The label column is the target; every remaining column is a feature.
target_name = "class"
feature_name = data.columns.drop(target_name).tolist()

In [None]:
# Separate the feature columns from the label column.
X, y = data[feature_name], data[target_name]

In [None]:
# Hold out 20% of the shuffled rows as the test set; the remaining 80% form
# the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
)

# 5. Data Preparation

## 5.1. Count Vectorization

### 5.1.1. title

#### 5.1.1.1. Training Set

In [None]:
# Learn a vocabulary of at most 1000 terms from the training titles only and
# encode every training title as a dense row of term counts.
corpus_train = X_train["title"].tolist()
title_vectorizer = CountVectorizer(max_features=1000)
title_cnt_vec_train = title_vectorizer.fit_transform(corpus_train).toarray()

In [None]:
# Build one column name per vocabulary term, prefixed so the title counts
# stay distinguishable from every other feature column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement get_feature_names_out() when the installed release
# provides it and fall back to the old accessor otherwise.
if hasattr(title_vectorizer, "get_feature_names_out"):
    title_vocabulary = title_vectorizer.get_feature_names_out()
else:
    title_vocabulary = title_vectorizer.get_feature_names()

title_cnt_vec_feature_name = [
    "cnt_title_" + feature for feature in title_vocabulary
]

In [None]:
# Add one count column per vocabulary term, then remove the raw title text
# now that it has been encoded.
X_train[title_cnt_vec_feature_name] = title_cnt_vec_train
X_train.drop(columns="title", inplace=True)

#### 5.1.1.2. Test Set

In [None]:
# Encode the test titles with the vectorizer fitted on the training titles.
# Only transform() is called, so the vocabulary is not re-learned from the
# test data.
corpus_test = X_test["title"].tolist()
title_cnt_vec_test = title_vectorizer.transform(corpus_test).toarray()

In [None]:
# Add the title count columns to the test set under the same names as in the
# training set, then remove the raw title text.
X_test[title_cnt_vec_feature_name] = title_cnt_vec_test
X_test.drop(columns="title", inplace=True)

### 5.1.2. text

#### 5.1.2.1. Training Set

In [None]:
# Learn a vocabulary of at most 1000 terms from the training article bodies
# only and encode every training article as a dense row of term counts.
corpus_train = X_train["text"].tolist()
text_vectorizer = CountVectorizer(max_features=1000)
text_cnt_vec_train = text_vectorizer.fit_transform(corpus_train).toarray()

In [None]:
# Build one column name per vocabulary term, prefixed so the body-text counts
# stay distinguishable from every other feature column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement get_feature_names_out() when the installed release
# provides it and fall back to the old accessor otherwise.
if hasattr(text_vectorizer, "get_feature_names_out"):
    text_vocabulary = text_vectorizer.get_feature_names_out()
else:
    text_vocabulary = text_vectorizer.get_feature_names()

text_cnt_vec_feature_name = [
    "cnt_text_" + feature for feature in text_vocabulary
]

In [None]:
# Add one count column per vocabulary term, then remove the raw article text
# now that it has been encoded.
X_train[text_cnt_vec_feature_name] = text_cnt_vec_train
X_train.drop(columns="text", inplace=True)

#### 5.1.2.2. Test Set

In [None]:
# Encode the test article bodies with the vectorizer fitted on the training
# bodies. Only transform() is called, so the vocabulary is not re-learned
# from the test data.
corpus_test = X_test["text"].tolist()
text_cnt_vec_test = text_vectorizer.transform(corpus_test).toarray()

In [None]:
# Add the body-text count columns to the test set under the same names as in
# the training set, then remove the raw article text.
X_test[text_cnt_vec_feature_name] = text_cnt_vec_test
X_test.drop(columns="text", inplace=True)

## ===== Ordinal Encoding & One Hot Encoding =====

In [None]:
# "subject" is the only categorical feature; the numerical features are the
# title count columns followed by the body-text count columns.
categorical_feature = ["subject"]
numerical_feature = [*title_cnt_vec_feature_name, *text_cnt_vec_feature_name]

In [None]:
# List the distinct values (sorted by np.unique) that each categorical
# feature takes in the training set.
for feature in categorical_feature:
    print(feature, ":", np.unique(X_train[feature]))

In [None]:
# No categorical feature is encoded ordinally; all of them are treated as
# nominal. The nominal list is an independent copy of the categorical list.
ordinal_feature = []
nominal_feature = categorical_feature.copy()

## 5.2. Ordinal Encoding

## 5.3. One Hot Encoding

### 5.3.1. Training Set

In [None]:
# Dense-output one-hot encoder. handle_unknown="ignore" makes a category that
# was not seen during fitting encode as all zeros instead of raising.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 - confirm against the installed release.
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
# Learn the category set of every nominal feature from the training data only.
one_hot_encoder.fit(X_train[nominal_feature])

In [None]:
# One column name per (feature, category) pair, formatted as
# "<feature>_<category>". Features are walked in the order they were given to
# the encoder and categories in the order the encoder stored them.
one_hot_feature = [
    f"{feature}_{cate}"
    for feature, categories in zip(nominal_feature, one_hot_encoder.categories_)
    for cate in categories
]

In [None]:
# Add the one-hot indicator columns to the training set, then remove the raw
# nominal columns they were derived from.
X_train[one_hot_feature] = one_hot_encoder.transform(X_train[nominal_feature])
X_train.drop(columns=nominal_feature, inplace=True)

### 5.3.2. Test Set

In [None]:
# Encode the test set with the encoder fitted on the training set, then
# remove the raw nominal columns.
X_test[one_hot_feature] = one_hot_encoder.transform(X_test[nominal_feature])
X_test.drop(columns=nominal_feature, inplace=True)

## 5.4. Feature Scaling

### 5.4.1. Training Set

In [None]:
# Learn the scaling statistics from the training set, then standardize the
# training set with them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

### 5.4.2. Test Set

In [None]:
# Standardize the test set with the scaler fitted on the training set; the
# scaler is not re-fitted on test data.
X_test_scaled = scaler.transform(X_test)

# 6. Model Creation

## 6.1. Setting Parameters

In [None]:
# Support vector classifier with the library's default hyperparameters (no
# argument is overridden).
clf = SVC()

## 6.2. Train Model

In [None]:
# Train the classifier on the standardized training features and their labels.
clf.fit(X_train_scaled, y_train)

## 6.3. Model's Weight & Bias

In [None]:
# Show the fitted dual coefficients of the support vectors in the decision
# function.
clf.dual_coef_

In [None]:
# Show the fitted constant (bias) term of the decision function.
clf.intercept_

# 7. Prediction

## 7.1. Training Set

In [None]:
# Predict labels for the data the model was trained on.
y_pred_train = clf.predict(X_train_scaled)

## 7.2. Test Set

In [None]:
# Predict labels for the held-out test data.
y_pred_test = clf.predict(X_test_scaled)

# 8. Model Evaluation

## 8.1. Training Set

### 8.1.1. Confusion Matrix

In [None]:
# Draw the training-set confusion matrix on a 6x6 inch figure.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
fig, ax = plt.subplots(figsize=(6, 6))
plot_confusion_matrix(clf, X_train_scaled, y_train, ax=ax)
# Rotate the x tick labels to vertical.
plt.xticks(rotation=90)
plt.show()

### 8.1.2. Scoring

In [None]:
# Compute the classification report for the training set; output_dict=True
# returns it as a dictionary instead of a formatted string.
report = classification_report(y_train, y_pred_train, output_dict=True)

In [None]:
# Print the training-set accuracy taken from the report.
print('accuracy =', report['accuracy'])

In [None]:
# Show the training-set report as a table, transposed so that each report
# entry becomes a row.
pd.DataFrame.from_dict(report).T

## 8.2. Test Set

### 8.2.1. Confusion Matrix

In [None]:
# Draw the test-set confusion matrix on a 6x6 inch figure.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
fig, ax = plt.subplots(figsize=(6, 6))
plot_confusion_matrix(clf, X_test_scaled, y_test, ax=ax)
# Rotate the x tick labels to vertical.
plt.xticks(rotation=90)
plt.show()

### 8.2.2. Scoring

In [None]:
# Compute the classification report for the test set; output_dict=True
# returns it as a dictionary instead of a formatted string. This overwrites
# the training-set report held in the same variable.
report = classification_report(y_test, y_pred_test, output_dict=True)

In [None]:
# Print the test-set accuracy taken from the report.
print('accuracy =', report['accuracy'])

In [None]:
# Show the test-set report as a table, transposed so that each report entry
# becomes a row.
pd.DataFrame.from_dict(report).T

# 9. Save Model

In [None]:
import pickle

In [None]:
# Persist the trained classifier together with every fitted preprocessing
# object and the feature-name lists as one tuple. The order of the tuple is
# part of the file format: whatever loads the pickle must unpack it in the
# same order.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises; the previous bare open() call never closed it.
with open('support_vector_classification_model.pickle', 'wb') as model_file:
    pickle.dump(
        (
            clf,
            title_vectorizer,
            text_vectorizer,
            one_hot_encoder,
            scaler,
            feature_name,
            numerical_feature,
            ordinal_feature,
            nominal_feature,
        ),
        model_file,
    )