#Install Library

In [None]:
# Install the third-party packages this notebook needs. Versions are not
# pinned, so a fresh runtime gets whatever release is current.
# NOTE(review): `emoji` is not imported in any cell visible here -- confirm
# it is still required.
!pip install pythainlp
!pip install emoji

In [None]:
import pandas as pd
# Display settings: never truncate cell text, and allow up to 999 rows
# to be rendered when a frame is shown.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 999)

#Load data

In [None]:
# Raw CSV of the collected posts, fetched over HTTPS from GitHub. The path
# contains a fixed revision hash rather than a branch name.
dataset_url = 'https://raw.githubusercontent.com/SpriteKitz/FakeNewsProject/c84aca1222c69a5bbd4b44f6a0a6d92d73903e1a/FackNewstAllPostDatases.csv'
# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(dataset_url)

In [None]:
# Preview the first 20 rows of the loaded frame.
df.head(20)

#Text Preprocess

##Labels definition

In [None]:
import re
def LabelDefinition(row):
  """Derive the news label of a post from the hashtag found in its text.

  Args:
    row: Object exposing a ``Content`` attribute, e.g. the row handed over
      by ``df.apply(LabelDefinition, axis=1)``.

  Returns:
    str: 'ข่าวปลอม', 'ข่าวบิดเบือน' or 'ข่าวจริง' when the matching hashtag
    occurs in the text (checked in that order), otherwise 'ข่าวโพสทั่วไป'.
  """
  default_label = 'ข่าวโพสทั่วไป'
  content = row.Content
  # A missing value (None, or NaN which is a float) would make a regex or
  # substring search raise TypeError; treat it as an untagged post instead.
  if not isinstance(content, str):
    return default_label
  # Order matters: a post carrying several hashtags takes the first match.
  # The original patterns were plain literals, so a substring test is
  # equivalent to the regex search.
  for label in ('ข่าวปลอม', 'ข่าวบิดเบือน', 'ข่าวจริง'):
    if '#' + label in content:
      return label
  return default_label

In [None]:
# Run LabelDefinition on every row (axis=1) and store the result in a new
# 'Label' column.
df['Label'] = df.apply(LabelDefinition, axis=1)

In [None]:
# Show the post text next to its derived label for the first 20 rows.
df[['Content', 'Label']].head(20)

In [None]:
# Number of rows per label.
df.groupby('Label').size()

## Clean text

In [None]:
# NOTE: `clean_df` is the same object as `df` at this point, so the 'News'
# column added below also appears on `df`.
clean_df = df
# Boilerplate to strip from each post: fake-news banner text up to a '!' or
# '❌', the "จริงหรือ" teaser, the "ข่าวบิดเบือน " prefix, "read more" tails,
# the source-credit line, hashtags and mentions (to end of line), and
# newline / tab characters.
# NOTE(review): the unescaped `?` after "จริงหรือ" makes the last character
# optional rather than matching a literal question mark -- confirm intent.
clean_pattern = r'ข่าวปลอม.*(!|❌)|จริงหรือ?|ข่าวบิดเบือน |อ่าน(เพิ่ม|ต่อ).*|- ศูนย์ต่อต้านข่าวปลอม.*|#.*|\@.*|\n|\t'
# regex=True must be explicit: with pandas >= 2.0 the default is a literal
# (non-regex) replacement, which would leave the text completely uncleaned.
clean_df['News'] = clean_df['Content'].str.replace(clean_pattern, '', regex=True)
clean_df['News'] = clean_df['News'].str.strip()
# Keep only rows whose cleaned text is longer than 20 characters; rows with
# missing text compare as False and are dropped as well.
collect_news = (clean_df['News'].str.len() > 20)
clean_df = clean_df.loc[collect_news]
clean_df[['Content', 'News', 'Label']].head(100)

#Build Model

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.pipeline import Pipeline
from pythainlp import word_tokenize
from pythainlp.ulmfit import process_thai

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix, plot_confusion_matrix


In [None]:
# Features are the cleaned news text; targets are the hashtag-derived labels.
train_test_df = clean_df['News']
label = clean_df['Label']
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore every score reported further down -- reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(train_test_df, label, test_size=0.2, random_state=42)

In [None]:
# Sanity check: print what process_thai returns for one sample sentence.
print(process_thai('ยาแอสไพริน (Aspirin) ช่วยป้องกันโรคหลอดเลือดหัวใจได้ แต่ต้องปรึกษาแพทย์ก่อนใช้ยา '))

TfidfVectorizer Ref: [Click link](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
def _make_text_pipeline(step_name, estimator):
    """Build an unfitted TF-IDF -> classifier pipeline for Thai text.

    Args:
        step_name: Name given to the final pipeline step. The training loop
            uses this name as the key under which the fitted model is stored.
        estimator: Unfitted scikit-learn classifier used as the final step.

    Returns:
        Pipeline: two steps, 'tfidf' (uni- and bi-grams over tokens produced
        by ``process_thai``) followed by ``step_name``.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=process_thai, ngram_range=(1,2))),
        (step_name, estimator),
    ])


# SVC with a linear kernel (this is sklearn's SVC, not LinearSVC).
# probability=True is needed because predict_proba is called on this model
# later; random_state fixes the internal shuffling used for that calibration.
lsvc = _make_text_pipeline('SVC', SVC(kernel='linear', probability=True, random_state=1))
# SGDClassifier, one-vs-rest; seeded so repeated runs give the same model.
sgd = _make_text_pipeline('SGDClassifier', OneVsRestClassifier(SGDClassifier(random_state=1)))
# LogisticRegression, one-vs-rest.
lr = _make_text_pipeline('LogisticRegression', OneVsRestClassifier(LogisticRegression(solver='lbfgs')))
# RandomForestClassifier, one-vs-rest.
rf = _make_text_pipeline(
    'RandomForestClassifier',
    OneVsRestClassifier(RandomForestClassifier(max_depth=100,max_leaf_nodes=100,max_features=100, random_state=1)),
)

In [None]:
# Fit each candidate pipeline on the training split, keep the fitted model
# under the name of its final step, and print its accuracy on the test split.
text_clf = {}
for classifier in (lsvc, sgd, lr, rf):
    step_name = classifier.steps[1][0]
    clf = classifier.fit(X_train, y_train)
    pred_label = clf.predict(X_test)
    text_clf[step_name] = clf
    accuracy_pct = round(accuracy_score(pred_label, y_test) * 100.0, 2)
    print(step_name)
    print('Accuracy score : ', accuracy_pct, "%")
    print("==================================\n")

##Tuning Model

In [None]:
# # LinearSVC
# pipeline  = Pipeline([
#     ('tfidf', TfidfVectorizer(tokenizer=process_thai)),
#     ('SVC', OneVsRestClassifier(SVC(kernel='linear',probability=True)))
# ])

# parameters = {
#               'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
#               'tfidf__use_idf': (True, False),
#               'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
#               'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
#               'tfidf__smooth_idf': (True, False),
#               'tfidf__norm': ('l1', 'l2', None),
#               }

# grid = GridSearchCV(pipeline, parameters, cv=3, verbose=1)#cv = k folds cross vald
# grid.fit(X_train, y_train)
# fake_news_classify = grid.best_estimator_

#Evaluate model

In [None]:
# Display the fitted pipeline stored under the 'SVC' key.
text_clf['SVC']

In [None]:
from sklearn.metrics import classification_report
# Set model for predict
fake_news_classify= text_clf['SVC']

pred_label = fake_news_classify.predict(X_test)
y_true = y_test.to_numpy()
y_pred = pred_label
# classification_report pairs target_names positionally with the label list,
# which defaults to the labels in sorted order. A hand-written list in any
# other order silently attaches metrics to the wrong class, so take the
# names (and their order) from the fitted model. Passing labels= as well
# keeps all classes in the report even if one is absent from the test split.
target_names = list(fake_news_classify.classes_)
print(classification_report(y_true, y_pred, labels=target_names, target_names=target_names, digits=2))

#Test

In [None]:
# Install gradio quietly (-q) for the interactive demo; no version is pinned.
!pip install -q gradio

In [None]:
import gradio as gr
import os
def myApp(text, model=None):
  """Return the per-label probabilities for one news text.

  Args:
    text: The news text to classify.
    model: Optional fitted classifier exposing ``predict_proba`` and
      ``classes_``. Defaults to the notebook-level ``fake_news_classify``.

  Returns:
    dict: label -> probability (as a plain float) for every class the model
    knows.
  """
  if model is None:
    model = fake_news_classify
  probabilities = model.predict_proba([text])[0]
  # predict_proba columns follow model.classes_; a hard-coded label list in a
  # different order would attribute probabilities to the wrong labels.
  return {str(label): float(p) for label, p in zip(model.classes_, probabilities)}

# Live demo UI: a multi-line text box feeding myApp, with the returned
# label -> probability mapping shown as a label widget (top 4 classes).
# NOTE(review): gr.inputs / gr.outputs, capture_session and test_launch look
# like older-gradio API, and the install cell does not pin a version --
# confirm these still exist in the installed release.
news_box = gr.inputs.Textbox(label='News', placeholder="Enter News Text...", lines=9)
label_view = gr.outputs.Label(num_top_classes=4)
iface = gr.Interface(
    fn=myApp,
    inputs=news_box,
    outputs=label_view,
    live=True,
    interpretation="default",
    capture_session=True,
)
iface.test_launch()

if __name__ == "__main__":
    iface.launch()