## Imports

In [None]:
import sys
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

In [None]:
%matplotlib inline

In [None]:
# Give every figure below the ggplot look and a 12x8-inch default canvas.
plt.style.use("ggplot")
plt.rcParams.update({'figure.figsize': (12, 8)})

## NLTK Downloads

In [None]:
# Fetch the stop-word corpus that Feature_Engineering reads via corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Fetch the WordNet corpus needed by WordNetLemmatizer.
nltk.download('wordnet')

## Warning filter

In [None]:
import warnings

In [None]:
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

## SQL Queries

In [None]:
# X_train: speaker, headline (statement) and body (fullText_based_content)
# of every row in the "fnn_train" table.
X_train_query: str = '''
SELECT 
    "speaker"                as "speaker",
    "statement"              as "headline",
    "fullText_based_content" as "body"
FROM "fnn_train";
'''.strip()

In [None]:
# X_test: same three columns, read from the "fnn_test" table.
X_test_query: str = '''
SELECT 
    "speaker"                as "speaker",
    "statement"              as "headline",
    "fullText_based_content" as "body"
FROM "fnn_test";
'''.strip()

In [None]:
# Y_train: the "label_fnn" column of "fnn_train", exposed as "target".
Y_train_query: str = '''
SELECT 
    "label_fnn" as "target" 
FROM "fnn_train";
'''.strip()

In [None]:
# Y_test: the "label_fnn" column of "fnn_test", exposed as "target".
Y_test_query: str = '''
SELECT 
    "label_fnn" as "target" 
FROM "fnn_test";
'''.strip()

## Reading Database

In [None]:
import sqlite3

In [None]:
# Load the four splits from the SQLite file.  The try/finally guarantees the
# connection is released even when one of the queries raises.
conn = sqlite3.connect("data.sqlite3")
try:
    X_train = pd.read_sql(X_train_query, con=conn)
    X_test = pd.read_sql(X_test_query, con=conn)
    y_train = pd.read_sql(Y_train_query, con=conn)
    y_test = pd.read_sql(Y_test_query, con=conn)
finally:
    conn.close()

## Dropping Columns

In [None]:
# Only the article body is kept as model input for the training split.
X_train.drop(columns=["speaker", "headline"], inplace=True)

In [None]:
# Only the article body is kept as model input for the test split.
X_test.drop(columns=["speaker", "headline"], inplace=True)

## Label Encoding of the Categories

In [None]:
# Encode the textual labels as integers: real -> 0, fake -> 1.
# A direct mapping replaces the former category cast followed by substring
# replacement, which left the column holding the *strings* '0' / '1' rather
# than numeric class ids.
_TARGET_CODES = {'real': 0, 'fake': 1}

for _labels in (y_train, y_test):
    _encoded = _labels['target'].map(_TARGET_CODES)
    if _encoded.isna().any():
        # Fail loudly instead of silently passing unmapped labels to the model.
        _unknown = sorted(set(_labels['target'][_encoded.isna()].astype(str)))
        raise ValueError(f"Unexpected target labels: {_unknown}")
    _labels['target'] = _encoded.astype('int64')

# Do not leave the loop temporaries behind in the notebook namespace.
del _labels, _encoded

## Converting to Lowercase and Removing Punctuation

In [None]:
# Lower-case the article text and remove every character that is neither a
# word character nor whitespace.
#
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation
# in place.  The raw string keeps \w and \s from being parsed as (invalid)
# string escape sequences.
_PUNCTUATION_PATTERN = r'[^\w\s]'

X_train['body'] = X_train['body'].str.lower().str.replace(
    _PUNCTUATION_PATTERN, '', regex=True
)
X_test['body'] = X_test['body'].str.lower().str.replace(
    _PUNCTUATION_PATTERN, '', regex=True
)

## Feature Engineering (Stopwords, Lemmatization)

In [None]:
from tqdm import tqdm_notebook as tqdm
from nltk import corpus
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer

In [None]:
# One progress-bar tick per article, training and test splits combined.
total_row: int = len(X_train) + len(X_test)
progress_bar = tqdm(total=total_row)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(corpus.stopwords.words("english"))


@lru_cache(maxsize=None)
def _wordnet_lemmatizer():
    """Return one shared WordNetLemmatizer instance."""
    return WordNetLemmatizer()


def Feature_Engineering(txt: str, stopwords: bool, lemmatization: bool):
    """Tokenize one document and optionally filter / lemmatize its tokens.

    Parameters
    ----------
    txt : str
        Raw document text.  Any non-string value (e.g. a missing body read
        from the database) is treated as an empty document.
    stopwords : bool
        When true, drop tokens found in NLTK's English stop-word list.
    lemmatization : bool
        When true, replace each token by its WordNet lemma.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Side effects
    ------------
    Advances the module-level ``progress_bar`` by one step per call.
    """
    # Missing bodies arrive as None/NaN; treat them as empty text instead of
    # failing on ``.strip()``.
    if not isinstance(txt, str):
        txt = ''

    # Tokenizing
    tokens = WhitespaceTokenizer().tokenize(txt.strip())

    # Stopwords: the cached frozenset gives O(1) membership tests and avoids
    # re-reading the corpus for every row.
    if stopwords:
        stopword_set = _english_stopwords()
        tokens = [word for word in tokens if word not in stopword_set]

    # Lemmatization
    if lemmatization:
        lemmatize = _wordnet_lemmatizer().lemmatize
        tokens = [lemmatize(word) for word in tokens]

    # Updating progress bar (read-only use of the global, so no ``global``
    # statement is needed).
    progress_bar.update(n=1)

    # Making sentence
    return ' '.join(tokens).strip()

In [None]:
def X_Train_Feature_Engineering():
    """Clean the training bodies in place: stop words removed, no lemmatization."""

    def _clean(text):
        return Feature_Engineering(txt=text, stopwords=True, lemmatization=False)

    # Item assignment mutates the existing frame, so the module-level
    # X_train is updated without rebinding the name.
    X_train['body'] = X_train['body'].apply(_clean)

In [None]:
def X_Test_Feature_Engineering():
    """Clean the test bodies in place: stop words removed, no lemmatization."""

    def _clean(text):
        return Feature_Engineering(txt=text, stopwords=True, lemmatization=False)

    # Item assignment mutates the existing frame, so the module-level
    # X_test is updated without rebinding the name.
    X_test['body'] = X_test['body'].apply(_clean)

In [None]:
# import threading

In [None]:
# X_Train_Feature_Engineering_Thread = threading.Thread(target=X_Train_Feature_Engineering)
# X_Test_Feature_Engineering_Thread = threading.Thread(target=X_Test_Feature_Engineering)

In [None]:
# Clean the training split, then the test split; every processed row advances
# the shared progress bar by one step.
X_Train_Feature_Engineering()
X_Test_Feature_Engineering()

In [None]:
# X_Train_Feature_Engineering_Thread.start()
# X_Test_Feature_Engineering_Thread.start()

In [None]:
# X_Train_Feature_Engineering_Thread.join()
# X_Test_Feature_Engineering_Thread.join()

In [None]:
# del total_row, progress_bar, X_Train_Feature_Engineering_Thread, X_Test_Feature_Engineering_Thread
# Both splits are cleaned, so the progress-bar globals are no longer needed.
del total_row, progress_bar

## Creating Extra Useful Columns

### Train

In [None]:
# Number of whitespace-separated tokens in each training body.
X_train['word_count'] = X_train["body"].map(lambda text: len(str(text).split()))

In [None]:
# Number of non-whitespace characters in each training body.
X_train['char_count'] = X_train["body"].map(
    lambda text: len("".join(str(text).split()))
)

In [None]:
# Mean token length per training body.
X_train['avg_word_length'] = X_train['char_count'].div(X_train['word_count'])
# No sentence count is computed, so this column simply mirrors word_count.
X_train['avg_sentence_length'] = X_train['word_count']

### Test

In [None]:
# Number of whitespace-separated tokens in each test body.
X_test['word_count'] = X_test["body"].map(lambda text: len(str(text).split()))

In [None]:
# Number of non-whitespace characters in each test body.
X_test['char_count'] = X_test["body"].map(
    lambda text: len("".join(str(text).split()))
)

In [None]:
# Mean token length per test body.
X_test['avg_word_length'] = X_test['char_count'].div(X_test['word_count'])
# No sentence count is computed, so this column simply mirrors word_count.
X_test['avg_sentence_length'] = X_test['word_count']

## Keeping copy of the processed dataset

In [None]:
# Snapshot the cleaned frames: X_train / X_test are later overwritten with
# the vectorizer output, while these copies keep the text and labels.
X_train_ORIG = X_train.copy()
y_train_ORIG = y_train.copy()
X_test_ORIG = X_test.copy()
y_test_ORIG = y_test.copy()

In [None]:
# Features and target side by side, aligned on the existing row index.
df_train = pd.concat((X_train_ORIG, y_train_ORIG), axis="columns")

In [None]:
# Features and target side by side, aligned on the existing row index.
df_test = pd.concat((X_test_ORIG, y_test_ORIG), axis="columns")

In [None]:
# Persist both processed splits into one workbook, one sheet per split.
with pd.ExcelWriter('processed_data.xlsx') as writer:
    for sheet, frame in (('Train', df_train), ('Test', df_test)):
        frame.to_excel(
            excel_writer=writer,
            sheet_name=sheet,
            header=True,
            index=False,
        )

## Plotting Count of Targets

In [None]:
# Bar chart of how many training rows carry each target label.
# The figure is created first, then the seaborn theme is applied, then the
# bars are drawn onto the current figure -- keep this order.
plt.figure(figsize=(6, 4))
sns.set(
    style='whitegrid',
    color_codes=True,
)
sns.countplot(
    x='target',
    data=df_train,
    hue='target'
);

## Vectorizing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# First-pass TF-IDF: English stop words removed, 1- to 4-grams, capped at
# 5000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 4)
)

In [None]:
# Learn vocabulary and IDF weights from the training bodies only.
vectorizer.fit(X_train.body)

In [None]:
# NOTE: rebinds X_train from the DataFrame to the TF-IDF matrix, so this cell
# cannot be re-run without re-creating the frame (X_train_ORIG keeps the text).
X_train = vectorizer.transform(X_train.body)

In [None]:
# X_test = vectorizer.transform(X_test.body)

## Keeping copy of vocabularies

In [None]:
# Tabulate the fitted vocabulary: one row per term with its column index.
vocabulary = pd.DataFrame(
    list(vectorizer.vocabulary_.items()),
    columns=['word', 'id'],
)

In [None]:
# Save the first-pass vocabulary table for inspection.
vocabulary.to_excel("vocabulary_data.xlsx", header=True, index=False)

## Selecting Best Features

In [None]:
from sklearn import feature_selection

In [None]:
def Select_Best_Features(y: pd.core.series.Series, p_value_limit: float, feature_selector: str):
    """Return the vectorizer features whose score ``1 - p`` exceeds a limit.

    The statistical test is run on the module-level ``X_train`` (expected to
    be the matrix produced by the module-level ``vectorizer``) against ``y``.

    Parameters
    ----------
    y : pandas.Series
        Target label for every row of ``X_train``.
    p_value_limit : float
        Lower bound (exclusive) on ``1 - p``; e.g. ``0.95`` keeps features
        with ``p < 0.05``.
    feature_selector : str
        ``'chi2'`` or ``'anova'`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``score`` for the retained features.

    Raises
    ------
    ValueError
        If ``feature_selector`` names an unsupported test.
    """
    scorers = {
        'chi2': feature_selection.chi2,
        'anova': feature_selection.f_classif,
    }
    selector_name = feature_selector.lower()
    if selector_name not in scorers:
        # Previously this fell through and failed later on an unbound ``p``.
        raise ValueError(
            "feature_selector must be one of {}, got {!r}".format(
                sorted(scorers), feature_selector
            )
        )
    _, p = scorers[selector_name](X_train, y)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Built directly: DataFrame.append no longer exists in pandas >= 2.0.
    X_features = pd.DataFrame(
        {
            'feature': feature_names,
            'score': 1 - p,
        }
    )

    return X_features[X_features['score'] > p_value_limit]

In [None]:
# Keep the features whose ANOVA score (1 - p) is above 0.95.
X_features = Select_Best_Features(
    y=y_train['target'],
    p_value_limit=0.95,
    feature_selector='anova'
)

In [None]:
# Save the retained features together with their scores.
X_features.to_excel("processed_vocabulary_data.xlsx", header=True, index=False)

## Final Selected Features

In [None]:
# Rebuild the vectorizer restricted to the statistically selected features.
# stop_words and ngram_range must repeat the settings under which those
# features were extracted: with the default ngram_range=(1, 1) the analyzer
# emits unigrams only, so every selected multi-word feature would end up as
# an all-zero column.
vectorizer = TfidfVectorizer(
    vocabulary=X_features["feature"].unique().tolist(),
    stop_words='english',
    ngram_range=(1, 4),
)

In [None]:
# Fit IDF weights for the fixed vocabulary on the preserved training text.
vectorizer.fit(X_train_ORIG.body)

In [None]:
# Final training matrix, rebuilt from the preserved text copy.
X_train = vectorizer.transform(X_train_ORIG.body)

In [None]:
# Test matrix, produced with the vectorizer fitted on the training text.
X_test = vectorizer.transform(X_test_ORIG.body)

## Keeping copy of vocabularies

In [None]:
# Tabulate the final vocabulary: one row per term with its column index.
vocabulary = pd.DataFrame(
    list(vectorizer.vocabulary_.items()),
    columns=['word', 'id'],
)

In [None]:
# Save the final (selected-feature) vocabulary table for inspection.
vocabulary.to_excel("vocabulary_data_2.xlsx", header=True, index=False)

## Classifier

In [None]:
from xgboost import XGBClassifier
from sklearn import metrics, naive_bayes, pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Gradient-boosted tree classifier: 5000 trees of depth <= 5 at learning
# rate 0.01, with a fixed seed.
# NOTE(review): n_jobs=8 is hard-coded -- confirm it suits the target machine.
classifier = XGBClassifier(
    booster = 'gbtree',
    max_depth=5,
    n_estimators=5000,
    learning_rate=0.01,
    importance_type='gain',
    random_state= 0,
    n_jobs=8
)

In [None]:
# Train on the final TF-IDF matrix and the preserved training labels.
classifier.fit(X_train, y_train_ORIG.target)

In [None]:
# Predicted label for every test document.
prediction = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Raw confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test_ORIG.target, y_pred=prediction)
print(cm)

In [None]:
# Imported under an alias so it does not shadow sklearn.metrics.confusion_matrix,
# which would break a re-run of the cell that prints the raw matrix.
from yellowbrick.classifier import confusion_matrix as yb_confusion_matrix

plt.figure(figsize=(6, 5))
visualizer = yb_confusion_matrix(
    classifier,
    X_train, y_train_ORIG.target, X_test, y_test_ORIG.target,
    # Display names follow the sorted label order: real is encoded as 0 and
    # fake as 1, so 'Real' must come first.
    classes=['Real', 'Fake'],
    cmap='PuBu',
    is_fitted=True
)
visualizer.show();

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Pass the label column itself (not the one-column DataFrame), consistent
# with the other metric calls in this notebook.
accuracy = accuracy_score(y_test_ORIG.target, prediction)
print(f"Accuracy: {round(accuracy * 100, 2)}%")

In [None]:
# Per-class precision / recall / F1 on the test split.
# target_names follow the sorted label order: real is encoded as 0 and fake
# as 1, so 'Real' must come first -- the previous ['Fake', 'Real'] order
# attributed each class's metrics to the other one.
print(
    classification_report(
        y_true=y_test_ORIG.target,
        y_pred=prediction,
        target_names=['Real', 'Fake'],
        zero_division='warn',
        digits=5
    )
)