In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import re
import nltk
import string
import spacy
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from numpy.random import seed
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from xgboost import XGBClassifier

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_columns', None)

In [None]:
!python -m spacy download en_vectors_web_lg
!python -m spacy link en_vectors_web_lg en_vectors_web_lg_link

In [None]:
nlp = spacy.load('en_vectors_web_lg_link')

In [None]:
# Load both halves of the dataset and tag every row with its source label.
true_data = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')
true_data['class'] = 'True'

fake_data = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
fake_data['class'] = 'Fake'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Shuffle with a fixed seed, then renumber the rows 0..n-1 so that
# label-based and positional indexing agree in the cells below.
df = (
    pd.concat([true_data, fake_data])
    .sample(frac = 1, random_state = 1)
    .reset_index(drop = True)
)
df

# Text preparation

In [None]:
def del_punct(text):
    """Return ``text`` with every ASCII punctuation character turned into a space.

    Punctuation is replaced rather than removed so that words separated only
    by punctuation (``"a,b"``) do not get glued together.
    """
    return ''.join(
        ' ' if symbol in string.punctuation else symbol
        for symbol in text
    )

def text_preparation(text: str) -> str:
    """Normalise one article (or title) into a space-separated string of lemmas.

    Steps, in order:
      1. strip a leading dateline such as ``"WASHINGTON (Reuters) - "``;
      2. lower-case and replace punctuation with spaces;
      3. tokenise with the module-level spaCy pipeline ``nlp``, drop stop
         words and keep each remaining token's lemma;
      4. replace every run of digits with the placeholder ``somenumbers``;
      5. collapse whitespace runs into single spaces.

    Args:
        text: raw article text or title.

    Returns:
        The cleaned text. It may start or end with a single space.
    """
    # The dateline may contain several words or separators ("NEW YORK",
    # "WASHINGTON/NEW YORK"), so accept any short paren-free prefix rather
    # than a single \w+ word. Leaving it in would keep the agency name as a
    # feature that gives away the label.
    text = re.sub(r'^[^()]{1,80}? \(\w+\) - ', '', text)
    text = del_punct(text.lower())

    stop_words = nlp.Defaults.stop_words  # looked up once, not per token
    text = ' '.join(
        token.lemma_
        for token in nlp(text)
        if token.text not in stop_words
    )

    text = re.sub(r'\d+', ' somenumbers ', text)
    text = re.sub(r'\s+', ' ', text)

    return text
     
def processing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with the derived modelling columns.

    Added columns: ``class_`` (1 for 'Fake', 0 for 'True'), ``len title`` and
    ``len text`` (character counts of the raw fields), ``text cleaned`` and
    ``title cleaned`` (output of ``text_preparation``), and ``cleaned all``
    (cleaned text and cleaned title joined by one space). The input frame is
    left unmodified.
    """
    label_codes = {'Fake': 1, 'True': 0}
    enriched = df.copy()

    enriched['class_'] = enriched['class'].apply(lambda label: label_codes[label])

    for field in ('title', 'text'):
        enriched[f'len {field}'] = enriched[field].apply(len)

    # Text first, then title: the same order the columns are expected in.
    for field in ('text', 'title'):
        enriched[f'{field} cleaned'] = enriched[field].apply(text_preparation)

    enriched['cleaned all'] = enriched['text cleaned'] + ' ' + enriched['title cleaned']
    return enriched

df_d = processing(df)
df_d.head()

# Bag of words

In [None]:
bow_transformer = CountVectorizer(max_features = 1500).fit(df_d['cleaned all'])
bow_df = bow_transformer.transform(df_d['cleaned all'])
bow_df.shape

# EDA

In [None]:
def print_article(line):
    """Pretty-print one article row: bold title, subject tag, class and body.

    ``line`` is any mapping-like row exposing the keys ``title``, ``subject``,
    ``class`` and ``text``. Sections are separated by a blank line and the
    article is followed by two blank lines.
    """
    sections = [
        f'\033[1m{line["title"]} \033[0m',  # ANSI bold on / reset
        f"tag: {line['subject']}",
        f"class: {line['class']}",
        line['text'],
    ]
    print(*sections, sep = '\n\n', end = '\n\n\n')
    

print_article(df.iloc[0])
print_article(df.iloc[1])

In [None]:
pd.DataFrame(df['class'].value_counts())

In [None]:
# .to_frame('#') names the count column explicitly. Wrapping the Series in
# pd.DataFrame(..., columns=['#']) breaks on pandas versions where
# value_counts() returns a named Series.
df[['subject', 'class']].value_counts().to_frame('#')

#### Length of title and text

In [None]:
g = sns.FacetGrid(df_d, hue='class', height = 7, aspect = 2)
g.map(sns.kdeplot, 'len title')
plt.title('Length of title distribution')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df_d, hue='class', height = 7, aspect = 2)
g.map(sns.kdeplot, 'len text')
plt.title('Length of text distribution')
plt.legend()
plt.show()

Fake news tends to have longer titles, but there is not much difference in text length.

#### Name of media

In [None]:
def name_of_media(text):
    """Extract the agency named in a leading ``"CITY (Agency) - "`` dateline.

    Only the first 100 characters of ``text`` are inspected. Returns the first
    word found between parentheses that is followed by ``" - "``, or the
    string ``'Not provided'`` when there is no such dateline.
    """
    found = re.search(r'(?<= \()\w+(?=\) - )', text[:100])
    return found.group(0) if found else 'Not provided'

name_of_media(df['text'][0])

In [None]:
df['Name of media'] = df['text'].apply(name_of_media)

In [None]:
# .to_frame('#') names the count column explicitly. Wrapping the Series in
# pd.DataFrame(..., columns=['#']) breaks on pandas versions where
# value_counts() returns a named Series.
df[['Name of media', 'class']].value_counts().to_frame('#')

#### News topics discovery through wordcloud

In [None]:
def text_for_cloud(label, column = 'text cleaned'):
    """Concatenate all articles of one class into a single string for WordCloud.

    Args:
        label: value of ``df_d['class']`` to select ('True' or 'Fake').
        column: column of ``df_d`` to take the text from. Defaults to the
            cleaned text, because the ``somenumbers`` placeholder removed
            below only exists there; on the raw ``text`` column that replace
            is a no-op.

    Returns:
        The selected texts joined by single spaces, with the digit
        placeholder removed.
    """
    selected = df_d.loc[df_d['class'] == label, column].to_list()
    return ' '.join(selected).replace('somenumbers', '')

for label in ['True', 'Fake']:
    wordcloud = WordCloud(
        max_font_size=500,
        max_words=1000,
        background_color="white",
    ).generate(text_for_cloud(label))

    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'{label} news wordcloud', fontsize=20)
    plt.show()

#### News topics discovery through Latent Dirichlet Allocation

We'll split the news dataset into 12 topics. The number of topics was chosen arbitrarily.

In [None]:
LDA = LatentDirichletAllocation(
    n_components = 12,
    random_state = 1,
    n_jobs = -1
)
LDA.fit(bow_df)

In [None]:
# Resolve the vocabulary once instead of rebuilding it for every word lookup.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older installations.
vocabulary = (
    getattr(bow_transformer, 'get_feature_names_out', None)
    or bow_transformer.get_feature_names
)()

for index, topic in enumerate(LDA.components_):
    print(f'The top 10 words for topic #{index}')
    # argsort is ascending, so the last 10 positions hold the heaviest words.
    print([vocabulary[i] for i in topic.argsort()[-10:]])
    print('\n')

Groups summary (my subjective assessment):

In [None]:
# Hand-assigned label for each LDA topic index (subjective reading of the
# top words printed above). A leading '~' marks a loose fit and '-' marks a
# topic with no clear theme.
groups_summary = {
    0:'~ E-mail scandal',
    1:'Federal Government',
    2:'Election',
    3:'International Security',
    4:'EU/UK related',
    5:'Clinton vs. Trump election',
    6:'US President administration',
    7:'Law',
    8:'-',
    9:'US Domestic policy',
    10:'~ Finance',
    # Previously spelled with a Cyrillic 'С' (U+0421), which sorts and
    # compares differently from the Latin letter.
    11:'Crime',
}

In [None]:
topic_results = LDA.transform(bow_df)

In [None]:
df_d['Topic N'] = topic_results.argmax(axis=1)

In [None]:
df_d['Topic'] = df_d['Topic N'].apply(lambda x: groups_summary[x])

In [None]:
n = 224
print(f"Topic: {df_d['Topic'][n]}", end = '\n\n')
print_article(df.iloc[n])

In [None]:
# .to_frame('#') names the count column explicitly. Wrapping the Series in
# pd.DataFrame(..., columns=['#']) breaks on pandas versions where
# value_counts() returns a named Series.
(
    df_d[['Topic', 'class']]
    .value_counts()
    .to_frame('#')
    .sort_values(by=['Topic'])
)

# Sentiment Analysis

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
sid_df_title = pd.DataFrame([sid.polarity_scores(article) for article in df['title']])
sid_df_text = pd.DataFrame([sid.polarity_scores(article) for article in df['text']])

In [None]:
sid_df_title['class'] = df['class']
sid_df_text['class'] = df['class']
sid_df_title.head()

In [None]:
g = sns.FacetGrid(sid_df_title, hue='class', height = 7, aspect = 2)
g.map(sns.kdeplot, 'compound')
plt.title('Distribution of sentiment score for title')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(sid_df_text, hue='class', height = 7, aspect = 2)
g.map(sns.kdeplot, 'compound')
plt.title('Distribution of sentiment score for text')
plt.legend()
plt.show()

We can see here that fake news tends to have more extreme sentiment scores, especially in the title. This means that fake news titles tend to be more emotionally charged. Fake news text is also more often negative in tone.

# Modeling

#### Splits

In [None]:
df_else, df_validation  = train_test_split(df_d,
                                test_size=0.25,
                                random_state = 101)

In [None]:
df_train, df_test  = train_test_split(df_else,
                                test_size=0.25,
                                random_state = 101)

In [None]:
datasets = {
    'train'        : df_train, 
    'test'         : df_test, 
    'train + test' : df_else, 
    'validation'   : df_validation
}

for dataset_name, dataset in datasets.items():
    print('\n' + dataset_name + ':')
    display(pd.DataFrame(dataset['class'].value_counts()))

#### Bag of words

In [None]:
bow_transformer = CountVectorizer(max_features = 1500).fit(df_train['cleaned all'])
bow_train = bow_transformer.transform(df_train['cleaned all'])
bow_train.shape

In [None]:
tfidf_transformer = TfidfTransformer().fit(bow_train)
train_tfidf = tfidf_transformer.transform(bow_train)

bow_test = bow_transformer.transform(df_test['cleaned all'])
test_tfidf = tfidf_transformer.transform(bow_test)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations. Resolve the vocabulary once and
# reuse it for both frames.
tfidf_feature_names = (
    getattr(bow_transformer, 'get_feature_names_out', None)
    or bow_transformer.get_feature_names
)()

X_train_tfidf = pd.DataFrame.sparse.from_spmatrix(train_tfidf)
X_train_tfidf.columns = tfidf_feature_names
X_test_tfidf = pd.DataFrame.sparse.from_spmatrix(test_tfidf)
X_test_tfidf.columns = tfidf_feature_names

y_train = df_train['class_']
y_test = df_test['class_']

In [None]:
X_train_tfidf.iloc[1:5,1:20]

#### Modeling on BoW

In [None]:
def feature_importance(model, X):
    """Display the 20 most important features of a fitted model.

    Args:
        model: object exposing ``feature_importances_``, one value per column
            of ``X``.
        X: DataFrame whose column names label the importances.
    """
    ranking = pd.DataFrame({
        'Variable'  : X.columns,
        'Importance': model.feature_importances_,
    }).sort_values('Importance', ascending=False)
    display(ranking.head(20))
    

def eval_result(model, X_test, y_test, show_feature_imp = True):
    """Print a classification report, confusion matrix and accuracy for ``model``.

    Args:
        model: fitted estimator with a ``predict`` method.
        X_test: features in whatever form ``model.predict`` accepts.
        y_test: true labels encoded as 0 ('True') / 1 ('Fake').
        show_feature_imp: when True and the model exposes
            ``feature_importances_``, also display the top features. Pass
            False when ``X_test`` is not a DataFrame with ``columns``.
    """
    # Pin the label order so the report rows and the 2x2 matrix keep their
    # shape even if one class is missing from y_test or from the predictions.
    class_ids = [0, 1]
    class_names = ['True', 'Fake']

    # Warnings raised while predicting and scoring are deliberately silenced
    # to keep the notebook output readable.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pred = model.predict(X_test)
        print(classification_report(y_test, pred,
                                    labels = class_ids,
                                    target_names = class_names))
        display(pd.DataFrame(confusion_matrix(y_test, pred, labels = class_ids),
                         columns = ['Predicted True', 'Predicted Fake'],
                         index = class_names))
        print(f'Accuracy: {round(accuracy_score(y_test, pred), 5)}')

        if hasattr(model, 'feature_importances_') and show_feature_imp:
            feature_importance(model, X_test)

In [None]:
lgmodel = LogisticRegression(
    solver='lbfgs', 
    n_jobs = -1,
    random_state = 101
)
lgmodel.fit(X_train_tfidf, y_train)
eval_result(lgmodel, X_test_tfidf, y_test)

In [None]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
eval_result(nb_model, X_test_tfidf, y_test)

In [None]:
dtc = DecisionTreeClassifier(random_state = 1)
dtc.fit(X_train_tfidf, y_train)
eval_result(dtc, X_test_tfidf, y_test)

In [None]:
rfc = RandomForestClassifier(n_jobs = -1, random_state = 1)
rfc.fit(X_train_tfidf, y_train)
eval_result(rfc, X_test_tfidf, y_test)

In [None]:
xgbr = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    eval_metric = 'logloss',
    use_label_encoder = False
)
xgbr.fit(X_train_tfidf, y_train)
eval_result(xgbr, X_test_tfidf, y_test)

#### Modeling with Sentiment data

In [None]:
sid = SentimentIntensityAnalyzer()

sid_df_train_title = pd.DataFrame([sid.polarity_scores(article) for article in df_train['title']])
sid_df_train_title.columns = ['sent: title_' + col for col in sid_df_train_title.columns]
sid_df_train_text = pd.DataFrame([sid.polarity_scores(article) for article in df_train['text']])
sid_df_train_text.columns = ['sent: text_' + col for col in sid_df_train_text.columns]

sid_df_test_title = pd.DataFrame([sid.polarity_scores(article) for article in df_test['title']])
sid_df_test_title.columns = ['sent: title_' + col for col in sid_df_test_title.columns]
sid_df_test_text = pd.DataFrame([sid.polarity_scores(article) for article in df_test['text']])
sid_df_test_text.columns = ['sent: text_' + col for col in sid_df_test_text.columns]

In [None]:
X_train_sid = pd.concat([sid_df_train_title, sid_df_train_text], axis = 1)
X_test_sid = pd.concat([sid_df_test_title, sid_df_test_text], axis = 1)

In [None]:
X_train_sid.head()

In [None]:
lgmodel_sid = LogisticRegression(
    solver='lbfgs', 
    n_jobs = -1,
    random_state = 101
)
lgmodel_sid.fit(X_train_sid, y_train)
eval_result(lgmodel_sid, X_test_sid, y_test)

In [None]:
dtc_sid = DecisionTreeClassifier(random_state = 1)
dtc_sid.fit(X_train_sid, y_train)
eval_result(dtc_sid, X_test_sid, y_test)

In [None]:
rfc_sid = RandomForestClassifier(n_jobs = -1, random_state = 1)
rfc_sid.fit(X_train_sid, y_train)
eval_result(rfc_sid, X_test_sid, y_test)

In [None]:
xgbr_sid = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    eval_metric = 'logloss',
    use_label_encoder = False
)
xgbr_sid.fit(X_train_sid, y_train)
eval_result(xgbr_sid, X_test_sid, y_test)

#### Modeling with Topic modeling data

In [None]:
LDA = LatentDirichletAllocation(n_components=12,random_state=1)
LDA.fit(bow_train)

In [None]:
# Resolve the vocabulary once instead of rebuilding it for every word lookup.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older installations.
vocabulary = (
    getattr(bow_transformer, 'get_feature_names_out', None)
    or bow_transformer.get_feature_names
)()

for index, topic in enumerate(LDA.components_):
    print(f'The top 10 words for topic #{index}')
    # argsort is ascending, so the last 10 positions hold the heaviest words.
    print([vocabulary[i] for i in topic.argsort()[-10:]])
    print('\n')

Groups summary (my subjective assessment):

In [None]:
groups_summary2 = {
    0:'~ BLM protests',
    1:'Clinton vs. Trump election',
    2:'Migration Policy',
    3:'~ family related',
    4:'Law',
    5:'~ E-mail scandal',
    6:'US Election',
    7:'Middle East wars and Terrorism',
    8:'US Presidents related',
    9:'US Domestic policy',
    10:'EU Domestic policy',
    11:'Far East security',
}

In [None]:
X_train_topics = pd.DataFrame(LDA.transform(bow_train))
X_test_topics = pd.DataFrame(LDA.transform(bow_test))
print(f'train shape: {X_train_topics.shape}, test shape: {X_test_topics.shape}')

In [None]:
X_train_topics.columns = ['topic: ' + groups_summary2[x] for x in X_train_topics.columns]
X_test_topics.columns = ['topic: ' + groups_summary2[x] for x in X_test_topics.columns]

In [None]:
X_train_topics.head()

In [None]:
lgmodel_topics = LogisticRegression(
    solver='lbfgs', 
    n_jobs = -1,
    random_state = 101
)
lgmodel_topics.fit(X_train_topics, y_train)
eval_result(lgmodel_topics, X_test_topics, y_test)

In [None]:
dtc_topics = DecisionTreeClassifier(random_state = 1)
dtc_topics.fit(X_train_topics, y_train)
eval_result(dtc_topics, X_test_topics, y_test)

In [None]:
rfc_topics = RandomForestClassifier(n_jobs = -1, random_state = 1)
rfc_topics.fit(X_train_topics, y_train)
eval_result(rfc_topics, X_test_topics, y_test)

In [None]:
xgbr_topics = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    eval_metric = 'logloss',
    use_label_encoder = False
)
xgbr_topics.fit(X_train_topics, y_train)
eval_result(xgbr_topics, X_test_topics, y_test)

#### Combined model: BoW + Sentiment data + Topics data (hierarchical)

In [None]:
class CombinedModel:
    """Two-level classifier: three base models feed one general model.

    The base models (bag-of-words, sentiment, topic) are used only through
    ``predict``; this class never fits them, so they must be fitted before
    use. Their predicted labels, together with the length features, form the
    input of ``gen_model``, which is the only model trained by ``fit``.

    Inputs to ``fit`` / ``predict`` are 4-tuples
    ``(X_tfidf, X_sid, X_topics, X_len)`` with matching row order.

    NOTE(review): in this notebook the base models are fitted on the same rows
    that ``fit`` receives, so the general model sees in-sample base
    predictions. Consider out-of-fold predictions if this is to be relied on.
    """

    def __init__(self, bow_model, sent_model, topic_model, gen_model):
        self.bow_model = bow_model
        self.sent_model = sent_model
        self.topic_model = topic_model
        self.gen_model = gen_model

    def __generate_data_for_gen_model(self, X_in):
        """Build the general model's feature frame from the 4-tuple ``X_in``."""
        X_tfidf, X_sid, X_topics, X_len = X_in

        base_inputs = (
            ('bow', self.bow_model, X_tfidf),
            ('sent', self.sent_model, X_sid),
            ('topic', self.topic_model, X_topics),
        )
        parts = [
            pd.Series(model.predict(X), name = name)
            for name, model, X in base_inputs
        ]

        # The prediction Series carry a fresh 0..n-1 index and concat aligns
        # on index, so drop X_len's own row labels to join it positionally.
        # Otherwise a non-reset X_len yields NaN-padded extra rows.
        parts.append(pd.DataFrame(X_len).reset_index(drop = True))

        return pd.concat(parts, axis = 1)

    def fit(self, X_train, y_train):
        """Fit the general model on base-model predictions plus lengths.

        Keeps the generated training frame in ``X_train_general`` and, when
        the general model exposes them, mirrors its ``feature_importances_``.
        Returns ``self`` so calls can be chained.
        """
        self.X_train_general = self.__generate_data_for_gen_model(X_train)
        self.gen_model.fit(self.X_train_general, y_train)
        if hasattr(self.gen_model, 'feature_importances_'):
            self.feature_importances_ = self.gen_model.feature_importances_
        return self

    def predict(self, X_test):
        """Return the general model's predictions for the 4-tuple ``X_test``."""
        X_test_general = self.__generate_data_for_gen_model(X_test)
        return self.gen_model.predict(X_test_general)

In [None]:
# Length features, renumbered 0..n-1 so they line up with the other feature
# frames when concatenated column-wise. reset_index returns a new frame here;
# the in-place variant would operate on a slice of df_train / df_test and
# trigger SettingWithCopyWarning.
X_train_len = df_train[['len title', 'len text']].reset_index(drop = True)
X_test_len = df_test[['len title', 'len text']].reset_index(drop = True)

In [None]:
lgmodel_gen = LogisticRegression(
    solver='lbfgs', 
    n_jobs = -1,
    random_state = 101
)

comb_lg = CombinedModel(
    bow_model = xgbr,
    sent_model = rfc_sid,
    topic_model = rfc_topics,
    gen_model = lgmodel_gen
)

comb_lg.fit(
    X_train = (X_train_tfidf, X_train_sid, X_train_topics, X_train_len),
    y_train = y_train
)

eval_result(
    model = comb_lg, 
    X_test = (X_test_tfidf, X_test_sid, X_test_topics,  X_test_len), 
    y_test = y_test
)

In [None]:
dtc_gen = DecisionTreeClassifier(random_state = 1)

comb_dtc = CombinedModel(
    bow_model = xgbr,
    sent_model = rfc_sid,
    topic_model = rfc_topics,
    gen_model = dtc_gen
)

comb_dtc.fit(
    X_train = (X_train_tfidf, X_train_sid, X_train_topics, X_train_len),
    y_train = y_train
)

eval_result(
    model = comb_dtc, 
    X_test = (X_test_tfidf, X_test_sid, X_test_topics,  X_test_len), 
    y_test = y_test,
    show_feature_imp = False
)

feature_importance(model = comb_dtc, X = comb_dtc.X_train_general)

In [None]:
rfc_gen = RandomForestClassifier(n_jobs = -1, random_state = 1)

comb_rfc = CombinedModel(
    bow_model = xgbr,
    sent_model = rfc_sid,
    topic_model = rfc_topics,
    gen_model = rfc_gen
)

comb_rfc.fit(
    X_train = (X_train_tfidf, X_train_sid, X_train_topics, X_train_len),
    y_train = y_train
)

eval_result(
    model = comb_rfc, 
    X_test = (X_test_tfidf, X_test_sid, X_test_topics,  X_test_len), 
    y_test = y_test,
    show_feature_imp = False
)

feature_importance(model = comb_rfc, X = comb_rfc.X_train_general)

In [None]:
xgbr_gen = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    eval_metric = 'logloss',
    use_label_encoder = False
)

comb_xgbr = CombinedModel(
    bow_model = xgbr,
    sent_model = rfc_sid,
    topic_model = rfc_topics,
    gen_model = xgbr_gen
)

comb_xgbr.fit(
    X_train = (X_train_tfidf, X_train_sid, X_train_topics, X_train_len),
    y_train = y_train
)

eval_result(
    model = comb_xgbr, 
    X_test = (X_test_tfidf, X_test_sid, X_test_topics,  X_test_len), 
    y_test = y_test,
    show_feature_imp = False
)

feature_importance(model = comb_xgbr, X = comb_xgbr.X_train_general)

#### Combined model: BoW + Sentiment data + Topics data (flat)

In [None]:
X_train_full = pd.concat([X_train_tfidf, X_train_sid, X_train_topics, X_train_len], axis = 1)
X_test_full = pd.concat([X_test_tfidf, X_test_sid, X_test_topics,  X_test_len], axis = 1)

In [None]:
X_train_full.iloc[:5, -30:]

In [None]:
lgmodel_comb2 = LogisticRegression(
    solver='lbfgs', 
    n_jobs = -1,
    random_state = 101
)
lgmodel_comb2.fit(X_train_full, y_train)
eval_result(lgmodel_comb2, X_test_full, y_test)

In [None]:
dtc_comb2 = DecisionTreeClassifier(random_state = 1)
dtc_comb2.fit(X_train_full, y_train)
eval_result(dtc_comb2, X_test_full, y_test)

In [None]:
rfc_comb2 = RandomForestClassifier(n_jobs = -1, random_state = 1)
rfc_comb2.fit(X_train_full, y_train)
eval_result(rfc_comb2, X_test_full, y_test)

In [None]:
xgbr_comb2 = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    eval_metric = 'logloss',
    use_label_encoder = False
)
xgbr_comb2.fit(X_train_full, y_train)
eval_result(xgbr_comb2, X_test_full, y_test)

Looks like the last model is the best of all - XGBClassifier on combined data

# Validation

In [None]:
# TF-IDF: reuse the vectorizer and transformer fitted on the training split.
bow_val = bow_transformer.transform(df_validation['cleaned all'])
val_tfidf = tfidf_transformer.transform(bow_val)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations.
val_feature_names = (
    getattr(bow_transformer, 'get_feature_names_out', None)
    or bow_transformer.get_feature_names
)()

X_val_tfidf = pd.DataFrame.sparse.from_spmatrix(val_tfidf)
X_val_tfidf.columns = val_feature_names

# Target
y_val = df_validation['class_']


# Sentiment scores for title and text, prefixed like the train/test features.
sid_df_val_title = pd.DataFrame([sid.polarity_scores(article) for article in df_validation['title']])
sid_df_val_title.columns = ['sent: title_' + col for col in sid_df_val_title.columns]
sid_df_val_text = pd.DataFrame([sid.polarity_scores(article) for article in df_validation['text']])
sid_df_val_text.columns = ['sent: text_' + col for col in sid_df_val_text.columns]

X_val_sid = pd.concat([sid_df_val_title, sid_df_val_text], axis = 1)

# Topic distribution from the LDA model fitted on the training split.
X_val_topics = pd.DataFrame(LDA.transform(bow_val))
X_val_topics.columns = ['topic: ' + groups_summary2[x] for x in X_val_topics.columns]


# Length features, renumbered 0..n-1 to align with the frames above.
# A non-inplace reset avoids SettingWithCopyWarning on the slice.
X_val_len = df_validation[['len title', 'len text']].reset_index(drop = True)


# Full feature matrix, same column order as X_train_full / X_test_full.
X_val_full = pd.concat([X_val_tfidf, X_val_sid, X_val_topics,  X_val_len], axis = 1)

In [None]:
print(f'X_test_full.shape: {X_test_full.shape}')
print(f'X_val_full.shape: {X_val_full.shape}')

In [None]:
test_columns = set(X_test_full.columns)
val_columns = set(X_val_full.columns)
test_columns ^ val_columns

In [None]:
eval_result(xgbr_comb2, X_val_full, y_val)