In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import os


from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
# Import true and fake data

In [3]:
# Load the two CSV files of the dataset into separate frames.
# index_col=False tells pandas not to use the first column as the row index.
true = pd.read_csv('../input/fake-and-real-news-dataset/True.csv', index_col=False)
fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv', index_col=False)

In [4]:
# Some info about dataframes:

In [5]:
# Preview the first rows of the frame loaded from True.csv.
true.head()

In [6]:
# Preview the first rows of the frame loaded from Fake.csv.
fake.head()

In [7]:
# Summarize the columns, dtypes and non-null counts of the True.csv frame.
true.info()

In [8]:
# Summarize the columns, dtypes and non-null counts of the Fake.csv frame.
fake.info()

In [9]:
# As we can see, we have roughly 50k news articles in total, split between
# fake and real news.
# Let's check the dataframes for null values:

In [10]:
# Count the missing values in each column of the True.csv frame.
true.isna().sum()

In [11]:
# Count the missing values in each column of the Fake.csv frame.
fake.isna().sum()

In [12]:
# Okay, now we can concatenate the two dataframes, but before we do, we need
# to label their rows with 0 and 1 (true / fake).

In [13]:
# Label every row loaded from True.csv with class 0 (true news).
true['target'] = 0

In [14]:
# Preview the frame again to confirm the new `target` column.
true.head()

In [15]:
# Label every row loaded from Fake.csv with class 1 (fake news).
fake['target'] = 1

In [16]:
# Preview the frame again to confirm the new `target` column.
fake.head()

In [17]:
# let's concatenate our data

In [18]:
# The true-news rows come first, followed by the fake-news rows.
df = [true, fake]

# ignore_index=True drops the per-file row labels and renumbers the combined
# rows from 0, so every row of `data` has a unique integer label.
data = pd.concat(df, ignore_index=True)

In [19]:
# words count
def words_count(df):
    """Return the number of whitespace-separated tokens in ``df``.

    The value is converted with ``str()`` first, so non-string inputs are
    counted by their string representation.
    """
    tokens = str(df).split()
    return len(tokens)

# characters count

def char_count(df):
    """Return the number of non-whitespace characters in the string ``df``.

    The text is split on whitespace and the lengths of the resulting pieces
    are added up, so spaces, tabs and newlines do not contribute.
    """
    return sum(len(piece) for piece in df.split())

# hashtags count

def hashtag_count(df):
    """Return how many whitespace-separated tokens of ``df`` start with '#'."""
    total = 0
    for token in df.split():
        if token.startswith('#'):
            total += 1
    return total

# email count

def email_count(df):
    """Return how many whitespace-separated tokens of ``df`` start with '@'.

    Note: only tokens that *begin* with '@' are counted. A token such as
    ``user@example.com`` does not start with '@' and is therefore not counted.
    """
    total = 0
    for token in df.split():
        if token.startswith('@'):
            total += 1
    return total

# digits count

def digits_count(df):
    """Return the number of digit runs found in the string ``df``.

    A run is a maximal sequence of consecutive ASCII digits, so ``"a1 22"``
    contains two runs. The function returns an integer count, like the other
    ``*_count`` helpers, rather than the list of matched substrings.
    """
    return len(re.findall(r'[0-9]+', df))


In [20]:
def get_features(df):
    """Add text-statistics columns computed from ``df['text']`` and return ``df``.

    Each new column holds the result of applying one helper to every value
    of the ``text`` column. The frame is modified in place and the same
    object is returned.
    """
    # (new column name, helper applied to each text value), in insertion order
    extractors = (
        ('words_count', words_count),
        ('char_count', char_count),
        ('hashtags_count', hashtag_count),
        ('email_counts', email_count),
        ('digits_count', digits_count),
    )
    for column, extractor in extractors:
        df[column] = df['text'].apply(extractor)

    return df

In [21]:
# Add the text-statistics columns; get_features writes them onto the frame it
# receives and returns that same frame.
data = get_features(data)

In [22]:
# Preview the combined frame together with the new feature columns.
data.head()

# EDA

In [23]:
# Global plotting defaults: ggplot look, 8x4 inch figures rendered at 120 dpi.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': [8,4],
    'figure.dpi': 120,
})

In [24]:
# Value count on sns.countplot

# Number of rows per class. The column is passed by keyword: in recent
# seaborn releases the first positional parameter of countplot is `data`,
# so a positional column name would clash with the `data=` keyword.
sns.countplot(x='target', data=data)
plt.title('Counts')

In [25]:
# One bar per class for `words_count`; labels are set on the grid's single axes.
words_grid = sns.catplot(data=data, x='target', y='words_count', kind='bar')
words_grid.ax.set_xlabel('0 - True news  ||  Fake news - 1')
words_grid.ax.set_title('Correlation between number of words and target')

In [26]:
# As we can see, fake news articles have a higher word count.

In [27]:
# One bar per class for `hashtags_count`; labels are set on the grid's single axes.
hashtags_grid = sns.catplot(data=data, x='target', y='hashtags_count', kind='bar')
hashtags_grid.ax.set_xlabel('0 - True news  ||  Fake news - 1')
hashtags_grid.ax.set_title('Correlation between hashtags and target')

In [28]:
# One bar per class for `email_counts`; labels are set on the grid's single axes.
emails_grid = sns.catplot(data=data, x='target', y='email_counts', kind='bar')
emails_grid.ax.set_xlabel('0 - True news  ||  Fake news - 1')
emails_grid.ax.set_title('Correlation between email and target')

In [29]:
# Default size for figures created from here on through pyplot.
plt.rcParams['figure.figsize'] = [12,8]

# catplot is a figure-level function: it sizes its own figure from `height`
# and `aspect` and ignores rcParams['figure.figsize']. The intended 12x8 inch
# plot therefore has to be requested explicitly (width = height * aspect).
sns.catplot(x='target', y='words_count', kind='bar', data=data, hue='subject',
            height=8, aspect=1.5)
plt.xlabel('0 - True news  ||  Fake news - 1')

# Data Cleaning

In [30]:
# Show the raw text of the row labelled 1 as a sample to clean.
data.loc[1, 'text']

In [31]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

# One Porter stemmer instance, created once here and reused by nltk_process.
p_stemming = PorterStemmer()

In [32]:
# Filled on first use so the stop-word list and the lemmatizer are built once
# instead of once per document.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on the first call."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words("english"))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    return all(char in string.punctuation for char in token)


def nltk_process(data):
    """Tokenize a text and return its cleaned, normalized tokens.

    Steps, applied to each token produced by ``word_tokenize``:

    1. drop tokens made only of punctuation characters (this also covers
       multi-character tokens such as ``...`` or ``--``);
    2. drop English stop-words, compared on the lower-cased original token so
       that stemming cannot hide them (e.g. "this" would otherwise be stemmed
       to "thi" and slip past the stop-word list);
    3. stem with the shared Porter stemmer, then lemmatize with WordNet;
    4. drop the result if the normalized form is itself a stop-word.

    Parameters
    ----------
    data : str
        Raw text of one document.

    Returns
    -------
    list of str
        Normalized tokens in their original order.
    """
    stop_words, lemmatizer = _nltk_resources()

    processed = []
    for token in word_tokenize(data):
        # Filter on the original token; a new list is built instead of
        # removing items from a list that is being iterated.
        if _is_punctuation(token) or token.lower() in stop_words:
            continue

        normalized = lemmatizer.lemmatize(p_stemming.stem(token))
        if normalized not in stop_words:
            processed.append(normalized)

    return processed

In [33]:
# Sample text of the row labelled 1, shown before the cleaning step is applied.
data.loc[1, 'text']

In [34]:
%%time
# Replace every text with the token list returned by nltk_process.
data['text'] = data['text'].apply(nltk_process)

In [35]:
# Same sample row after cleaning.
data.loc[1, 'text']

In [36]:
# Check which Python type the cleaned sample has.
type(data.loc[1, 'text'])

In [37]:
# Turn each row's sequence of tokens back into one space-separated string.
data['text'] = data['text'].map(" ".join)

In [38]:
# Same sample row after the tokens were joined back into a string.
data.loc[1, 'text']

# TF_IDF

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [40]:
# lowercase=False: the vectorizer does not lower-case the text itself.
# stop_words='english': scikit-learn's built-in English stop-word list is
# removed during vectorization.
tfidf = TfidfVectorizer(lowercase=False, stop_words='english')

In [41]:
# Build the TF-IDF matrix for the whole corpus and take the labels.
# NOTE(review): fit_transform runs on every row here, so the vocabulary and
# IDF weights of `X` are learned from all documents, not from a training
# subset only.
text = data['text']
X = tfidf.fit_transform(text)
y = data['target']

In [42]:
# Preview the frame; `text` now holds the cleaned, re-joined strings.
data.head()

In [43]:
# Split the raw documents first, then fit TF-IDF on the training part only.
# Fitting the vectorizer on the full corpus would let the vocabulary and IDF
# weights see the test documents and make the test score optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], data['target'], test_size=0.2, random_state=777
)
X_train = tfidf.fit_transform(text_train)
# The test documents are only transformed with the statistics learned above.
X_test = tfidf.transform(text_test)

In [44]:
# Sanity check: each feature matrix must have as many rows as its label vector.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Support Vector Machine

In [45]:
from sklearn.svm import LinearSVC

In [46]:
# Fix the seed so repeated runs train the same model; the value matches the
# seed used for the train/test split.
clf_svc = LinearSVC(random_state=777)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)

# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, y_pred))

In [47]:
# Print the confusion matrix of the test-set predictions.
print(confusion_matrix(y_test, y_pred))

In [48]:
# Score the fitted classifier on the held-out test set.
clf_svc.score(X_test, y_test)

In [49]:
# Accuracy on the test set is close to 100%!
# NOTE: a score this high should be checked for data leakage before it is trusted.