# EDA (Exploratory data analysis)
## For "Fake News" dataset
### Developed by: Sebastián Marroquín

***

### Tasks to do

1. Import the necessary libraries
2. Load the dataset
    - Verify that the data in the dataset is suitable for analysis; if it is not, correct it.
3. Obtain and visualize the different types of news contained within the data set
    - Graph these types of news
4. WordCloud
5. Dataset visualization

***

### 1. Import libraries

In [None]:
# Import the libraries
import warnings
# The filter is installed before the remaining imports run, so it also
# applies to any warnings raised while those libraries load.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as  np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
import nltk
# Fetch the NLTK resources used further down: the 'punkt' tokenizer data
# (for nltk.word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
import re
# Use seaborn's "darkgrid" style for the plots in this notebook.
sns.set(style="darkgrid")

### 2. Load the datasets

In [None]:
# Read the two source files: 'True.csv' into realDf and 'Fake.csv' into fakeDf.
realDf, fakeDf = (pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv'))

#### Head of Real & Fake News

In [None]:
realDf.head()

In [None]:
fakeDf.head()

#### Info about the dataframes

In [None]:
realDf.info()

In [None]:
fakeDf.info()

#### We will add the target value that each data set represents.

In [None]:
# Label every row with its class: 1 for the real-news frame, 0 for the
# fake-news frame.
realDf['target'] = 1
fakeDf['target'] = 0

#### Show the tail of each dataset

In [None]:
realDf.tail()

In [None]:
fakeDf.tail()

### 3. WordCloud

#### Fake news titles in WordCloud

In [None]:
# Word cloud built from every fake-news title.
text = fakeDf.title
# Join the actual titles into one string. str(text) would only produce the
# Series' truncated printout (a few head/tail rows, "..." and the
# "Name: title, dtype: object" footer), so the cloud would reflect a handful
# of clipped titles instead of the whole column.
titles_corpus = ' '.join(text.dropna().astype(str))
wordcloud = WordCloud(
    width=2000,
    height=1000,
    background_color='black',
    stopwords=STOPWORDS).generate(titles_corpus)
fig = plt.figure(
    figsize=(30, 20),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# White text keeps the title readable on the black figure background.
plt.title('Fake News Titles', color='white')
plt.tight_layout(pad=0)
plt.show()

#### Real news titles in WordCloud

In [None]:
# Word cloud built from every real-news title.
text = realDf.title
# Join the actual titles into one string. str(text) would only produce the
# Series' truncated printout (a few head/tail rows, "..." and the
# "Name: title, dtype: object" footer), so the cloud would reflect a handful
# of clipped titles instead of the whole column.
titles_corpus = ' '.join(text.dropna().astype(str))
wordcloud = WordCloud(
    width=2000,
    height=1000,
    background_color='black',
    stopwords=STOPWORDS).generate(titles_corpus)
fig = plt.figure(
    figsize=(30, 20),
    facecolor='k',
    edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# White text keeps the title readable on the black figure background.
plt.title('Real News Titles', color='white')
plt.tight_layout(pad=0)
plt.show()

We will combine both datasets into a single DataFrame so that every news item, real or fake, carries its target label.

In [None]:
# Stack the real and fake rows into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of keeping each source frame's own.
mergeData = pd.concat([realDf, fakeDf], ignore_index=True, sort=False)

In [None]:
# Show the tail of the merged data
mergeData.tail()

### 3. Data visualization

#### News distribution by target variable (fake or real)

In [None]:
# Bar chart of row counts per target value; the tick labels are replaced
# with 'fake' and 'real'.
axis = sns.countplot(data=mergeData, x='target', palette='Set2')
axis.set_xticklabels(['fake', 'real'])
axis.set_title("Data distribution of Fake & Real New's")

#### News distribution by type of news

In [None]:
# 20x10 figure holding one bar group per subject, split by target value.
plt.figure(figsize=(20, 10))
axis = sns.countplot(data=mergeData, x='subject', hue='target', palette='Set2')
axis.set_title("Data distribution of Fake & Real New's by Subject")

#### Count for subject

In [None]:
mergeData.subject.value_counts()

### 4. Data cleaning

In [None]:
def clean_train_data(x):
    """Normalise one article's raw text for token-level analysis.

    Steps, in order: lower-case the text, drop ``[...]`` bracketed spans,
    strip punctuation, drop tokens containing a digit, drop ``http...``
    runs, and turn newlines into spaces.

    Args:
        x: Raw text. Any value that is not a ``str`` (for example the NaN
            pandas stores for a missing cell) is treated as empty text.

    Returns:
        The cleaned, lower-cased string.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(x, str):
        return ''
    text = x.lower()
    # remove square brackets and whatever they enclose
    text = re.sub(r'\[.*?\]', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # remove what is left of URLs (punctuation is already gone, so a URL is
    # now a single "http..." token)
    text = re.sub(r'http\S+', '', text)
    # Newlines become spaces: deleting them outright would glue the last
    # word of one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:
# Clean the article text on a copy so mergeData keeps the raw text.
cleanMergeData = mergeData.copy()
cleanMergeData['text'] = cleanMergeData['text'].apply(clean_train_data)

In [None]:
cleanMergeData.head()

In [None]:
cleanMergeData.tail()

### 5. Remove the stopwords

In [None]:
# English stop-word list from NLTK's 'stopwords' corpus.
enStopWords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_eng_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``nltk.word_tokenize``; tokens found in
    ``enStopWords`` are dropped and the rest are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by one space each.
    """
    # Build a set once per call: each token is then a hash lookup instead of
    # a linear scan through the stop-word list.
    stop_words = set(enStopWords)
    kept_tokens = [
        word for word in nltk.word_tokenize(text) if word not in stop_words
    ]
    return ' '.join(kept_tokens)

In [None]:
# Strip stop words on a copy so cleanMergeData keeps the cleaned text.
stopWordMergeData = cleanMergeData.copy()
stopWordMergeData['text'] = stopWordMergeData['text'].apply(remove_eng_stopwords)

In [None]:
stopWordMergeData.head()

### 6. Most common words