# Data extraction

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the four FakeNewsNet tables (gossipcop / politifact, fake / real) from Drive.
# The files are read in the same order as the names on the left-hand side.
gossipcop_fake, gossipcop_real, politifact_fake, politifact_real = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/243/FakeNewsNet/dataset/gossipcop_fake.csv',
        '/content/drive/MyDrive/243/FakeNewsNet/dataset/gossipcop_real.csv',
        '/content/drive/MyDrive/243/FakeNewsNet/dataset/politifact_fake.csv',
        '/content/drive/MyDrive/243/FakeNewsNet/dataset/politifact_real.csv',
    )
]

In [None]:
# Preview the first rows of the fake gossipcop table.
gossipcop_fake.head()

In [None]:
#Prepare the URL to be used
def _with_http_scheme(url):
    """Return ``url`` as a string, prefixed with ``http://`` only when it has no scheme yet.

    Prefixing unconditionally turned already-qualified URLs into
    ``http://http://...`` and made the cell unsafe to run twice.
    """
    url = str(url)
    if url.startswith(('http://', 'https://')):
        return url
    return 'http://' + url


gossipcop_fake['news_url'] = gossipcop_fake['news_url'].apply(_with_http_scheme)

gossipcop_fake.head()

In [None]:
import requests
from bs4 import BeautifulSoup

## request html script from the site
# Without a timeout requests waits indefinitely, so an unresponsive server would
# hang the notebook cell.
response = requests.get('http://www.dailymail.co.uk/tvshowbiz/article-5874213/Did-Miley-Cyrus-Liam-Hemsworth-secretly-married.html', timeout=30)

#print(response.text)

In [None]:
pip install newspaper3k

In [None]:
from newspaper import Article
from tqdm import tqdm

# Try the extraction on a single article before running it on the whole table.
article = Article('http://www.dailymail.co.uk/tvshowbiz/article-5874213/Did-Miley-Cyrus-Liam-Hemsworth-secretly-married.html', language='en')

# download() fetches the page, parse() extracts its content; both must run
# before article.text is read.
article.download()
article.parse()

# Keep the extracted body in art_text; the next cell post-processes it.
art_text = article.text
print("Article's Text:")
print(article.text)

In [None]:
# Flatten the article onto one line. Line breaks become a single space (blank
# lines are skipped): simply deleting '\n' glued the last word of a paragraph
# to the first word of the next one.
art_text = ' '.join(part for part in art_text.split('\n') if part)
print(art_text)
#list_art = list(art_text.split(" "))
#list_art

In [None]:
from newspaper import Article
from tqdm import tqdm

# Scrape the body text of every URL in gossipcop_fake['news_url'].
# A failed download/parse is recorded as the placeholder 'error', so `body`
# always has one entry per row and can be indexed like gossipcop_fake.
body = []

for url in tqdm(gossipcop_fake['news_url']):
  try:
    article = Article(url, language='en')
    article.download()
    article.parse()
  except Exception:
    # Best effort: any scraping failure yields the placeholder. Catching
    # Exception rather than using a bare `except:` keeps KeyboardInterrupt
    # working, so this long loop can still be stopped by hand.
    body.append('error')
  else:
    body.append(article.text)

gossipcop_fake_body = pd.DataFrame(data = body, index = gossipcop_fake.index, columns = ['Body'])

In [None]:
# Display the scraped bodies (rows holding 'error' are the articles that could not be fetched).
gossipcop_fake_body

In [None]:
# Attach the scraped text as a 'Body' column; both frames share gossipcop_fake's index.
# NOTE(review): running this cell twice appends a second 'Body' column,
# because concat does not replace an existing column of the same name.
gossipcop_fake = pd.concat([gossipcop_fake, gossipcop_fake_body], axis=1)
gossipcop_fake

# Pre-processing: bag of words

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load dataframe_text_no_tweets_v2.csv into df_text.
df_text = pd.read_csv("/content/drive/My Drive/243/FakeNewsNet/dataset/dataframe_text_no_tweets_v2.csv")

In [None]:
# Preview the first rows of df_text.
df_text.head()

Target = 0 -> the article is false    
Target = 1 -> the article is true

In [None]:
# Number of rows for each value of the target column (class balance).
df_text['target'].value_counts()

In [None]:
# Keep only the rows whose source is gossipcop, then preview them.
df_content_gossipcop = df_text[df_text["source"].eq('gossipcop')]
df_content_gossipcop.head()

In [None]:
# Body text of the gossipcop rows; this series is the input of the cleaning steps below.
content = df_content_gossipcop['body']
content

## Cleaning

In [None]:
#Lowercase
# NOTE(review): a missing body (NaN) stays NaN here and would make the
# character-level cleaners below fail - confirm the column has no missing values.
content_lowercase = content.str.lower()

In [None]:
# remove punctuation
from string import punctuation
def remove_punctuation(document):
    """Return ``document`` with every character of ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    strip_table = str.maketrans('', '', punctuation)
    return document.translate(strip_table)
# Apply the punctuation filter to every lower-cased body.
text_no_punct = content_lowercase.apply(remove_punctuation)

In [None]:
# remove digits
def remove_digit(document):
    """Return ``document`` without the characters for which ``str.isdigit()`` is true."""
    kept = []
    for symbol in document:
        if symbol.isdigit():
            continue
        kept.append(symbol)
    return ''.join(kept)
# Apply the digit filter to every punctuation-free body.
text_no_digit = text_no_punct.apply(remove_digit)

In [None]:
# tokenization (split words)
import nltk 
# Fetch the 'punkt' resource before word_tokenize is called.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Each body string becomes a list of tokens.
text_tokenized = text_no_digit.apply(word_tokenize)
text_tokenized.head()

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stored as a set so the per-token membership test in remove_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stopwords(list):
    """Return the tokens of ``list`` that are not in the ``stop_words`` set.

    The parameter name shadows the builtin ``list``; it is left unchanged so
    the function's signature stays the same for existing callers.
    """
    kept = []
    for token in list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from every token list, then preview the result.
text_no_stop = text_tokenized.apply(remove_stopwords)
text_no_stop.head()

In [None]:
# stemming (remove end of words)
from nltk.stem import PorterStemmer
# One shared stemmer instance, used by stemmer() below.
porter = PorterStemmer()
def stemmer(document):
    """Return a new list with ``porter.stem`` applied to each token of ``document``."""
    return list(map(porter.stem, document))
# Stem every token list, then preview the result.
text_stemmed = text_no_stop.apply(stemmer)
text_stemmed.head()

In [None]:
# detokenization (merge words)
# The vectorizer used in the next section is fitted on this series, so each
# token list is joined back into a single string per article.
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_detokenized = text_stemmed.apply(TreebankWordDetokenizer().detokenize)

## Most frequent words analysis

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix: one row per gossipcop article, one column per term,
# each cell holding how often the term occurs in the article.
countvec = CountVectorizer()

sparse_dtm = countvec.fit_transform(text_detokenized)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(countvec, 'get_feature_names_out'):
    feature_names = countvec.get_feature_names_out()
else:
    feature_names = countvec.get_feature_names()
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=feature_names, index=df_content_gossipcop.index)

In [None]:
# Total number of occurrences of each term over all articles, most frequent first.
frequencies = dtm.sum().sort_values(ascending=False)
# Show only the terms that occur more than 50 times.
print(frequencies[frequencies > 50])

In [None]:
# seaborn is used below but is not imported in the visible part of this
# notebook, so the cell failed with NameError on `sns`.
import seaborn as sns

plt.figure(figsize=(8,6))

# Pass the series explicitly as `x`: seaborn >= 0.12 treats the first
# positional argument as `data`, older releases treated it as `x`.
ax = sns.countplot(x=frequencies)
plt.xticks(np.arange(1, 50, step=5), np.arange(1, 50, step=5))

# freq = pd.DataFrame(frequencies, columns=['Frequencies']).transpose()
# ax = sns.countplot(freq)
# ax = sns.distplot(frequencies, bins=len(frequencies))
# ax = plt.hist(frequencies[frequencies > 50])

plt.xlabel('terms')
plt.ylabel(' ')
plt.show()

In [None]:
# Keep only the terms that appear in at least 5% of the articles.
# A float min_df is a proportion of documents, so the matching absolute
# number of articles depends on the size of the corpus.
from sklearn.feature_extraction.text import CountVectorizer
countvec2 = CountVectorizer(min_df=0.05)
sparse_dtm2 = countvec2.fit_transform(text_detokenized)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name on releases that predate it.
if hasattr(countvec2, 'get_feature_names_out'):
    feature_names2 = countvec2.get_feature_names_out()
else:
    feature_names2 = countvec2.get_feature_names()
dtm2 = pd.DataFrame(sparse_dtm2.toarray(), columns=feature_names2, index=df_content_gossipcop['body'].index)
dtm2.sum().sort_values(ascending=False)

In [None]:
# Preview the reduced document-term matrix.
dtm2.head()

# Data Analysis: Number of tweets

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load dataframe_text_no_tweets_v2.csv into df_text.
df_text = pd.read_csv("/content/drive/My Drive/243/FakeNewsNet/dataset/dataframe_text_no_tweets_v2.csv")

In [None]:
# Preview the first rows of df_text.
df_text.head()

## Sum

In [None]:
# Total of number_tweets for each target class.
sum_tweets = df_text.groupby(['target'])[['number_tweets']].sum()
sum_tweets

Target = 0: article is false    
Target = 1: article is true

In [None]:
# Bar chart of the summed tweet counts, one bar per target class
# (tick 0 is labelled 'Fake', tick 1 'Real').
plt.bar(
    x=sum_tweets.index,
    height=sum_tweets['number_tweets'],
    color=['orange', 'green'],
    width=0.6,
)
plt.title('Sum of tweets reacting to Fake and Real News')
plt.xticks([0, 1], ['Fake', 'Real'])
plt.yticks(range(0, 1100000, 200000))

## Mean

In [None]:
# Mean of number_tweets for each target class.
mean_tweets = df_text.groupby(['target'])[['number_tweets']].mean()
mean_tweets

In [None]:
# Standard deviation of number_tweets for each target class.
# It is stored under its own name: assigning it to `mean_tweets` overwrote the
# mean, so the "Mean of the number of tweets" bar chart that follows was
# actually plotting the standard deviation.
std_tweets = df_text.groupby(['target']).agg({'number_tweets':'std'})
std_tweets

In [None]:
# Bar chart of the values held in mean_tweets, one bar per target class
# (tick 0 is labelled 'Fake', tick 1 'Real').
plt.bar(
    x=mean_tweets.index,
    height=mean_tweets['number_tweets'],
    color=['orange', 'green'],
    width=0.6,
)
plt.title('Mean of the number of tweets reacting to Fake and Real News')
plt.xticks([0, 1], ['Fake', 'Real'])

## According to the source

### Sum

In [None]:
# Total of number_tweets for each (target, source) pair.
sum_tweets_source = df_text.groupby(['target', 'source'])[['number_tweets']].sum()
sum_tweets_source

In [None]:
# Inspect the (target, source) index to see the order of the groups;
# the bar labels in the next cell are written by hand in that order.
sum_tweets_source.index

In [None]:
# One bar per (target, source) group. The labels are hard-coded and must follow
# the row order of sum_tweets_source; the two colors repeat, so bars of the
# same source share a color.
plt.bar(
    x=['False, gossipcop', 'False, politifact', 'True, gossipcop', 'True, politifact'],
    height=sum_tweets_source['number_tweets'],
    color=['blue', 'orange'],
    width=0.6,
)
plt.title('Sum of tweets reacting to Fake and Real News according to the source')

### Mean

In [None]:
# Mean of number_tweets for each (target, source) pair.
mean_tweets_source = df_text.groupby(['target', 'source'])[['number_tweets']].mean()
mean_tweets_source

In [None]:
# One bar per (target, source) group. The labels are hard-coded and must follow
# the row order of mean_tweets_source; the two colors repeat, so bars of the
# same source share a color.
plt.bar(
    x=['False, gossipcop', 'False, politifact', 'True, gossipcop', 'True, politifact'],
    height=mean_tweets_source['number_tweets'],
    color=['blue', 'orange'],
    width=0.6,
)
plt.title('Mean of the number of tweets reacting to Fake and Real News according to the source')

### Standard deviation

In [None]:
# Standard deviation of number_tweets for each (target, source) pair.
std_tweets_source = df_text.groupby(['target', 'source'])[['number_tweets']].std()
std_tweets_source

In [None]:
# One bar per (target, source) group. The labels are hard-coded and must follow
# the row order of std_tweets_source; the two colors repeat, so bars of the
# same source share a color.
plt.bar(
    x=['False, gossipcop', 'False, politifact', 'True, gossipcop', 'True, politifact'],
    height=std_tweets_source['number_tweets'],
    color=['blue', 'orange'],
    width=0.6,
)
plt.title('Standard deviation of the number of tweets reacting to Fake and Real News according to the source')

## Distribution of the data

### False vs True articles

In [None]:
# Select the false articles (target == 0) with a boolean row filter.
# DataFrame.where() keeps the frame's original shape and replaces every
# non-matching row with NaN (upcasting integer columns to float), which is
# where the unexpected NaN rows came from.
df_text_false = df_text.loc[df_text['target'] == 0]
df_text_false

In [None]:
# Drop every row whose number_tweets is NaN (this includes any row blanked
# out by a .where() mask).
df_text_false = df_text_false.dropna(subset = ['number_tweets'], axis=0)
df_text_false

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the false articles.
df_text_false.describe()

Only the column number_tweets is relevant.    
As we can see, the standard deviation is very high, so we can conclude that the values are widely scattered and the mean may not be very informative. Looking at the quartiles (especially the median and the 75% quartile), we can conclude that most values are very low and that only a few articles generate a huge number of tweets (probably the hot topics).    
In that case, it might be more relevant to study the median instead of the mean.

In [None]:
# Select the true articles (target == 1) that have a tweet count.
# A boolean row filter is used instead of DataFrame.where(), which would keep
# every row, fill the non-matching ones with NaN and upcast integer columns.
df_text_true = df_text.loc[df_text['target'] == 1].dropna(subset = ['number_tweets'], axis=0)
df_text_true

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the true articles.
df_text_true.describe()

Only the column number_tweets is relevant.    
As we can see, the standard deviation is very high, so we can conclude that the values are widely scattered and the mean may not be very informative. Looking at the quartiles (especially the median and the 75% quartile), we can conclude that most values are very low and that only a few articles generate a huge number of tweets (probably the hot topics).    
In that case, it might be more relevant to study the median instead of the mean.

In [None]:
from matplotlib.patches import Rectangle

# Overlaid histograms of number_tweets for true and false articles.
# Color and legend label are attached to each histogram directly; the previous
# version built `colors`/`handles` that were never passed to the legend, so the
# labelling silently depended on the plotting order.
# 'tab:blue' / 'tab:orange' are matplotlib's first two default colors.
bins = range(0,1500, 50)
plt.hist(df_text_true['number_tweets'], bins = bins, alpha=0.5, color='tab:blue', label='True articles')
plt.hist(df_text_false['number_tweets'], bins = bins, alpha=0.5, color='tab:orange', label='False articles')

plt.legend()
plt.title('Distribution of the number of tweets reacting to Fake and Real News')

**Comparison**

In [None]:
# Side-by-side summary statistics of number_tweets for true and false articles.
# `keys` names the two columns; without it both were called 'number_tweets',
# and a rename by column name cannot tell them apart.
comparison = pd.concat(
    [df_text_true.describe()['number_tweets'], df_text_false.describe()['number_tweets']],
    axis = 1,
    keys = ['True articles', 'False articles'],
)
comparison

The mean is twice as high when the article is false, and so is the standard deviation (the number of tweets varies more). The quartiles show that this is because only a few false articles generate a lot of tweets.

In [None]:
# Tweet counts as plain Python lists. The call parentheses were missing, which
# stored the bound `to_list` method in these variables instead of the values.
real_tweets = df_text_true['number_tweets'].to_list()
fake_tweets = df_text_false['number_tweets'].to_list()

In [None]:
import pylab

# Box plots of number_tweets: box 1 is the true articles, box 2 the false ones.
plt.boxplot([df_text_true['number_tweets'].values, df_text_false['number_tweets'].values])
# Boxes are drawn at x positions 1 and 2, hence these tick positions.
pylab.xticks([1,2], ['Real News', 'Fake News'])
plt.title('Distribution of the number of tweets reacting to Fake and Real News')
plt.show()

In [None]:
# Same box plots as the full-range figure, with the y axis limited to 0-250
# so the boxes themselves are readable.
plt.boxplot([
    df_text_true['number_tweets'].values,
    df_text_false['number_tweets'].values,
])
plt.ylim(0, 250)
plt.title('Distribution of the number of tweets reacting to Fake and Real News')
plt.xticks([1, 2], ['Real News', 'Fake News'])
plt.show()

### False vs True according to the source of the article


In [None]:
# False articles coming from gossipcop that have a tweet count.
# A boolean row filter replaces DataFrame.where(), which keeps every row and
# fills the non-matching ones with NaN.
df_text_false_gossipcop = df_text_false.loc[df_text_false['source'] == 'gossipcop'].dropna(subset = ['number_tweets'], axis=0)
df_text_false_gossipcop

In [None]:
# Remaining (class, source) subsets, each restricted to rows with a tweet count.
# Boolean row filters replace DataFrame.where(), which keeps every row and
# fills the non-matching ones with NaN.
df_text_true_gossipcop = df_text_true.loc[df_text_true['source'] == 'gossipcop'].dropna(subset = ['number_tweets'], axis=0)

df_text_false_politifact = df_text_false.loc[df_text_false['source'] == 'politifact'].dropna(subset = ['number_tweets'], axis=0)

df_text_true_politifact = df_text_true.loc[df_text_true['source'] == 'politifact'].dropna(subset = ['number_tweets'], axis=0)

In [None]:
# Side-by-side summary statistics of number_tweets for each (class, source) subset.
# `keys` puts the labels on the columns themselves; previously all four columns
# were called 'number_tweets' and the labels were only printed above the table.
comparison = pd.concat(
    [
        df_text_false_gossipcop.describe()['number_tweets'],
        df_text_true_gossipcop.describe()['number_tweets'],
        df_text_false_politifact.describe()['number_tweets'],
        df_text_true_politifact.describe()['number_tweets'],
    ],
    axis = 1,
    keys = ['False - gossipcop', 'True - gossipcop', 'False - politifact', 'True - politifact'],
)
comparison

Articles from the website politifact generate more tweets than those from the website gossipcop. However, the number of tweets reacting to an article varies much more from one article to another for politifact than for gossipcop.    
The maximum number of tweets for an article is ten times higher for politifact than for gossipcop. But, for both sources, we notice that the 75% quartile is quite low compared to the mean and the maximum, which shows again that a few articles generated a lot of tweets, regardless of whether the article is true or false.

**Politifact**

In [None]:
# Overlaid histograms of number_tweets for the politifact articles.
# Color and legend label are attached to each histogram directly. The previous
# `colors`/`handles` were never passed to the legend, and the false articles
# were drawn in matplotlib's first default color (blue) although the list
# declared them orange.
bins = range(0,1500, 50)
plt.hist(df_text_false_politifact['number_tweets'], bins = bins, alpha=0.5, color='tab:orange', label='False articles')
plt.hist(df_text_true_politifact['number_tweets'], bins = bins, alpha=0.5, color='tab:blue', label='True articles')
plt.title('Histogram of the number of tweets for the article from politifact')
plt.legend()

**Gossipcop**

In [None]:
# Overlaid histograms of number_tweets for the gossipcop articles.
# Color and legend label are attached to each histogram directly. The previous
# `colors`/`handles` were never passed to the legend, and the false articles
# were drawn in matplotlib's first default color (blue) although the list
# declared them orange.
bins = range(0,1500, 50)
plt.hist(df_text_false_gossipcop['number_tweets'], bins = bins, alpha=0.5, color='tab:orange', label='False articles')
plt.hist(df_text_true_gossipcop['number_tweets'], bins = bins, alpha=0.5, color='tab:blue', label='True articles')
plt.title('Histogram of the number of tweets for the article from gossipcop')
plt.legend()

# Data Pre-processing

## Load in data

In [None]:
# Mount Google Drive at /content/drive; merge_df.csv is read from there and
# the train/test/validation splits are written back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load merge_df.csv into merge_df and display it.
merge_df = pd.read_csv('/content/drive/MyDrive/243/FakeNewsNet/dataset/merge_df.csv')
merge_df

In [None]:
# Count how many rows carry each id; a count above 1 means the id is not unique.
merge_df['id'].value_counts()

## Modify the index

In [None]:
# Inspect the current row index of merge_df.
merge_df.index

In [None]:
#Add 'g_' for gossipcop and 'p_' for politifact in front of the id
# Vectorised: the prefix is derived from the whole 'source' column at once and
# concatenated with the stringified ids, instead of writing one cell per row
# with .loc. As before, every source other than 'gossipcop' gets 'p_'.
source_prefix = merge_df['source'].eq('gossipcop').map({True: 'g_', False: 'p_'})
merge_df['index'] = source_prefix + merge_df['id'].astype(str)
merge_df

In [None]:
# NOTE(review): set_index() returns a new DataFrame and the result is not
# assigned, so this cell only displays the re-indexed table. merge_df itself
# keeps its original index and still carries 'index' as a regular column.
merge_df.set_index('index')

## Drop the columns

In [None]:
# Remove the id, source and news_url columns from merge_df.
merge_df = merge_df.drop(columns = ['id', 'source', 'news_url'])

## Source URL: keep only the 10 most frequent

In [None]:
# The ten most frequent source_url values with their row counts.
merge_df['source_url'].value_counts().head(10)

In [None]:
# value_counts() sorts by decreasing frequency, so the first ten index entries
# are the ten most frequent sources.
source_to_keep = merge_df['source_url'].value_counts().index[:10]
source_to_keep

In [None]:
# Spot check: is 'today' one of the kept sources?
'today' in source_to_keep

In [None]:
# Replace every source_url that is not among the kept sources with 'other'.
# A single boolean mask replaces the row-by-row .loc loop; missing values are
# not in source_to_keep either, so they also become 'other', as before.
is_rare_source = ~merge_df['source_url'].isin(source_to_keep)
merge_df.loc[is_rare_source, 'source_url'] = 'other'
merge_df

In [None]:
# Check the result: row counts per remaining source_url value, including 'other'.
merge_df['source_url'].value_counts()

In [None]:
# One-hot encode source_url: one indicator column per distinct value; the
# other columns are left unchanged.
merge_df_enc = pd.get_dummies(merge_df, columns = ['source_url'])
merge_df_enc

## Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; the fixed random_state makes the split reproducible.
df_train, df_test = train_test_split(merge_df_enc, test_size=0.25, random_state=88) # split the data set

In [None]:
# Carve the validation set out of the training rows (13% of them).
df_train2, df_val = train_test_split(df_train, test_size=0.13, random_state=88) # split the data set

In [None]:
# Report the share of the full table that ended up in each split.
print(
    f"Training set: {len(df_train2)/len(merge_df_enc)} "
    f"; Testing set: {len(df_test)/len(merge_df_enc)} "
    f"; Validation set: {len(df_val)/len(merge_df_enc)}"
)

In [None]:
# Write the three splits to Drive. to_csv() also writes the DataFrame index as
# the first column by default.
df_train2.to_csv('/content/drive/MyDrive/243/FakeNewsNet/dataset/clean_merged_data_train.csv')
df_test.to_csv('/content/drive/MyDrive/243/FakeNewsNet/dataset/clean_merged_data_test.csv')
df_val.to_csv('/content/drive/MyDrive/243/FakeNewsNet/dataset/clean_merged_data_val.csv')