# Get the data from Kaggle

In [None]:
### This requires an API token .json file from kaggle

### get it from going to https://www.kaggle.com/settings/account and clicking `create new token`

### Then place the token .json file in your google drive, and copy the location in `kaggle_creds_path` variable and command below it

from google.colab import drive
drive.mount("/content/drive")

kaggle_creds_path = "/content/drive/MyDrive/Kaggle/kaggle.json"
! cp /content/drive/MyDrive/Kaggle/kaggle.json .

! pip install kaggle --quiet

!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d saurabhshahane/fake-news-classification

import zipfile
import os
import shutil
location = '/content/fake-news-classification'
zip_ref = zipfile.ZipFile(location+'.zip', 'r')

if os.path.isdir(location):
    shutil.rmtree(location)
    os.mkdir(location)
else:
    os.mkdir(location)

zip_ref.extractall(location)
zip_ref.close()

# Unmount your Google Drive
drive.flush_and_unmount()

# Exploratory analysis and modelling

In [None]:
#importing Libraries
import numpy as np
import pandas as pd
from matplotlib.pylab import plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

In [None]:
# Load the extracted WELFake CSV into a DataFrame and preview its first 10 rows.
df = pd.read_csv('/content/fake-news-classification/WELFake_Dataset.csv')
df.head(10)

In [None]:
# Summary statistics of the freshly loaded frame.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Target vector: the `label` column. The relative class frequencies are shown
# with the numeric labels renamed for readability (1 -> real, 0 -> fake).
y = df['label']
print('Ratio of real and fake news:')
y.value_counts(normalize=True).rename(index={1: 'real', 0: 'fake'})

In [None]:
# Drop the "Unnamed: 0" column (presumably a row index saved into the CSV).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Horizontal bar chart of the number of missing values in each column.
df.isna().sum().plot.barh()
plt.show()

In [None]:
# Number of missing values per column, as a table.
df.isnull().sum()

In [None]:
# Replace every missing value with an empty string.
# Note: this applies to all columns of the frame, not only the text columns.
df = df.fillna('')

In [None]:
# Re-check missing values per column; after filling, every count should be zero.
df.isnull().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Combine headline and body into one text field. The explicit space keeps the
# last word of the title from fusing with the first word of the text.
df["title_text"] = df["title"] + " " + df["text"]
# Length in characters, not counting spaces. Computed with vectorised string
# methods instead of a per-row lambda; the separator added above is itself a
# space, so it does not change this count.
df["body_len"] = df["title_text"].str.len() - df["title_text"].str.count(" ")
df.head()

In [None]:
# Overlaid histograms of body_len (non-space character count) for each class.
# NOTE(review): the bins only span 0-200 characters; rows whose body_len is
# larger fall outside the bin range and are not drawn — confirm this range is
# intended for full news articles.
bins = np.linspace(0, 200, 40)

# Legend labels follow the convention used elsewhere in this notebook
# (label 0 = fake, label 1 = real); they were previously swapped here.
plt.hist(df.loc[df["label"] == 0, "body_len"], bins, alpha=0.5, label="Fake", color="#FF5733")
plt.hist(df.loc[df["label"] == 1, "body_len"], bins, alpha=0.5, label="Real", color="#33FFB8")
plt.xlabel("Body length (non-space characters)")
plt.ylabel("Number of articles")
plt.legend(loc="upper left")
plt.show()

In [None]:
# Bar chart of how many rows carry each label value.
class_names = ['fake', 'real']
label_count = df['label'].value_counts()
ax = sns.barplot(x=label_count.index, y=label_count)
ax.set_title('Distribution of Fake/Real News', fontsize=14)

In [None]:
# Hold out 33% of the rows for testing. Features are the raw `text` column and
# the target is `y` (the label column captured earlier); the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33, random_state=53)

In [None]:
# Word cloud built from every headline in the dataset.
titles = ' '.join(df['title'])
wordcloud = WordCloud(background_color='white', max_words=300, width=800, height=400)
wordcloud.generate(titles)

# Render the cloud on a large canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Split the training texts by class (label 0 -> fake, label 1 -> real), then
# build a word cloud from the fake-news texts only.
fake_news = X_train.loc[y_train == 0]
real_news = X_train.loc[y_train == 1]
fake_texts = ' '.join(fake_news)
wordcloud = WordCloud(background_color='white', max_words=300, width=800, height=400)
wordcloud.generate(fake_texts)

# Render the cloud on a large canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Bag-of-words features with English stop words removed.
# The vocabulary is learned from the training texts only; the test texts are
# transformed with that same fitted vectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees trained on the bag-of-words features.
# random_state pins the forest's randomness so a Restart & Run All reproduces
# the same model (same seed as the train/test split); n_jobs=-1 builds the
# trees on all available cores.
model = RandomForestClassifier(n_estimators=300, random_state=53, n_jobs=-1)
model.fit(count_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
pred2 = model.predict(count_test)