In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Sklearn and nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
np.random.seed(42) #Reproducibility of results

In [None]:
Corpus = pd.read_csv("/kaggle/input/fake-news/train.csv")

In [None]:
Corpus.head()

In [None]:
Corpus.tail()

In [None]:
Corpus.info()

In [None]:
Corpus.shape

In [None]:
# Get X
X=Corpus.drop('label', axis=1)

X.head()

In [None]:
#get Y
Y = Corpus['label']

Y.head()

In [None]:
Y.value_counts()  ## Similar count for both labels, hence a symmetric (balanced) dataset

In [None]:
# Check whether every entry of the 'text' column is already a Python string.
# isinstance() applied to the column itself tests the pandas Series object, which is never
# a str, so that check printed False regardless of the data. Testing element-wise shows
# whether any entries (e.g. missing values, which pandas reads as float NaN) still need
# converting before tokenization.
print(Corpus['text'].map(lambda entry: isinstance(entry, str)).all())
print(Corpus['text'])

It's text data: each row contains a title and the article text, with a label (fake = 1, not fake = 0).

# **Data Preprocessing**

Steps:  

1. Remove Blank rows in Data, if any
2. Change all the text to lower case
3. Word Tokenization
4. Remove Stop words
5. Remove Non-alpha text
6. Word Lemmatization

In [None]:
# Getting the NLTK resources used below: punkt (tokenizer), wordnet (lemmatizer),
# averaged_perceptron_tagger (POS tagger) and stopwords (stop-word list).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# Step - a : Remove blank rows if any.
# Calling dropna(inplace=True) on Corpus['text'] only alters a temporary Series and leaves the
# frame untouched, so rows without text used to survive and turn into the literal token 'nan'.
# Dropping on the frame removes them together with their labels; the index is rebuilt so it
# stays 0..n-1.
Corpus = Corpus.dropna(subset=['text']).reset_index(drop=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [str(entry).lower() for entry in Corpus['text']]
# Step - c : Tokenization : each entry in the corpus is broken into a list of words.
# NOTE: as before, 'text' now holds token lists; re-run the loading cell to get the raw text back.
Corpus['text'] = [word_tokenize(entry) for entry in Corpus['text']]
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb, adjective etc.
# Any tag that is not mapped below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Loop invariants: load the stop-word list once into a set (it was re-read for every single
# token before) and reuse one lemmatizer for the whole corpus.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
text_final = []
for entry in Corpus['text']:
    # pos_tag provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else;
    # only the first letter of the tag is used to pick the WordNet category.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the list, the same format as before.
    text_final.append(str(Final_words))
# Assign the whole column at once instead of writing one cell per row through .loc.
Corpus['text_final'] = text_final

In [None]:
# Preparing train and test sets (80/20 hold-out split).
# random_state pins the shuffle so that re-running this cell on its own gives the same split;
# without it the split is drawn from NumPy's global RNG, whose state advances with every use.
# 42 matches the value passed to np.random.seed earlier in the notebook.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'], Corpus['label'], test_size=0.2, random_state=42
)

In [None]:
#Word vectorization
# Learn the vocabulary (top 5000 terms) and the IDF weights from the training split only.
# Fitting on the whole corpus lets statistics of the held-out rows shape the features,
# which makes the hold-out accuracies optimistic.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Train_X)
# Both splits are projected with the vectorizer fitted on the training data.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
print(Tfidf_vect.vocabulary_)

In [None]:
print(Train_X_Tfidf)

In [None]:
#Naive Bayes
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the correct order
# keeps this call safe to copy for metrics that are not.
print("Naive Bayes Accuracy Score -> ", accuracy_score(Test_Y, predictions_NB)*100)

In [None]:
#SVM Classifier
# Classifier - Algorithm - SVM with a linear kernel.
# gamma is not passed: it is a coefficient of the rbf/poly/sigmoid kernels only and has no
# effect on a linear kernel.
SVM = svm.SVC(C=1.0, kernel='linear')
# fit the training dataset on the classifier
SVM.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the correct order
# keeps this call safe to copy for metrics that are not.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM)*100)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression (liblinear solver, C=5) fitted on the TF-IDF training features.
log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
model = log_reg.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_Logreg = log_reg.predict(Test_X_Tfidf)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the correct order
# keeps this call safe to copy for metrics that are not.
print("Logistic Regression Accuracy Score -> ", accuracy_score(Test_Y, predictions_Logreg)*100)

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# 200 shallow trees (max_depth=5); random_state fixes the bootstrap/feature sampling.
random_forest = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model = random_forest.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_random = random_forest.predict(Test_X_Tfidf)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but the correct order
# keeps this call safe to copy for metrics that are not.
print("Random Forest Accuracy Score -> ", accuracy_score(Test_Y, predictions_random)*100)

In [None]:
#Preparing the test set for prediction
Corpus_test = pd.read_csv("/kaggle/input/fake-news/test.csv")

# Getting the NLTK resources used below: punkt (tokenizer), wordnet (lemmatizer),
# averaged_perceptron_tagger (POS tagger) and stopwords (stop-word list).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# Step - a : Handle blank rows.
# Calling dropna(inplace=True) on Corpus_test['text'] only alters a temporary Series and leaves
# the frame untouched, so missing texts used to turn into the literal token 'nan'.
# Rows are kept here (every id is written to the submission further down), so missing text is
# replaced by an empty string, which yields an empty token list.
Corpus_test['text'] = Corpus_test['text'].fillna('')
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus_test['text'] = [str(entry).lower() for entry in Corpus_test['text']]
# Step - c : Tokenization : each entry in the corpus is broken into a list of words.
# NOTE: as before, 'text' now holds token lists rather than the raw text.
Corpus_test['text'] = [word_tokenize(entry) for entry in Corpus_test['text']]
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb, adjective etc.
# Any tag that is not mapped below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Loop invariants: load the stop-word list once into a set (it was re-read for every single
# token before) and reuse one lemmatizer for the whole corpus.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
text_final = []
for entry in Corpus_test['text']:
    # pos_tag provides the 'tag', i.e. whether the word is a noun (N), verb (V) or something else;
    # only the first letter of the tag is used to pick the WordNet category.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the list, the same format as before.
    text_final.append(str(Final_words))
# Assign the whole column at once instead of writing one cell per row through .loc.
Corpus_test['text_final'] = text_final

In [None]:
# Word vectorization of the submission set.
# Tfidf_vect_test is fitted on the test documents alone, so its vocabulary can be inspected;
# X_test itself is produced by Tfidf_vect, the vectorizer fitted earlier in the notebook.
test_documents = Corpus_test['text_final']
Tfidf_vect_test = TfidfVectorizer(max_features=5000).fit(test_documents)
X_test = Tfidf_vect.transform(test_documents)

In [None]:
print(Tfidf_vect_test.vocabulary_)

In [None]:
#Predictions using SVM
label_predictions = SVM.predict(X_test)

In [None]:
# Assemble the submission table (test-set id plus predicted label) and write it
# to submit.csv without the index column.
submission_columns = {'id' : Corpus_test['id'], 'label' : label_predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submit.csv', index=False)