In [23]:
import os
import pandas as pd
import re
import numpy as np
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from nltk.corpus import stopwords
from scipy import sparse, hstack, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
import pickle
import string

In [2]:
#importing stopwords
STOP_WORDS = stopwords.words('bengali')

In [3]:
#striping punctuations
puncList = ["।", "”", "“", "’"]
for p in string.punctuation.lstrip():
    puncList.append(p)

In [4]:
#word tokenizer
def tokenizer(doc):
    # remove punctuation
    tokens = []
    def cleanword(word):
        for p in puncList:
            word = word.replace(p, "")
        word = re.sub(r'[\u09E6-\u09EF]', "", word, re.DEBUG)  # replace digits

        return word

    for word in doc.split(" "):
        word = cleanword(word)
        if word != "":
            tokens.append(word)

    return tokens

In [5]:
#tf-idf vectorizer

def tfidf_wordF(X):
    tfidf_word = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 1),
                                 stop_words=STOP_WORDS, decode_error='replace',
                                 encoding='utf-8', analyzer='word', tokenizer=tokenizer)


    x_word = tfidf_word.fit_transform(X.values.astype('U'))
    # outfile = open("tfidf_word_pkl", 'wb')
    # pickle.dump(x_word, outfile)
    # outfile.close()
    return x_word

In [6]:
#importing dataset

# df = pd.read_csv("../Data/Corpus/AllDataTarget.csv")
df = pd.read_csv("../Fake News Dataset/Train_Test/TrainTest.csv")
df = df[df["articleID"] != 27753]
consistentID = set(df["articleID"])
print(df.shape)
head = list(df)
X = df.content

(49976, 8)


In [7]:
#calling tf-idf vectorizer
X_word = tfidf_wordF(X)
X = X_word
Y = df[["label"]]
print(X.shape)
print(Y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, Y.values.ravel(), test_size=0.3, random_state=109)

(49976, 72448)
(49976, 1)


## SVM with TF-IDF

In [8]:
# Creating SVM Classifier
clf = svm.SVC(kernel='linear', C=10, cache_size=7000)

clf.fit(X_train, y_train)

#Save Model
# outfile = open("../API/model", 'wb')
# pickle.dump(clf, outfile)
# outfile.close()


y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9926632428466617
Precision: 0.9928610280119663
Recall: 0.999657721796276
F1-Score: 0.9962477827807341
Confusion Matrix: [[  280   105]
 [    5 14603]]
              precision    recall  f1-score   support

           0       0.98      0.73      0.84       385
           1       0.99      1.00      1.00     14608

    accuracy                           0.99     14993
   macro avg       0.99      0.86      0.92     14993
weighted avg       0.99      0.99      0.99     14993



## LR with TF-IDF 

In [9]:
lrclf = LogisticRegression()

lrclf.fit(X_train, y_train)

y_pred = lrclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))



Accuracy: 0.9782565197091976
Precision: 0.9781706173831526
Recall: 1.0
F1-Score: 0.9889648635840499
Confusion Matrix: [[   59   326]
 [    0 14608]]
              precision    recall  f1-score   support

           0       1.00      0.15      0.27       385
           1       0.98      1.00      0.99     14608

    accuracy                           0.98     14993
   macro avg       0.99      0.58      0.63     14993
weighted avg       0.98      0.98      0.97     14993



## MNB with TF-IDF 

In [17]:
# tune the hyperparameter alpha for the naive bayes classifier
for alpha in np.arange(0,1,.05):
    mnb_classifier_tune = MultinomialNB(alpha=alpha)
    mnb_classifier_tune.fit(X_train, y_train)
    pred_tune = mnb_classifier_tune.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.00 Score: 0.98459
Alpha: 0.05 Score: 0.98459
Alpha: 0.10 Score: 0.98459
Alpha: 0.15 Score: 0.98459
Alpha: 0.20 Score: 0.98459
Alpha: 0.25 Score: 0.98459
Alpha: 0.30 Score: 0.98459
Alpha: 0.35 Score: 0.98459
Alpha: 0.40 Score: 0.98459
Alpha: 0.45 Score: 0.98459
Alpha: 0.50 Score: 0.98459
Alpha: 0.55 Score: 0.98459
Alpha: 0.60 Score: 0.98459
Alpha: 0.65 Score: 0.98459
Alpha: 0.70 Score: 0.98459
Alpha: 0.75 Score: 0.98459
Alpha: 0.80 Score: 0.98459
Alpha: 0.85 Score: 0.98459
Alpha: 0.90 Score: 0.98459
Alpha: 0.95 Score: 0.98459


In [19]:
mnbclf = MultinomialNB(alpha = 0.01)

mnbclf.fit(X_train, y_train)

y_pred = mnbclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9845928099779897
Precision: 0.9919249982891946
Recall: 0.9922645125958379
F1-Score: 0.9920947263954006
Confusion Matrix: [[  267   118]
 [  113 14495]]
              precision    recall  f1-score   support

           0       0.70      0.69      0.70       385
           1       0.99      0.99      0.99     14608

    accuracy                           0.98     14993
   macro avg       0.85      0.84      0.85     14993
weighted avg       0.98      0.98      0.98     14993



## Passive Agressive Classifier With TF-IDF Vectorizer 

In [20]:
linearclf = PassiveAggressiveClassifier(max_iter=10)

linearclf.fit(X_train, y_train)

y_pred = linearclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9931301273927833
Precision: 0.9934009116266412
Recall: 0.9995892661555312
F1-Score: 0.9964854812843349
Confusion Matrix: [[  288    97]
 [    6 14602]]
              precision    recall  f1-score   support

           0       0.98      0.75      0.85       385
           1       0.99      1.00      1.00     14608

    accuracy                           0.99     14993
   macro avg       0.99      0.87      0.92     14993
weighted avg       0.99      0.99      0.99     14993



## XGBoost Classifier With TF-IDF Vectorizer

In [21]:
xgbclf = XGBClassifier()

xgbclf.fit(X_train, y_train)

y_pred = xgbclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))



Accuracy: 0.9870606282932035
Precision: 0.9876843957233726
Recall: 0.9991785323110625
F1-Score: 0.9933982168379502
Confusion Matrix: [[  203   182]
 [   12 14596]]
              precision    recall  f1-score   support

           0       0.94      0.53      0.68       385
           1       0.99      1.00      0.99     14608

    accuracy                           0.99     14993
   macro avg       0.97      0.76      0.84     14993
weighted avg       0.99      0.99      0.99     14993



## CatBoost Classifier With TF-IDF Vectorizer 

In [24]:
catclf = CatBoostClassifier(verbose=0, n_estimators=100)

catclf.fit(X_train, y_train)

y_pred = catclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.983925832054959
Precision: 0.9843570898793068
Recall: 0.9993838992332968
F1-Score: 0.9918135806243419
Confusion Matrix: [[  153   232]
 [    9 14599]]
              precision    recall  f1-score   support

           0       0.94      0.40      0.56       385
           1       0.98      1.00      0.99     14608

    accuracy                           0.98     14993
   macro avg       0.96      0.70      0.78     14993
weighted avg       0.98      0.98      0.98     14993



## AdaBoost Classifier With TF-IDF Vectorizer 

In [25]:
adaclf = AdaBoostClassifier()

adaclf.fit(X_train, y_train)

y_pred = adaclf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1-Score:", metrics.f1_score(y_test, y_pred))
print("Confusion Matrix:", metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9801907556859868
Precision: 0.9831859004659329
Recall: 0.9967141292442497
F1-Score: 0.989903797124112
Confusion Matrix: [[  136   249]
 [   48 14560]]
              precision    recall  f1-score   support

           0       0.74      0.35      0.48       385
           1       0.98      1.00      0.99     14608

    accuracy                           0.98     14993
   macro avg       0.86      0.67      0.73     14993
weighted avg       0.98      0.98      0.98     14993

