In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.cm as cm

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import punkt
from nltk.corpus import stopwords, words
import re
import string

from itertools import combinations


In [11]:
train_df = pd.read_csv('Datasets\Train.csv')
test_df = pd.read_csv('Datasets\Test.csv')
df = pd.merge(test_df, train_df, on=('id','title', 'author', 'text'), how='outer', suffixes=('_', '_'))
df.head(10)

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...,
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori...",
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...,
8,20808,Urban Population Booms Will Make Climate Chang...,,Urban Population Booms Will Make Climate Chang...,
9,20809,,cognitive dissident,don't we have the receipt?,


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 25999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      26000 non-null  int64  
 1   title   25320 non-null  object 
 2   author  23540 non-null  object 
 3   text    25954 non-null  object 
 4   label   20800 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB


In [28]:
def cleaning_text(news_articles):
    #We will begin by Lower case
    news_articles = news_articles.lower()

    #And then we will remove the numbers from the text.
    news_articles = "".join([i for i in news_articles if not i.isdigit()])

    #We will then remove the punctuations
    news_articles = "".join(
        [i for i in news_articles if i not in string.punctuation])

    #We will then breakdown the text sentences into smaller portions.
    tokens = word_tokenize(news_articles)

    #Removing stop words and then we will begin the Lemmatization process

    lemmatizer = WordNetLemmatizer()

    stop_words = stopwords.words('english')

    news_articless = [
        lemmatizer.lemmatize(w) for w in tokens if w not in stop_words
    ]

    return news_articles

In [21]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rayni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rayni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rayni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)

In [52]:
labels = df.label
labels

5200     1.0
5201     0.0
5202     1.0
5203     1.0
5204     1.0
        ... 
25995    0.0
25996    0.0
25997    0.0
25998    1.0
25999    1.0
Name: label, Length: 18285, dtype: float64

In [53]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')


#We then create a function to remove the stopwords in our text.
def remove_stopwords(text):
    text_Array = text.split(' ')
    remove_words = " ".join([i for i in text_Array if i not in stop_words])
    return remove_words


#And here we will apply the remove_stopwords function. This will remove the stopwords from our dataset's text
df['content'] = df['title'] + " " + df['author']
df['content'] = df['content'].apply(remove_stopwords)

In [54]:

df.head(10)

Unnamed: 0,id,title,author,text,label,content
5200,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0,House Dem Aide: We Didn’t Even See Comey’s Let...
5201,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0.0,"FLYNN: Hillary Clinton, Big Woman Campus - Bre..."
5202,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1.0,Why Truth Might Get You Fired Consortiumnews.com
5203,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1.0,15 Civilians Killed In Single US Airstrike Hav...
5204,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1.0,Iranian woman jailed fictional unpublished sto...
5205,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0.0,Jackie Mason: Hollywood Would Love Trump He Bo...
5207,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0.0,Benoît Hamon Wins French Socialist Party’s Pre...
5209,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0.0,"A Back-Channel Plan Ukraine Russia, Courtesy T..."
5210,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0.0,Obama’s Organizing Action Partners Soros-Linke...
5211,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0.0,"BBC Comedy Sketch ""Real Housewives ISIS"" Cause..."


In [55]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(df['content'].values)
tfidf = transformer.fit_transform(counts)

In [56]:
targets = df['label'].values
targets

array([1., 0., 1., ..., 0., 1., 1.])

In [57]:
x_train, x_test, y_train, y_test = train_test_split(tfidf,
                                                    targets,
                                                    test_size=0.2,
                                                    random_state=49)


In [58]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report, roc_curve


def train(model, model_name):
    model.fit(x_train, y_train)
    print(
        f"Training accuracy of {model_name} is {model.score(x_train,y_train)}")
    print(f"testing accuracy of {model_name} is {model.score(x_test,y_test)}")


def conf_matrix(model):
    ConfusionMatrixDisplay.from_estimator(model, x_test, y_test)


def class_report(model):
    print(classification_report(y_test, model.predict(x_test)))




In [63]:
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier

model_svc = SVC()
model_lr = LinearRegression()
model_rfc = RandomForestClassifier(random_state=42)
params = {"n_estimators": range(50, 125, 25), "max_depth": range(60, 81, 2)}
rfc_model = GridSearchCV(model_rfc, param_grid=params, cv=5, n_jobs=-1, verbose=1)
model_dtc = DecisionTreeClassifier(max_depth=58, random_state=42)
model_dtc.fit(x_train,y_train)

train(model_svc, 'SVC')
train(model_lr, 'Linear Regression')
train(rfc_model, 'Random Forest Classifier')
train(model_dtc, 'Decision Tree Classifier')

Training accuracy of SVC is 0.9998632759092152
testing accuracy of SVC is 0.9860541427399507
Training accuracy of Linear Regression is 0.9999999999999136
testing accuracy of Linear Regression is 0.9230258182115304
Fitting 5 folds for each of 33 candidates, totalling 165 fits
Training accuracy of Random Forest Classifier is 0.9872846595570139
testing accuracy of Random Forest Classifier is 0.9685534591194969
Training accuracy of Decision Tree Classifier is 0.9976073284112661
testing accuracy of Decision Tree Classifier is 0.9945310363686082


In [65]:
class_report(model_svc)


              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      2126
         1.0       0.97      1.00      0.98      1531

    accuracy                           0.99      3657
   macro avg       0.98      0.99      0.99      3657
weighted avg       0.99      0.99      0.99      3657



In [68]:
class_report(rfc_model)

              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      2126
         1.0       0.93      1.00      0.96      1531

    accuracy                           0.97      3657
   macro avg       0.97      0.97      0.97      3657
weighted avg       0.97      0.97      0.97      3657



In [69]:
class_report(model_dtc)

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      2126
         1.0       0.99      1.00      0.99      1531

    accuracy                           0.99      3657
   macro avg       0.99      0.99      0.99      3657
weighted avg       0.99      0.99      0.99      3657

