#  Importing all the dependencies

In [134]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [89]:
#  Here's a single-line description for each library/module imported:

# 1. `import numpy as np`:  NumPy for numerical computing and array operations, aliased as `np`.
# 2. `import pandas as pd`: Pandas for data manipulation and analysis, aliased as `pd`.
# 3. `import re`: Regular expressions module for pattern matching and string manipulation.
# 4. `from nltk.corpus import stopwords`: NLTK for natural language processing, importing stopwords corpus.
# 5. `from nltk.stem.porter import PorterStemmer`:  NLTK's Porter Stemmer for word stemming.
# 6. `from sklearn.feature_extraction.text import TfidfVectorizer`:  Scikit-learn's TF-IDF vectorizer for text feature extraction.
# 7. `from sklearn.model_selection import train_test_split`:  Scikit-learn's utility for splitting datasets into training and testing sets.
# 8. `from sklearn.linear_model import LogisticRegression`:  Scikit-learn's logistic regression model for classification tasks.
# 9. `from sklearn.metrics import accuracy_score`:  Scikit-learn's accuracy score metric for evaluating classification model performance.
# 10. from sklearn.svm import SVC :  Support Vector Classifier 

# Importing natural language ToolKit to download stopwords

In [91]:
import nltk  # natural language toolKit
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [92]:
print(stopwords.words('english'))
# This is the list of all english stop words present in 'stopwords'

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

 # Let's preprocess the data


In [93]:
# let's first convert the dataset into a pandas dataframe
df = pd.read_csv('train.csv')

In [94]:
# rows x cols
df.shape

(20800, 5)

In [95]:
# printing the first five rows 
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [96]:
# count the number of missing values
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [97]:
# removed all the rows having atleast one missing value
df=df.dropna()  # or df.dataframe(inplace='True') in this case it will return rather it will update the dataframe
df.shape

# Or we can replace Null values with empty string 

# df=df.fillna('')  // or df.fillna('',inplace='True')
# df.shape

(18285, 5)

In [98]:
# So for our prediction we are going to inlcude title and author and we combine them to use it as data for our prediction
# We are not using text column beacuse it large paragraph and takes too much time for processing

In [99]:
# merging the title and auhtor column together and create a new column named content

df['content'] = df['author']+' '+df['title']
df

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [100]:
# now separe label from data set
x=df.drop('label',axis=1)   # axis=1 for removing column and axis=0 for remving rows
x

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Michael J. de la Merced and Rachel Abrams Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [101]:
y=df['label']
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64

# Let's do stemming

In [102]:
# Stemming is NLP technique used to reduce words to their base or root form, called the "stem" .
# The main purpose of stemming is to normalize words so that variations of the same word are treated as identical tokens.
# Thereby improving text analysis and processing.

# In stemming, common word suffixes are removed from words to extract their root form. For example:

# "running" -> "run"
# "plays" -> "play"
# "happily" -> "happi"
# "cats" -> "cat"

# Stemming is process of reducing a word to its root word. [ actor,acting,actress -> act ]

In [103]:
port_stem=PorterStemmer()

In [104]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) # Substitute all characters that are not alphabets with a space.
    stemmed_content = stemmed_content.lower()         # Convert the content to lowercase.
    stemmed_content = stemmed_content.split()         # Split the content into list ofwords using space as delimiter.
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]  # Perform stemming using the Porter Stemmer from NLTK, while excluding stopwords.
    stemmed_content = ' '.join(stemmed_content)       # Join the stemmed words back into a single string.
    return stemmed_content
                                

In [105]:
df['content']=df['content'].apply(stemming)

In [106]:
df['content']

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 18285, dtype: object

In [107]:
x=df['content'].values
y=df['label'].values
# The .values attribute retrieves the values of the content column as a NumPy array

In [108]:
print(x)
print('----------------separator --------------')
print(y)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']
----------------separator --------------
[1 0 1 ... 0 1 1]


In [109]:
# We can perform operations on text so have to convert all these text into numbers

In [110]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(x)

x = vectorizer.transform(x)

In [111]:
print(x)

  (0, 14626)	0.2853880981846006
  (0, 12567)	0.25566372256502734
  (0, 8310)	0.3609049070394367
  (0, 8048)	0.29347549279156676
  (0, 7190)	0.24556189342497173
  (0, 6552)	0.21745594418933306
  (0, 4637)	0.23016077319140021
  (0, 3543)	0.2684494960336511
  (0, 3359)	0.3609049070394367
  (0, 2757)	0.2466340295002162
  (0, 2312)	0.3745612250433202
  (0, 247)	0.26982554594264346
  (1, 15663)	0.3053027963338981
  (1, 6377)	0.19285723710368197
  (1, 5140)	0.7119376870709988
  (1, 3328)	0.2623789770430963
  (1, 2619)	0.19368327535633711
  (1, 2066)	0.38191890436039194
  (1, 1764)	0.1509985164277699
  (1, 1391)	0.29617980713962144
  (2, 14560)	0.4180284001448272
  (2, 8973)	0.4948460479407663
  (2, 5579)	0.3490632212946542
  (2, 5031)	0.38709995799949964
  (2, 2895)	0.4581003415623782
  :	:
  (18282, 12239)	0.252743907968046
  (18282, 11515)	0.2748252773264482
  (18282, 11321)	0.24588400571511215
  (18282, 9605)	0.07665665104558947
  (18282, 8942)	0.1712955017712004
  (18282, 8879)	0.29296479

 # Splitting the dataset into train and test data

In [112]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=2)

# Logistic Rgression

In [113]:
model = LogisticRegression()

In [114]:
model.fit(X_train, Y_train)

In [115]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [116]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9901558654634947


In [117]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [118]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9827727645611156


# Support Vector Machine

In [119]:
classifier = SVC(kernel="linear") # we need a linear decision boundary
classifier.fit(X_train, Y_train)

In [120]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [121]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9978124145474433


In [122]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [123]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9945310363686082


# Decision Tree

In [126]:
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, Y_train)

In [128]:
# accuracy score on the training data
X_train_prediction = dt_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [130]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [132]:
# accuracy score on the test data
X_test_prediction = dt_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [133]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.993984140005469


# Random Forest

In [136]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, Y_train)

In [141]:
# accuracy score on the training data
X_train_prediction = rf_classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [142]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [143]:
# accuracy score on the test data
X_test_prediction = rf_classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [144]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9931637954607602


#  Accuracy on test data :
# ---------------------------------

#  Logistic Regression:         0.9827
#  Support Vector Machine:  0.9945
#  Decision Tree:                    0.9939
#  Random Forest:                 0.9931