In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier

In [None]:
# Path to the input CSV; /content is presumably the Colab working directory —
# adjust this single constant when running elsewhere.
DATA_PATH = '/content/data.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# Start by checking the non-null count of every column in the data set.
# If a column's non-null count is lower than the total number of entries
# reported on the RangeIndex line, that column contains missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


In [None]:
# List the column names available in the data set.
df.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [None]:
# Dimensions of the raw data set as (rows, columns).
df.shape

(4009, 4)

In [None]:
# Summary statistics. By default describe() only covers numeric columns, so
# only Label is reported; its mean is the share of rows labelled 1.
df.describe()

Unnamed: 0,Label
count,4009.0
mean,0.466949
std,0.498969
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
from numpy import nan


In [None]:
# Flag which columns contain at least one missing value.
df.isna().any()

URLs        False
Headline    False
Body         True
Label       False
dtype: bool

In [None]:
# Count the missing values per column (isnull() is an alias of isna()).
df.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [None]:
# Drop the URL column (it is not used as a feature) and discard rows with
# missing values (only Body has any).
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because 'URLs' has already been removed from df.
df = df.drop(columns=['URLs'], errors='ignore').dropna()

In [None]:
# Features are the article body text; the target is the integer Label column.
X = df["Body"]
y = df["Label"]

In [None]:
# Train/test separation: hold out 20% of the articles for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split and the
# metrics computed from it are reproducible on a fresh kernel run.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)


<b>Note:</b> Machine learning algorithms cannot work with raw text directly. Rather, the text must be converted into vectors of numbers.

## TF-IDF

TF-IDF gives us a way to associate each word in a document with a number that represents how relevant that word is in that document. Documents with similar, relevant words will then have similar vectors, which is what we are looking for in a machine learning algorithm.

In [None]:
# Apply TF-IDF to the data set, ignoring common English stop words.
tfidf_vect = TfidfVectorizer(stop_words = 'english')
# Learn the vocabulary and IDF weights from the training split only, then reuse
# them for the test split so no test-set information leaks into the features.
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)
# Dense, labelled copy of the training features for inspection.
# .toarray() is the documented sparse-to-dense conversion; the `.A` shorthand
# used previously is deprecated and not reliable across SciPy versions.
# NOTE: this materialises a (documents x vocabulary) dense matrix, which can be large.
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vect.get_feature_names_out())


In [None]:
# Apply Naive Bayes: fit a multinomial Naive Bayes classifier on the TF-IDF
# training features and their labels.
clf = MultinomialNB()
# fit() returns the estimator itself; as the last expression of the cell it is
# displayed together with its parameters.
clf.fit(tfidf_train, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict a label for every document in the held-out test split.
pred = clf.predict(tfidf_test)

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)


[[371  28]
 [ 15 384]]


In [None]:
# Fraction of test documents whose predicted label matches the true label.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy:   %0.3f" % (score,))

accuracy:   0.946
