### This model has been built using logistic regression

**importing the dependencies**

In [13]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk

**downloading the stopwords**

In [14]:
# Fetch the NLTK stop-word corpus used by stemming(); when the package is
# already installed locally this only reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/Archie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**data preprocessing**

In [15]:
# Load the labelled training set from train.csv in the current working directory.
news_data = pd.read_csv('./train.csv')

In [16]:
# (rows, columns) of the raw frame.
news_data.shape

(20800, 5)

In [17]:
# Number of missing values in each column.
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

**replacing the missing values with empty strings**

In [18]:
# Replace every NaN (in all columns) with '' so that concatenating
# title/author/text afterwards never produces NaN.
# Note: this rebinds news_data, so the raw frame is no longer available.
news_data = news_data.fillna('')

**merging the title, author name and news text**

In [19]:
# One free-text field per article: "<title> <author> <text>".
news_data['content'] = (
    news_data['title']
    + ' ' + news_data['author']
    + ' ' + news_data['text']
)

**splitting the data and label**

In [20]:
# Separate the target column from the remaining columns.
# NOTE: both X and Y are rebound further down (from the stemmed 'content'
# column and 'label' respectively) before either of them is used.
Y = news_data['label']
X = news_data.drop(columns=['label'])

**stemming the data**

In [21]:
# Single Porter stemmer instance shared by every call to stemming().
port_stem = PorterStemmer()

In [22]:
# Compiled once; matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Holds the English stop-word set after the first call to _english_stopwords().
_stopword_cache = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built on first use."""
    if 'english' not in _stopword_cache:
        _stopword_cache['english'] = frozenset(stopwords.words('english'))
    return _stopword_cache['english']


def stemming(content):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, Porter-stem what is left.

    Parameters
    ----------
    content : str
        Raw text of one document.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ('' when no
        token survives).
    """
    # stopwords.words('english') builds a fresh list on every call; looking it
    # up once and testing membership against a set avoids repeating that work
    # (plus a linear list scan) for every single token of every document.
    stop_words = _english_stopwords()

    letters_only = _NON_LETTERS.sub(' ', content)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        port_stem.stem(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(stemmed_tokens)

In [23]:
# Overwrite 'content' with its stemmed form, one document at a time.
# Re-running this cell stems the already-stemmed text again; re-run the
# cell that builds 'content' first if a clean result is needed.
news_data['content'] = news_data['content'].apply(stemming)

In [26]:
# Model inputs as arrays: X holds the stemmed documents, Y the labels.
X, Y = news_data['content'].values, news_data['label'].values

**converting the text into numerical data**

In [29]:
# Turn the stemmed documents into a TF-IDF document-term matrix.
# fit_transform learns the vocabulary/IDF weights and produces the matrix in
# one pass; it is equivalent to fit(X) followed by transform(X).
#
# NOTE(review): the vectorizer is fitted on every row, including rows that the
# train/test split below assigns to the test set, so the test documents
# influence the features (data leakage) and the reported test accuracy is
# likely optimistic. Fixing this means splitting the raw text first and
# fitting only on the training part.
# This cell also rebinds X from raw text to the matrix, so it cannot be re-run
# on its own without re-running the cell that defines X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

**splitting the data into training and test sets**

In [33]:
# Hold out 20% of the rows for testing; stratify on Y so both parts keep the
# same label proportions, and fix the seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

**training the model**

In [34]:
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [35]:
# Fit the classifier on the training split only.
model.fit(X_train,Y_train) 

**accuracy on training data**

In [36]:
# Predict labels for the same rows the model was fitted on; this measures how
# well the model fits the training data, not how well it generalises.
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [37]:
# Report the fraction of training rows that were classified correctly.
print(f"training data accuracy: {training_data_accuracy}")

training data accuracy: 0.9785456730769231


**accuracy on test data**

In [38]:
# Predict labels for the held-out rows that were not used to fit the model.
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [39]:
# Report the fraction of held-out rows that were classified correctly.
print(f"test data accuracy: {test_data_accuracy}")

test data accuracy: 0.9526442307692308
