In [None]:
# import the dependencies
# re - Python's regular-expression module, used for searching and replacing text patterns
# nltk - natural language toolkit
# Stemming reduces a word to its base (stem) form by stripping its suffixes
# TfidfVectorizer converts text into numeric feature vectors
# stopwords are words that add little meaning to the text, e.g. articles and prepositions ("the", "a", "at")


import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# download NLTK's 'stopwords' corpus so that stopwords.words(...) can be used in later cells

import nltk
nltk.download('stopwords')

In [None]:
# display the English stopword list: common words that add little meaning to a sentence

stopwords.words('english')

In [None]:
# load the training data from CSV and show its (rows, columns) shape

news_data = pd.read_csv('./train.csv/train.csv')
news_data.shape

In [None]:
# preview the first few rows of the loaded data
news_data.head()

In [None]:
# count how many rows carry each label value
news_data['label'].value_counts()

In [None]:
# counting how many times each distinct title occurs (left commented out)
# news_data['title'].value_counts()

In [None]:
# count the missing values in each column of the dataset
news_data.isnull().sum()

In [None]:
# replace the null values in every column with empty strings

news_data = news_data.fillna('')


In [None]:
# re-check: after the fill, every column should now report zero missing values
news_data.isnull().sum()

In [None]:
# Merge author and title into a single text column; this is the text that gets stemmed further down.
# The full article text is left out because it is too large.
# The new column is called content: author, then a single space, then title.

news_data['content'] = news_data['author'] +' '+ news_data['title']
news_data['content']

In [None]:
# shape of the frame after adding the content column
news_data.shape

In [None]:
# preview the first few rows again, now including the content column
news_data.head()

In [None]:
# split the frame into the label column (targets) and every other column (inputs)

y = news_data['label']
X = news_data.drop(columns=['label'])
X, y

In [None]:
# shape of the input frame (every column except label)
X.shape

In [None]:
# stemming - reducing a word to its base (stem) form by stripping its suffixes
# example: acting, acted, acts --> act
# create the Porter stemmer instance used by the stemming function

port_stem = PorterStemmer()




In [None]:
# Writing a function for the stemming
# content is the text value passed into the function
# The first line uses re.sub:
# re is the regular-expression module, and re.sub replaces every character that is not a letter (a-z or A-Z) with a space.
# Punctuation, digits and other symbols are therefore removed, leaving only letters.
# The substitution is applied to the content passed into the function.
# The .lower method converts all the letters to lowercase
# The .split method breaks the text into a list of words
# Each word that is not an English stopword is then stemmed, and the stems are joined back into one space-separated string

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() re-reads the corpus and returns a list on every call;
        # caching it as a frozenset gives one load and O(1) membership tests.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Normalise a piece of text into a string of stemmed, non-stopword tokens.

    Steps: replace every non-letter character with a space, lowercase the
    result, split it on whitespace, drop English stopwords, stem each remaining
    word with the module-level ``port_stem``, and join the stems with single
    spaces.

    Args:
        content: The text to process. Must be a ``str``; ``re.sub`` raises
            ``TypeError`` for other types.

    Returns:
        A single space-separated string of stems; ``''`` when no word survives.
    """
    stop_words = _english_stopwords()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)


In [None]:
# Run stemming over every entry of the content column and store the result back in that same column

news_data['content'] = news_data['content'].map(stemming)


In [None]:
# print the content column after stemming
print(news_data['content'])