In [1]:
#import necessary libraries
import numpy as np
import pandas as pd


In [2]:
#load the dataset from 'train.csv' and 'test.csv'
train = pd.read_csv('train.csv') 
test = pd.read_csv('test.csv')

Preparing Data

In [3]:
# Display the first five rows of the DataFrame to get a quick overview of the data
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [5]:
#checking the total number of rows and columns in the dataset
print(train.shape, test.shape)

(20800, 5) (5200, 4)


<h3>Dropping Duplicate Rows</h3>
in machine learning, duplicates can cause models to be biased towards the overrepresented data, potentially leading to overfitting.
Duplicate rows increase the volume of data to be processed, which can unnecessarily consume computational resources and time. Removing duplicates can make data processing more efficient and faster, especially with large datasets.

In [6]:
#checkhing the number of duplicate rows in train dataset
duplicate_rows = train[train.duplicated()]
print("number of duplicate rows: ", duplicate_rows.shape)

number of duplicate rows:  (0, 5)


In [7]:
#dropping the duplicate rows
train = train.drop_duplicates()
train.shape

(20800, 5)

<h3>Replacing Null Values</h3> 
In the context of pandas DataFrames, when a data type of object, it generally means that the column contains text data, or more specifically, data that is treated as Python strings. It can also include more general Python objects, but in most cases involving real-world datasets, especially with columns labeled title, author, and text, the object dtype signifies that these columns are storing strings. So, the missing values can be replaced with empty string.
For textual data, replacing nulls with empty strings might be more suitable than dropping rows containing null values.

In [8]:
#Checking the number of missing or null values in each column
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
#Checking the types of data
train.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [10]:
#doing the same for test dataset
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [11]:
test.dtypes

id         int64
title     object
author    object
text      object
dtype: object

In [12]:
#Replace null values with the empty string
train['title'] = train['title'].fillna('')
train['author'] = train['author'].fillna('')
train['text'] = train['text'].fillna('')

test['title'] = test['title'].fillna('')
test['author'] = test['author'].fillna('')
test['text'] = test['text'].fillna('')

<h3>Preprocessing and Analysis of News Column</h3>
This step will:
1 remove all the stopwords, punctuations and any irrelevant spaces from the text.
2 convert each letter to lowercase.
3 convert each word to its root form which is known as stemming. 
For that NLTK Library is required and some of it’s module need to be downloaded.

In [15]:
from tqdm import tqdm 
import re 
import nltk 
nltk.download('punkt') 
nltk.download('stopwords') 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nick\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
#function for preprocessing text data
def preprocess_text(text_data):
    preprocessed_text = []
    porter_stemmer = PorterStemmer()

    for sentence in tqdm(text_data):
        # Remove non-alphanumeric characters
        sentence = re.sub(r'[^\w\s]', '', sentence)
        
        # Tokenize, remove stopwords, and apply stemming
        processed_tokens = [porter_stemmer.stem(token.lower())
                            for token in word_tokenize(sentence)
                            if token not in stopwords.words('english')]
        preprocessed_text.append(' '.join(processed_tokens))

    return preprocessed_text

In [17]:
# merging the author name and news title and text
train['news'] = train['title']+' '+train['author']+' '+train['text']
test['news'] = test['title']+' '+test['author']+' '+test['text']

In [18]:
#apply preprocessing to the train and test data
preprocessed_train = preprocess_text(train['news'].values) 
train['news'] = preprocessed_train
preprocessed_test = preprocess_text(test['news'].values) 
test['news'] = preprocessed_test

100%|██████████| 20800/20800 [56:59<00:00,  6.08it/s]  
100%|██████████| 5200/5200 [14:18<00:00,  6.06it/s]


<h3>Converting Text into Vectors</h3>
The conversion of text features into vectors, a process known as vectorization, is 
fundamental in the field of natural machine learning. Most machine learning algorithms and 
models are designed to work with numerical data. They perform mathematical operations on the input data to make predictions or classifications. Since text data is inherently non-numeric (comprising words, sentences, etc.), it must be converted into a numerical format 
that these algorithms can process. One of the vectorization technique is Tf-Idf, which I will use here.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer 
vectorization = TfidfVectorizer() 

x_train = vectorization.fit_transform(train['news']) 
x_test = vectorization.transform(test['news'])

In [20]:
#training a Logistic Regression model on the training data and labels
from sklearn.linear_model import LogisticRegression 
  
model = LogisticRegression() 
model.fit(x_train, train['label']) 

In [21]:
#predicting for test data
pred = model.predict(x_test)
pred

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [22]:
#creating 'label' column for test dataframe
test['label'] = pred
test

Unnamed: 0,id,title,author,text,news,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",specter trump loosen tongu not purs string sil...,0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,russian warship readi strike terrorist near al...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,nodapl nativ american leader vow stay all wint...,1
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",tim tebow will attempt anoth comeback thi time...,0
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,keiser report meme war e995 truth broadcast ne...,1
...,...,...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,the bangladeshi traffic jam that never end the...,0
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,john kasich sign one abort bill ohio veto more...,0
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,california today what exactli is your sushi th...,0
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,300 us marin to be deploy to russian border in...,1


In [23]:
#dropping the columns except 'id' and 'label'
test.drop(['title','author','text','news'],axis='columns',inplace=True)

In [24]:
# Save the updated test DataFrame as 'submit.csv' to a new CSV file
test.to_csv("submission.csv", index=False)
test.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
