Dataset Loading

In [1]:
import pandas as pd

# Read the training and test CSV files from the Colab working directory.
train_df, test_df = map(pd.read_csv, ("/content/train.csv", "/content/test.csv"))

# Preview the first rows of each frame, with a divider line between the two.
print(
    train_df.head(),
    "*******************************************************",
    test_df.head(),
    sep="\n",
)


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  
*******************************************************
      id                                              title  \
0  20800  Specter of Trump L

Dataset Description

In [2]:
# Summary statistics (count, mean, std, quartiles) for the training frame's numeric columns.
train_df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [3]:
# Summary statistics (count, mean, std, quartiles) for the test frame's numeric columns.
test_df.describe()

Unnamed: 0,id
count,5200.0
mean,23399.5
std,1501.255031
min,20800.0
25%,22099.75
50%,23399.5
75%,24699.25
max,25999.0


Data Preprocessing

In [4]:
# HANDLING MISSING VALUES

# Per-column null counts for both frames, taken before anything is modified.
print(train_df.isna().sum(), test_df.isna().sum(), sep="\n")

# Training rows with a null in any column are removed in place.
# NOTE(review): this also discards rows whose only gap is `title` or `author`,
# while the cells visible here only consume `text` -- confirm that is intended.
train_df.dropna(inplace=True)

# Test rows are all kept; their nulls are replaced with empty strings in place.
test_df.fillna('', inplace=True)

id           0
title      558
author    1957
text        39
label        0
dtype: int64
id          0
title     122
author    503
text        7
dtype: int64


In [5]:
# Re-count nulls per column after the missing-value handling; all counts should be 0.
print(train_df.isna().sum(), test_df.isna().sum(), sep="\n")

id        0
title     0
author    0
text      0
label     0
dtype: int64
id        0
title     0
author    0
text      0
dtype: int64


In [6]:
pip install nltk



In [7]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources needed by clean_text. `punkt_tab` is requested in
# addition to `punkt` because recent NLTK releases load the tokenizer data from
# `punkt_tab`, and word_tokenize raises LookupError there if only `punkt` is
# present. nltk.download reports a failure for an unavailable package instead
# of raising, so the extra request is harmless on older setups.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words held in a set for fast membership tests during cleaning.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw document into a space-joined string of lowercase tokens.

    Non-word characters are replaced by spaces, runs of digits are deleted, the
    result is lowercased and tokenised with ``word_tokenize``, and every token
    found in the module-level ``stop_words`` set is dropped.
    """
    without_punct = re.sub(r'\W', ' ', text)
    without_digits = re.sub(r'\d+', '', without_punct)
    tokens = word_tokenize(without_digits.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

# Add a `cleaned_text` column to both frames by running clean_text on every `text` value.
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
#Tokenization and Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 5000 terms, fitted on every
# cleaned training document and then applied to the test documents.
# NOTE(review): `X_train` is rebound by the train_test_split cell that follows,
# and `tfidf_vectorizer` is re-created in the cell after that, so these dense
# matrices look unused in the cells visible here -- confirm before relying on
# or removing them.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
X_train = train_tfidf.toarray()

test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_text'])
X_test = test_tfidf.toarray()


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned training documents (with their labels) for
# validation; the fixed random_state keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['cleaned_text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)


In [10]:
# Vectorize the text data using TF-IDF.
# The vocabulary (capped at 5000 terms) and IDF weights are learned from the
# training split only; validation and test documents are transformed with them.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_train_vec = train_tfidf.toarray()

X_val_vec, X_test_vec = (
    tfidf_vectorizer.transform(docs).toarray()
    for docs in (X_val, test_df['cleaned_text'])
)

2. MODEL BUILDING

(I) LOGISTIC REGRESSION MODEL

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fit logistic regression on the TF-IDF features of the training split.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_vec, y_train)

# Predict labels for the held-out validation split.
y_val_pred_logistic = logistic_model.predict(X_val_vec)

# Compute the validation metrics first, then print them under their headings.
logistic_report = classification_report(y_val, y_val_pred_logistic)
logistic_confusion = confusion_matrix(y_val, y_val_pred_logistic)
logistic_accuracy = accuracy_score(y_val, y_val_pred_logistic)

print("Logistic Regression Model - Validation Set")
print(logistic_report)
print("Confusion Matrix:")
print(logistic_confusion)
print(f'Accuracy: {logistic_accuracy}')


Logistic Regression Model - Validation Set
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2082
           1       0.94      0.92      0.93      1575

    accuracy                           0.94      3657
   macro avg       0.94      0.94      0.94      3657
weighted avg       0.94      0.94      0.94      3657

Confusion Matrix:
[[1991   91]
 [ 119 1456]]
Accuracy: 0.9425758818703855


(II) LSTM MODEL

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [13]:
# Build the word index from the training split only (num_words=5000 caps the
# indices emitted by texts_to_sequences).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Turn each split into integer sequences with the fitted tokenizer.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(docs)
    for docs in (X_train, X_val, test_df['cleaned_text'])
)

# Pad or truncate every sequence to a fixed length of 100 tokens.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=100)
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)

In [18]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable when the
# notebook is restarted and re-run. 42 matches the random_state used for the
# train/validation split.
tf.keras.utils.set_random_seed(42)

# Build the LSTM model:
#   - Embedding: input_dim=5000 matches the Tokenizer's num_words, and
#     input_length=100 matches the padded sequence length.
#   - LSTM: 128 units with dropout on both inputs and recurrent connections.
#   - Dense: a single sigmoid unit producing a probability for the binary label.
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, reporting validation loss/accuracy after each epoch.
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val))

# Make predictions on the validation data: threshold the sigmoid output at 0.5
# to obtain hard 0/1 labels.
y_val_pred_lstm = (lstm_model.predict(X_val_pad) > 0.5).astype("int32")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Compute the LSTM's validation metrics first, then print them under their headings.
lstm_report = classification_report(y_val, y_val_pred_lstm)
lstm_confusion = confusion_matrix(y_val, y_val_pred_lstm)
lstm_accuracy = accuracy_score(y_val, y_val_pred_lstm)

print("LSTM Model - Validation Set")
print(lstm_report)
print("Confusion Matrix:")
print(lstm_confusion)
print(f'Accuracy: {lstm_accuracy}')

LSTM Model - Validation Set
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2082
           1       0.93      0.89      0.91      1575

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657

Confusion Matrix:
[[1972  110]
 [ 178 1397]]
Accuracy: 0.9212469237079574
