In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import re
# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

file_path = '/content/drive/MyDrive/train.csv'
df = pd.read_csv(file_path)


In [6]:
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [7]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [8]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# Remove unnecessary columns
df = df.drop(['id', 'author'], axis=1)

# Remove any rows with missing values
df = df.dropna()

In [10]:
# Preprocessing steps
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    # Convert text to lowercase

    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    processed_text = ' '.join(tokens)

    return processed_text

# Apply preprocessing to the 'text' column
df['text'] = df['text'].apply(preprocess_text)


In [None]:
!pip install --upgrade tensorflow
!pip install --upgrade keras


In [11]:
df

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,house dem aide didnt even see comeys letter ja...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",ever get feeling life circle roundabout rather...,0
2,Why the Truth Might Get You Fired,truth might get fired october tension intellig...,1
3,15 Civilians Killed In Single US Airstrike Hav...,video civilian killed single u airstrike ident...,1
4,Iranian woman jailed for fictional unpublished...,print iranian woman sentenced six year prison ...,1
...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,rapper unloaded black celebrity met donald tru...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",green bay packer lost washington redskin week ...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,macys today grew union several great name amer...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",nato russia hold parallel exercise balkan pres...,1


In [14]:
# Take a small subset of the DataFrame
subset_df = df.sample(frac=0.1, random_state=42)

# Split the data into features (X) and labels (y)
X = subset_df['text'].values
y = subset_df['label'].values

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Pad sequences
max_sequence_length = max([len(seq) for seq in X])
X = pad_sequences(X, maxlen=max_sequence_length)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
embedding_dim = 100
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 3
batch_size = 32
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

# Save the trained model
model.save('fake_news_classifier_model.h5')




Epoch 1/3
Epoch 2/3

KeyboardInterrupt: ignored

In [16]:
model.save('fake_news_classifier_model.h5')

In [27]:
test_data = pd.read_csv('/test.csv')

In [28]:
test_data

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [23]:
test_data.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [29]:
# Remove unnecessary columns
test_data = test_data.drop(['author'], axis=1)

# Remove any rows with missing values
test_data = test_data.dropna()

In [30]:
# Preprocess the test data
test_sequences = tokenizer.texts_to_sequences(test_data['text'].values)
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Predict the labels for the test data
y_pred = saved_model.predict(test_sequences)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions (0 or 1)

# Decode the predicted labels
predicted_labels = label_encoder.inverse_transform(y_pred.flatten())

# Print the predicted labels
print("Predicted labels:", predicted_labels)

Predicted labels: [0 1 0 ... 0 1 0]


In [31]:
predicted_labels

array([0, 1, 0, ..., 0, 1, 0])

In [None]:
test_data['predicted_labels'] = predicted_labels

In [33]:
test_data

Unnamed: 0,id,title,text,predicted_labels
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...","PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...","If at first you don’t succeed, try a different...",0
4,20804,Keiser Report: Meme Wars (E995),42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Of all the dysfunctions that plague the world’...,0
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,WASHINGTON — Gov. John Kasich of Ohio on Tu...,0
5197,25997,"California Today: What, Exactly, Is in Your Su...",Good morning. (Want to get California Today by...,0
5198,25998,300 US Marines To Be Deployed To Russian Borde...,« Previous - Next » 300 US Marines To Be Deplo...,1


In [34]:
df1 = pd.read_csv('/submit (1).csv')

In [35]:
df1

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1
...,...,...
5195,25995,0
5196,25996,1
5197,25997,0
5198,25998,1


In [42]:
d = {}
for index, row in df1.iterrows():
  if row['id'] not in d:
    d[row['id']] = row['label']


In [45]:
clabels = 0
wlabel = 0
for index, row in test_data.iterrows():
  if row['id'] in d:
    if d[row['id']] == row['predicted_labels']:
      clabels += 1
    else:
      wlabel += 1
print(clabels, wlabel)

3209 1862


In [46]:
print(3209/(3209+1862))

0.6328140406231513
