### Importing Modules

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split  # for cross-validation
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Importing Dataset

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.shape

(20800, 5)

### Feature Engineering

In [5]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
df=df.dropna()

In [7]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [8]:
df.shape

(18285, 5)

In [9]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [10]:
## Get the Dependent features
y=df['label']

In [11]:
### Vocabulary size
voc_size=5000

In [12]:
messages=X.copy()
messages['title'][1]
messages.reset_index(inplace=True)

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Stemming

In [14]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

### One-Hot-Encoding

In [15]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[2557, 1089, 95, 3300, 2063, 161, 4720, 3941, 3422, 1135],
 [2688, 486, 733, 3781, 4698, 1107, 602],
 [2001, 1657, 2859, 924],
 [4108, 2444, 4730, 1383, 2837, 4984],
 [2147, 4698, 1890, 820, 2929, 4341, 4698, 1587, 4, 740],
 [293,
  4408,
  4134,
  1503,
  3215,
  1330,
  3659,
  3361,
  575,
  898,
  1240,
  2947,
  1072,
  3293,
  602],
 [1897, 721, 2837, 2608, 3352, 1773, 1105, 4123, 3279, 1486, 92],
 [2262, 355, 2961, 2952, 715, 2766, 1330, 2764, 3279, 1486, 92],
 [4514, 3608, 2152, 2853, 4322, 3211, 3291, 2223, 1330, 4395],
 [4697, 2333, 97, 7, 4890, 3388, 2657, 419],
 [2015, 3195, 2631, 2139, 1505, 338, 361, 3582, 4753, 1383, 1343],
 [1383, 111, 2063, 3211, 1330, 715],
 [2036, 2387, 1182, 3388, 4154, 4283, 1455, 2947, 1165],
 [727, 4901, 1850, 3349, 3547, 3926, 1088, 3279, 1486, 92],
 [1757, 2601, 1876, 4627, 1292, 3279, 1486, 92],
 [3054, 4589, 3807, 3710, 2491, 1383, 1671, 186, 2399, 4321],
 [2092, 2071, 486],
 [2848, 3249, 1611, 4839, 1330, 1714, 2568, 602],
 [4253, 2924, 733

In [16]:
corpus[1]

'flynn hillari clinton big woman campu breitbart'

### Embedding Layer

In [17]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[2557 1089   95 ...    0    0    0]
 [2688  486  733 ...    0    0    0]
 [2001 1657 2859 ...    0    0    0]
 ...
 [ 784 2215 1796 ...    0    0    0]
 [4141  715 1683 ...    0    0    0]
 [1217 4883 3998 ...    0    0    0]]


In [18]:
onehot_repr[0]

[2557, 1089, 95, 3300, 2063, 161, 4720, 3941, 3422, 1135]

In [19]:
embedded_docs[1]

array([2688,  486,  733, 3781, 4698, 1107,  602,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

### Creating a Neural Network

In [20]:
## Creating model
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size+1,embedding_vector_features, input_shape=(20,)))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

  super().__init__(**kwargs)


None


In [21]:
len(embedded_docs),y.shape

(18285, (18285,))

In [22]:
model.summary()

### Creating our Data an array

In [23]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [24]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

### Train Test Split and Training the Data

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)


In [26]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.7838 - loss: 0.4058 - val_accuracy: 0.9152 - val_loss: 0.2124
Epoch 2/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.9517 - loss: 0.1311 - val_accuracy: 0.9139 - val_loss: 0.2028
Epoch 3/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.9624 - loss: 0.1036 - val_accuracy: 0.9234 - val_loss: 0.2118
Epoch 4/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.9764 - loss: 0.0699 - val_accuracy: 0.9171 - val_loss: 0.2521
Epoch 5/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.9813 - loss: 0.0566 - val_accuracy: 0.9040 - val_loss: 0.2981
Epoch 6/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.9864 - loss: 0.0430 - val_accuracy: 0.9051 - val_loss: 0.3340
Epoch 7/10
[1m229/229

<keras.src.callbacks.history.History at 0x223117bc590>

### predicting the model

In [27]:
y_pred=model.predict(X_test)

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


In [28]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

### Accuracy and Confusion metrics

In [29]:
from sklearn.metrics import confusion_matrix

In [30]:
confusion_matrix(y_test,y_pred)

array([[1868,  214],
       [ 135, 1440]], dtype=int64)

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9045665846322122

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91      2082
           1       0.87      0.91      0.89      1575

    accuracy                           0.90      3657
   macro avg       0.90      0.91      0.90      3657
weighted avg       0.91      0.90      0.90      3657



### Importing the data which we have to Predict

In [2]:
import pandas as pd
test= pd.read_csv('test.csv')

### Filling missing values with another object

In [3]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [55]:
test = test.fillna("missing")

In [57]:
messages_test=test.copy()

In [64]:
messages_test['title'][9]

'missing'

In [79]:
test.head(5200)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,missing,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,missing,« Previous - Next » 300 US Marines To Be Deplo...


### Stemming

In [82]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer ##stemming purpose
ps = PorterStemmer()
corpus = []
for i in range(0,5199):
    review = re.sub('[^a-zA-Z]', ' ', messages_test['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [83]:
corpus[1]

'russian warship readi strike terrorist near aleppo'

### One hot Encoding

In [84]:
onehot_repr_test=[one_hot(words,voc_size)for words in corpus] 
onehot_repr_test

[[4486, 1330, 36, 434, 2031, 1333, 2513, 1240, 3279, 1486, 92],
 [2015, 4335, 576, 325, 4172, 3112, 2815],
 [4380, 4770, 490, 2894, 1517, 856, 3283, 2229, 1407, 3382],
 [1559, 4817, 1903, 1021, 462, 92, 3479, 3279, 1486, 92],
 [867, 2597, 1344, 1967, 3337],
 [1330, 1966, 4900, 2752, 733, 2212, 3044],
 [3608, 3136, 2092, 1269, 3547, 2015, 4839, 1330, 602],
 [2148, 1229, 3581, 2707, 4783],
 [367, 597, 4009, 1155, 1544, 3546, 1721],
 [404],
 [3621, 448, 4981, 4482, 1330, 2126, 1876],
 [1836, 1377, 2752, 4729, 439],
 [713, 4505, 2475, 1164, 1929, 1614, 4505],
 [3796, 1154, 4960, 2170],
 [4551, 2238, 2446, 1976, 129, 2387, 1217, 4027, 2385, 1330, 1941, 1966, 4281],
 [4997, 2549, 3879, 3546, 65, 4580, 3042, 3279, 1486, 92],
 [3279,
  2198,
  2044,
  4601,
  1307,
  999,
  4924,
  4883,
  4360,
  3314,
  3219,
  3279,
  1486,
  92],
 [623, 2485, 784, 2936, 4657, 3012, 1083, 480, 602],
 [1330, 4431, 2663, 1383, 4435, 4806, 4185, 1985, 733],
 [4386, 2301, 2200, 1140, 1869, 2403, 3659, 3926, 327

### Embedding Layers

In [85]:
sent_length=20
embedded_docs_test=pad_sequences(onehot_repr_test,padding='post',maxlen=sent_length)
print(embedded_docs_test)

[[4486 1330   36 ...    0    0    0]
 [2015 4335  576 ...    0    0    0]
 [4380 4770  490 ...    0    0    0]
 ...
 [4729 2381 3077 ...    0    0    0]
 [2464 2711 3554 ...    0    0    0]
 [1383 4459 4348 ...    0    0    0]]


### Now Predicting The data

In [86]:
y_pred_test=model.predict(embedded_docs_test)

[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step


In [87]:
y_pred_test=np.where(y_pred_test > 0.6, 1,0) ##AUC ROC Curve

In [88]:
y_pred_test

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [1]])

### Saving y_pred_test it as submit_test.csv

In [89]:
np.savetxt("Submit_test.csv", y_pred_test, delimiter=",")