# Predict-Closed-Question-StackOverFlow Using LSTM

## Step-1:Import libraries

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences


## Step-2:Load the dataset 

In [10]:
df = pd.read_csv("C:\\Users\\Snigdha\\New folder\\Studies\\PROJECTS\\Predict closed questions on Stack Overflow\\train-sample.csv")

## Step-3:Separate the data into input (X) and target (y) variables:

In [11]:

X = df['BodyMarkdown'].values
y = df['OpenStatus'].values

## Step-4:Encode Labels using LabelEncoder

In [13]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
binary_labels = encoder.fit_transform(y)

## Step-5:Split the data into training & testing 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,binary_labels, test_size=0.3, random_state=42)


## Step-6:Tokenize the input data using the Keras Tokenizer:

In [15]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index


## Step-7:Pad the input sequences to ensure they all have the same length:

In [16]:
maxlen = 100  # Max sequence length

X_train = pad_sequences(X_train, padding="post", maxlen=maxlen)
X_test = pad_sequences(X_test, padding="post", maxlen=maxlen)


## Step-8: LSTM model

In [17]:
embedding_dim = 50

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 50)           15342600  
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 15,372,105
Trainable params: 15,372,105
Non-trainable params: 0
_________________________________________________________________


## Step-9:Train the model

In [18]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bd08940f70>

## Step-10:Evaluate the model 

In [19]:
loss, accuracy = model.evaluate(X_test, y_test)

print("Test accuracy:", accuracy)


Test accuracy: 0.11071241647005081


## Step-11:Predictions on new data

In [20]:
new_question = "How can I solve this error in my code?"
new_question_seq = tokenizer.texts_to_sequences([new_question])
new_question_padded = pad_sequences(new_question_seq, padding="post", maxlen=maxlen)

prediction = model.predict(new_question_padded)

if prediction >= 0.5:
    print("The question is likely to be closed.")
else:
    print("The question is likely to be open.")


The question is likely to be closed.


# Accuaracy score : 11.07%