In [4]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the CSV file
df = pd.read_csv('data.csv', header=None, names=['id','text', 'type'], skiprows=1)

# nltk.download('stopwords')
# nltk.download('punkt')

In [5]:
# Define a function to clean the text
def clean_text(text):
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation.replace('?', '')))  # remove punctuation except '?'
    text = text.lower()  # convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespaces

    # filter out the stopwords except for interrogative words
    stop_words = set(stopwords.words('english')) - set(['what', 'where', 'when', 'why', 'how', 'which', 'who'])
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Apply the function to the text column
df['text'] = df['text'].apply(clean_text)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np

# Apply TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['text'])

X_tfidf

<235110x95631 sparse matrix of type '<class 'numpy.float64'>'
	with 2055208 stored elements in Compressed Sparse Row format>

In [None]:

# Apply LSA on TF-IDF vectors
lsa = TruncatedSVD(n_components=100)
X_lsa = lsa.fit_transform(X_tfidf)



KeyboardInterrupt: 

In [None]:
# Apply LDA on TF-IDF vectors
lda = LatentDirichletAllocation(n_components=10, n_jobs = -1)
X_lda = lda.fit_transform(X_tfidf)



In [6]:
embeddings_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

def sentence_vector(sentence):
    words = sentence.split()
    word_vectors = [embeddings_index.get(word, np.zeros((100,))) for word in words]
    return np.mean(word_vectors, axis=0)

df['vector'] = df['text'].apply(sentence_vector)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [9]:
df

Unnamed: 0,id,text,type,vector
0,0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...,sentence,"[0.277592616299024, -0.09187085124162528, 0.23..."
1,1,born raised houston texas performed various si...,sentence,"[0.2768719513949595, 0.18281278406318865, 0.03..."
2,2,managed father mathew knowles group became one...,sentence,"[0.103830256, 0.11095869, 0.09941408, -0.24226..."
3,3,hiatus saw release beyoncés debut album danger...,sentence,"[0.11885730266714326, 0.11122653769472471, 0.2..."
4,4,following disbandment destinys child june rele...,sentence,"[0.06178458692092034, -0.042437670183264546, 0..."
...,...,...,...,...
235105,10562,hi teensuser,sentence,"[0.0722000002861023, 0.11989499628543854, 0.48..."
235106,10563,join,sentence,"[-0.14029, 0.41191, -0.26948, -0.43433, 0.2888..."
235107,10564,hi teensuser,sentence,"[0.0722000002861023, 0.11989499628543854, 0.48..."
235108,10565,know teensuser,sentence,"[0.0953650027513504, 0.28431999683380127, 0.36..."


In [12]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Prepare the data for Doc2Vec
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['text'].apply(lambda x: x.split(' ')))]

# Train a Doc2Vec model
model = Doc2Vec(documents, vector_size=100, window=2, min_count=1)

# Generate sentence vectors
df['doc2vec_vector'] = df['text'].apply(lambda x: model.infer_vector(x.split(' ')))

In [9]:
df

Unnamed: 0,id,text,type,vector,doc2vec_vector
0,0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...,sentence,"[0.277592616299024, -0.09187085124162528, 0.23...","[0.070307955, 0.10135071, 0.07622304, -0.12226..."
1,1,born raised houston texas performed various si...,sentence,"[0.2768719513949595, 0.18281278406318865, 0.03...","[0.21651179, -0.07581898, 0.19439001, 0.043524..."
2,2,managed father mathew knowles group became one...,sentence,"[0.103830256, 0.11095869, 0.09941408, -0.24226...","[0.15767485, -0.055452455, 0.17544344, -0.0274..."
3,3,hiatus saw release beyoncés debut album danger...,sentence,"[0.11885730266714326, 0.11122653769472471, 0.2...","[0.14156717, 0.038045965, 0.23169103, -0.05583..."
4,4,following disbandment destinys child june rele...,sentence,"[0.06178458692092034, -0.042437670183264546, 0...","[0.10060528, 0.13067545, 0.004151089, -0.02187..."
...,...,...,...,...,...
235105,10562,hi teensuser,sentence,"[0.0722000002861023, 0.11989499628543854, 0.48...","[0.0023324566, 0.0049042483, 0.0011296303, -0...."
235106,10563,join,sentence,"[-0.14029, 0.41191, -0.26948, -0.43433, 0.2888...","[0.044683177, 0.012490676, -0.006311598, 0.007..."
235107,10564,hi teensuser,sentence,"[0.0722000002861023, 0.11989499628543854, 0.48...","[-0.0029709933, 0.0066249697, -0.006486339, -0..."
235108,10565,know teensuser,sentence,"[0.0953650027513504, 0.28431999683380127, 0.36...","[-0.004434658, 0.013921657, 0.0015257725, -0.0..."


In [13]:
from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(df, random_state=42)

In [11]:
# train_df

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Convert the 'type' column to binary (1 for question, 0 for statement)
df['type'] = df['type'].apply(lambda x: 1 if x == 'question' else 0)

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['doc2vec_vector'].tolist(), df['type'], test_size=0.2, random_state=42)

# Train a logistic regression model
clf = LogisticRegression(penalty='l2', random_state=42, n_jobs=-1).fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80     20604
           1       0.83      0.88      0.85     26418

    accuracy                           0.83     47022
   macro avg       0.83      0.82      0.83     47022
weighted avg       0.83      0.83      0.83     47022



In [19]:
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['type'], test_size=0.2, random_state=42)

# Train a Random Forest model
clf = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [33]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.regularizers import l2

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
data = pad_sequences(sequences, maxlen=100)

# Prepare the labels
le = LabelEncoder()
labels = le.fit_transform(df['type'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=6, batch_size=128)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
model.save("helloworld.keras")
print('Test Loss: {}'.format(loss))
print('Test Accuracy: {}'.format(accuracy))

Epoch 1/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 97ms/step - accuracy: 0.8574 - loss: 0.3124 - val_accuracy: 0.9555 - val_loss: 0.1264
Epoch 2/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 96ms/step - accuracy: 0.9600 - loss: 0.1159 - val_accuracy: 0.9686 - val_loss: 0.0949
Epoch 3/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 95ms/step - accuracy: 0.9692 - loss: 0.0917 - val_accuracy: 0.9712 - val_loss: 0.0884
Epoch 4/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 99ms/step - accuracy: 0.9727 - loss: 0.0825 - val_accuracy: 0.9748 - val_loss: 0.0800
Epoch 5/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 98ms/step - accuracy: 0.9755 - loss: 0.0772 - val_accuracy: 0.9757 - val_loss: 0.0785
Epoch 6/6
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 102ms/step - accuracy: 0.9770 - loss: 0.0737 - val_accuracy: 0.9766 - val_loss: 0.0745
[1

In [22]:
from keras.layers import Bidirectional, Dropout

# Build the Bidirectional LSTM model with L2 regularization
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100, trainable=False))
model.add(Bidirectional(LSTM(128, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01))))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss: {}'.format(loss))
print('Test Accuracy: {}'.format(accuracy))

Epoch 1/10
[1m 151/1470[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m2:15[0m 103ms/step - accuracy: 0.6130 - loss: 5.5495

KeyboardInterrupt: 