In [23]:
import pandas as pd

# Read the .txt file where each line is a separate entry
df = pd.read_csv('training.txt', delimiter='\t', header=None, names=['Label', 'Text'])


# Display the DataFrame
print(df)


      Label  \
0         1   
1         1   
2         1   
3         1   
4         1   
...     ...   
6913      0   
6914      0   
6915      0   
6916      0   
6917      0   

                                                                                                                              Text  
0                                                                                          The Da Vinci Code book is just awesome.  
1     this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.  
2                                                                                                 i liked the Da Vinci Code a lot.  
3                                                                                                 i liked the Da Vinci Code a lot.  
4                                                         I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.  
...                  

In [24]:
pd.set_option('max_colwidth', 800)


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
count_vectorizer=CountVectorizer()
#now create dictionary
feature_ext=count_vectorizer.fit(df['Text'])

In [27]:
features=feature_ext.get_feature_names_out()

In [30]:
len(features)

2132

In [32]:
import random
random.sample(list(features),10)

['barnyard',
 '7th',
 '007',
 'suppose',
 'thank',
 'jesus',
 'empty',
 'done',
 'donkey',
 'finshed']

In [42]:
#creating count vectors from documents in dataset
train_df_features=count_vectorizer.transform(df['Text'])

In [36]:
train_df_features

<6918x2132 sparse matrix of type '<class 'numpy.int64'>'
	with 65398 stored elements in Compressed Sparse Row format>

In [43]:
#creating dataframe of all count vectors and assigning features or all unique words as columns header
train_ds=pd.DataFrame(train_df_features.todense())
train_ds.columns=features

In [38]:
df[0:1]

Unnamed: 0,Label,Text
0,1,The Da Vinci Code book is just awesome.


In [41]:
train_ds.iloc[0:1,150:157]

Unnamed: 0,away,awesome,awesomely,awesomeness,awesomest,awful,awkward
0,0,1,0,0,0,0,0


In [44]:
#removing low frequency words or features
import numpy as np
features_count=np.sum(train_df_features.toarray(),axis=0)

In [46]:
features_count=pd.DataFrame(dict(features=features,counts=features_count))

In [52]:
len(features_count[features_count.counts==1])

1228

In [54]:
count_vectorizer=CountVectorizer()
#now create dictionary
feature_ext=count_vectorizer.fit(df['Text'])
features=feature_ext.get_feature_names_out()
train_df_features=count_vectorizer.transform(df['Text'])
train_ds=pd.DataFrame(train_df_features.todense())
train_ds.columns=features

In [57]:
import numpy as np
features_count=np.sum(train_df_features.toarray(),axis=0)
features_count=pd.DataFrame(dict(features=features,counts=features_count))
features_count.sort_values('counts',ascending = False)[0:15]

Unnamed: 0,features,counts
1864,the,3306
93,and,2154
864,harry,2093
1466,potter,2093
355,code,2002
2009,vinci,2001
442,da,2001
1272,mountain,2000
259,brokeback,2000
1171,love,1624


In [79]:
#removing stopwords
from sklearn.feature_extraction import text 
my_stop_words=text.ENGLISH_STOP_WORDS
my_stop_words = list(text.ENGLISH_STOP_WORDS.union(['harry', 'potter',
'code', 'vinci', 'da','harry', 'mountain', 'movie', 'movies']))


In [80]:
count_vectorizer=CountVectorizer(stop_words=my_stop_words,max_features=1000)
#now create dictionary
feature_ext=count_vectorizer.fit(df['Text'])
features=feature_ext.get_feature_names_out()
train_df_features=count_vectorizer.transform(df['Text'])
train_ds=pd.DataFrame(train_df_features.todense())
train_ds.columns=features

In [81]:
feature_counts=np.sum(train_df_features.toarray(),axis=0)
feature_counts=pd.DataFrame(dict(features=features,counts=feature_counts))
feature_counts.sort_values('counts',ascending = False)[0:15]

Unnamed: 0,features,counts
73,brokeback,2000
408,love,1624
39,awesome,1127
436,mission,1094
341,impossible,1093
390,like,974
745,sucks,602
743,sucked,600
297,hate,578
652,really,374


In [85]:
#stemming and stopword removal
from nltk.stem.snowball import PorterStemmer
stemmer=PorterStemmer()
analyzer=CountVectorizer().build_analyzer()
def stemmed_word(doc):
    #stemming word
    stemmed_words=[stemmer.stem(w) for w in analyzer(doc)]
    # remove stopwords
    non_stop=[word for word in stemmed_words if word not in my_stop_words]
    return non_stop

In [92]:
count_vectorizer=CountVectorizer(analyzer=stemmed_word,max_features=1000)
#now create dictionary
feature_ext=count_vectorizer.fit(df['Text'])
features=feature_ext.get_feature_names_out()
train_df_features=count_vectorizer.transform(df['Text'])
train_ds=pd.DataFrame(train_df_features.todense())
train_ds.columns=features
train_ds['sentiment'] = df['Label']

In [93]:
feature_counts=np.sum(train_df_features.toarray(),axis=0)
feature_counts=pd.DataFrame(dict(features=features,counts=feature_counts))
feature_counts.sort_values('counts',ascending = False)[0:15]

Unnamed: 0,features,counts
303,harri,2093
83,brokeback,2000
414,love,1883
791,suck,1484
922,wa,1176
400,like,1155
447,movi,1149
45,awesom,1130
441,mission,1094
351,imposs,1093


In [94]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(train_df_features, train_ds.sentiment,
                                                test_size = 0.3,
                                                random_state = 42)

In [95]:
from sklearn.naive_bayes import BernoulliNB
nb_clf = BernoulliNB()
nb_clf.fit(train_X.toarray(), train_y)

In [96]:
test_ds_predicted = nb_clf.predict(test_X.toarray())

In [102]:
from sklearn import metrics
print(metrics.classification_report(test_y, test_ds_predicted))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       873
           1       0.98      0.99      0.98      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [105]:
from sklearn.metrics import accuracy_score
print('accuracy:',accuracy_score(test_ds_predicted,test_y))



accuracy: 0.9802504816955684


In [119]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(train_X.toarray(), train_y)

In [121]:
test_rf_predicted = rf_classifier.predict(test_X.toarray())
print('accuracy:',accuracy_score(test_rf_predicted,test_y))

accuracy: 0.9903660886319846


In [122]:
print(metrics.classification_report(test_y, test_rf_predicted))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [2]:
import pandas as pd
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load training and testing data
train_df = pd.read_csv('training.csv')
test_df = pd.read_csv('test.csv')

# Combine train and test texts for tokenization
all_texts = pd.concat([train_df['text'], test_df['text']], axis=0)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Pad sequences to ensure uniform input size
max_length = max([len(seq) for seq in train_sequences + test_sequences])
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

# Train Word2Vec on the combined texts
word2vec_model = Word2Vec(sentences=[text.split() for text in all_texts], vector_size=100, window=5, min_count=1, workers=4)
word2vec_weights = word2vec_model.wv.vectors
embedding_dim = word2vec_weights.shape[1]

# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[word2vec_weights], input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


ValueError: Layer embedding weight shape (16185, 100) is not compatible with provided weight shape (16184, 100).

In [3]:
import numpy as np

# Load training and testing data
train_df = pd.read_csv('training.csv')
test_df = pd.read_csv('test.csv')

# Combine train and test texts for tokenization
all_texts = pd.concat([train_df['text'], test_df['text']], axis=0)

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
word_index = tokenizer.word_index

# Train Word2Vec on the combined texts
word2vec_model = Word2Vec(sentences=[text.split() for text in all_texts], vector_size=100, window=5, min_count=1, workers=4)

# Prepare the embedding matrix
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Pad sequences to ensure uniform input size
max_length = max([len(seq) for seq in train_sequences + test_sequences])
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/5


2024-09-04 20:15:03.059136: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-04 20:15:03.060374: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-04 20:15:03.060737: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-09-04 20:15:24.134669: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-04 20:15:24.135235: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-04 20:15:24.135902: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 1/63 [..............................] - ETA: 16s - loss: 1.5538 - accuracy: 0.2500

2024-09-04 20:16:49.937569: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-04 20:16:49.938101: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-04 20:16:49.938754: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Test Accuracy: 0.3475
