In [None]:
import zipfile
import os
import pandas as pd
# Paths to the competition ZIP archives (labelled training set and unlabelled test set)
zip_file_paths = ['/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip', '/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip']

# Directory the archives are unpacked into
extraction_dir = '/kaggle/working/extracted'

# Create the extraction directory if it doesn't exist
os.makedirs(extraction_dir, exist_ok=True)

# Unpack every archive into the same directory
for zip_path in zip_file_paths:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_dir)

# List the contents of the extraction directory
extracted_files = os.listdir(extraction_dir)
print("Extracted files:", extracted_files)

# Read the tab-separated data. The paths are built from `extraction_dir` so that
# changing the directory above cannot leave these reads pointing at a stale location.
train = pd.read_csv(os.path.join(extraction_dir, 'labeledTrainData.tsv'), delimiter='\t')
test = pd.read_csv(os.path.join(extraction_dir, 'testData.tsv'), delimiter='\t')

In [None]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Function to clean the text
def _english_stopwords():
    """Return the English stop words as a frozenset, building it only once.

    The set is memoised on the function object, so ``stopwords.words`` is
    called on the first use only instead of once per cleaned review.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def clean_text(raw_text):
    """Strip markup, punctuation and stop words from one review.

    Args:
        raw_text: Review text, possibly containing HTML markup.

    Returns:
        A single string of lowercase, letter-only, non-stop-word tokens
        joined by single spaces (empty string if nothing survives).
    """
    # Remove HTML markup
    text_no_html = BeautifulSoup(raw_text, 'html.parser').get_text()

    # Replace every non-letter with a space, so digits and punctuation act as separators
    letters_only = re.sub("[^a-zA-Z]", " ", text_no_html)

    # Convert to lowercase and split into words
    words = letters_only.lower().split()

    # Stop-word set is loaded once and reused across calls
    stops = _english_stopwords()

    # Remove stopwords
    meaningful_words = [w for w in words if w not in stops]

    # Join the words back into one string separated by space
    return " ".join(meaningful_words)

# Store a cleaned copy of every training review in a new column
train['clean_review'] = train['review'].apply(clean_text)

# Bag-of-words vectorizer: unigrams and bigrams, capped at 5000 features
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
    ngram_range=(1, 2),
)

# Learn the vocabulary, encode the training reviews and densify the result in one step
train_data_features = vectorizer.fit_transform(train['clean_review']).toarray()

# VotingClassifier with **Random Forest** and **XGBClassifier**

In [None]:

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier




# Initialize models. Both base learners are seeded (same value as the train/validation
# split used elsewhere in this notebook) so that re-running the notebook fits the same
# ensemble and writes the same predictions.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(random_state=42)

# Create a Voting Classifier with RandomForest and XGBoost
voting_classifier = VotingClassifier(estimators=[('rf', random_forest), ('xgb', xgb)], voting='soft')

# Train the Voting Classifier on the training data
voting_classifier.fit(train_data_features, train['sentiment'])

# Apply the same cleaning to the test set
test['clean_review'] = test['review'].apply(clean_text)

# Encode the test reviews with the vocabulary learned on the training set
# (transform only, no re-fit) and densify to match the training input format
test_data_features = vectorizer.transform(test['clean_review'])
test_data_features = test_data_features.toarray()

# Predict sentiment for the test set
result = voting_classifier.predict(test_data_features)

In [None]:
def predict_sentiment(sentence, model, vectorizer):
    """Classify one raw sentence as "Positive" or "Negative".

    Args:
        sentence: Raw review text; it is cleaned with ``clean_text`` before vectorizing.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A ``(sentiment, sentence)`` tuple: the label string and the original,
        uncleaned sentence.
    """
    cleaned_sentence = clean_text(sentence)
    # Transform the cleaned sentence using the vectorizer
    features = vectorizer.transform([cleaned_sentence])
    # The classifier in this notebook is fitted and scored on dense arrays, so the
    # single-row input is densified as well. Feeding a sparse matrix instead is not
    # equivalent for XGBoost tree models, which treat absent sparse entries as
    # missing values rather than zeros.
    if hasattr(features, "toarray"):
        features = features.toarray()
    prediction = model.predict(features)[0]
    sentiment = "Positive" if prediction == 1 else "Negative"
    return sentiment, sentence
# Print the predicted sentiment for a batch of sentences
def visualize_predictions(sentences, model, vectorizer):
    """Print each sentence with its predicted sentiment, followed by a divider line.

    Args:
        sentences: Iterable of raw review strings.
        model: Fitted classifier forwarded to ``predict_sentiment``.
        vectorizer: Fitted vectorizer forwarded to ``predict_sentiment``.
    """
    divider = "--------------------------------------------------"
    for text in sentences:
        label, echoed_text = predict_sentiment(text, model, vectorizer)
        print(f"Review: {echoed_text} | Predicted Sentiment: {label}")
        print(divider)

In [None]:
# Hand-written reviews used to sanity-check the classifier: one clearly positive,
# one clearly negative, and one mixed
example_sentences = [
    "This movie was fantastic, I loved it!",
    (
        "This movie is a disaster,it is so awful that once you know what is coming,"
        "Look away and spend your time on more meaningful content."
    ),
    "The performance was average, but the story was captivating.",
]
# Print the ensemble's prediction for each example
visualize_predictions(
    sentences=example_sentences,
    model=voting_classifier,
    vectorizer=vectorizer,
)

In [None]:

# Write the ensemble's predictions, keyed by review id, to the submission file
# (86% test accuracy)
submission = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
submission.to_csv(path_or_buf='submission.csv', index=False)

# **LSTM**

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D,Bidirectional
from sklearn.model_selection import train_test_split



# Vocabulary size and padded review length. The tokenizer, the padding step and the
# Embedding layer below must all agree on these, so each value is defined exactly once.
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 200

# Tokenizing text and converting into sequences
tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True)
tokenizer.fit_on_texts(train['clean_review'])
X = tokenizer.texts_to_sequences(train['clean_review'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Splitting data into train and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, train['sentiment'], test_size=0.2, random_state=42)

# Model architecture: embedding -> LSTM -> bidirectional LSTM -> LSTM -> sigmoid output
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
# The first two recurrent layers return full sequences so the next recurrent layer
# has a sequence to consume; the last one does not
model.add(LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, evaluating on the held-out validation split after every epoch
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)

In [None]:
import matplotlib.pyplot as plt

# Draw one figure per tracked metric (accuracy first, then loss), each showing the
# training curve together with its validation counterpart
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(8, 6))
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Training and Validation {metric_label}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

In [None]:
# Encode the cleaned test reviews with the already-fitted tokenizer and pad them
# to the same length (200) as the training sequences
X_test = pad_sequences(tokenizer.texts_to_sequences(test['clean_review']), maxlen=200)

# Run the trained network over the test set
result_dl = model.predict(X_test)

In [None]:
# Write the network's outputs, keyed by review id, to the submission file
# (92% test accuracy). `result_dl` is flattened to one dimension and written as
# returned by `model.predict`, i.e. the sigmoid outputs, not thresholded 0/1 labels.
submission = pd.DataFrame(data={'id': test['id'], 'sentiment': result_dl.flatten()})
submission.to_csv(path_or_buf='submission.csv', index=False)