<a href="https://colab.research.google.com/github/Srishtijais16/step_demo/blob/day2/SPAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus used by the text preprocessing step.
nltk.download('stopwords')

# Load the dataset.
# The path is relative to the notebook's working directory. A single variable
# is used so the path that is declared is also the path that is read.
file_path = 'all_kindle_review (1).csv'
data = pd.read_csv(file_path)

# Derive Sentiment (Target Variable) from Rating:
# ratings above 3 become 1, everything else becomes 0.
# NOTE: a missing rating compares as False and therefore also maps to 0.
data['Sentiment'] = (data['rating'] > 3).astype(int)

# Text Preprocessing
# Compiled once at definition time rather than looked up on every call.
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalise one review into a space-separated string of kept tokens.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace, strip digit runs, then drop stop words.
    Underscores survive because ``\\w`` matches them.

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty review instead of
            being turned into the literal token "nan".
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached, instead of being re-read for every row).

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = _english_stop_words()
    text = str(text).lower()  # Convert to lowercase
    text = _NON_WORD_RE.sub('', text)  # Remove punctuation and special characters
    text = _DIGITS_RE.sub('', text)  # Remove numbers
    return " ".join(word for word in text.split() if word not in stop_words)  # Remove stop words

data['Cleaned_Review'] = data['reviewText'].apply(preprocess_text)

# TF-IDF Feature Extraction
# Every row is vectorised. preprocess_text always returns a string, so there
# are no missing values to drop here, and dropping rows from the features
# alone would leave them out of step with the labels in `y`.
# NOTE(review): the vectoriser is fitted on all rows before the split, so the
# vocabulary and IDF weights also reflect the test rows. Confirm this is
# acceptable for how the reported metrics are used.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['Cleaned_Review']).toarray()
y = data['Sentiment']
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Word2Vec Feature Extraction
# Split each cleaned review on whitespace to get its token list, then train
# the embedding model on those token lists.
data['Tokenized_Review'] = data['Cleaned_Review'].map(str.split)
w2v_model = Word2Vec(
    sentences=data['Tokenized_Review'].dropna(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def word2vec_features(tokenized_review, model=None):
    """Return the mean word vector of a tokenised review.

    Args:
        tokenized_review: Iterable of tokens for one review.
        model: Object exposing a ``wv`` attribute that supports ``in``,
            indexing by token and ``vector_size``. Defaults to the
            module-level ``w2v_model``.

    Returns:
        A float64 array of length ``wv.vector_size``: the average of the
        vectors of the tokens found in the vocabulary, or all zeros when no
        token is found (including an empty review).
    """
    wv = (w2v_model if model is None else model).wv
    known_vectors = [wv[word] for word in tokenized_review if word in wv]
    if not known_vectors:
        # Size is read from the model so it cannot drift from the value the
        # model was trained with.
        return np.zeros(wv.vector_size)
    # Accumulate in float64 so the result dtype matches the zero-vector case.
    return np.mean(known_vectors, axis=0, dtype=np.float64)

# One averaged vector per review, kept in the same row order as `y`.
# No rows are dropped here: removing rows from the features alone would
# leave them out of step with the labels passed to train_test_split.
X_w2v = np.array([word2vec_features(tokens) for tokens in data['Tokenized_Review']])
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

# Model Training and Evaluation
def train_evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a default LogisticRegression and score it on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` computed from the
        model's predictions on ``X_test`` against ``y_test``.
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_test, predictions) for score in scorers)

# Train and score the same classifier on each feature set and print its metrics.
# The Word2Vec heading keeps its leading newline so a blank line separates the
# two reports.
for heading, split in (
    ("TF-IDF Results:", (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)),
    ("\nWord2Vec Results:", (X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v)),
):
    print(heading)
    acc, prec, rec, f1 = train_evaluate_model(*split)
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TF-IDF Results:
Accuracy: 0.8304, Precision: 0.8184, Recall: 0.8529, F1-Score: 0.8353

Word2Vec Results:
Accuracy: 0.7712, Precision: 0.7775, Recall: 0.7653, F1-Score: 0.7713
