In [4]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

# Function to preprocess and vectorize data
def preprocess_and_vectorize(X_train, X_test, vectorizer):
    """Fit ``vectorizer`` on the training texts and vectorize both splits.

    The vectorizer is fitted through ``fit_transform`` on ``X_train`` only and
    is then applied to ``X_test`` with ``transform``, so the test split never
    contributes to the fitted state.

    Args:
        X_train: Training documents handed to ``vectorizer.fit_transform``.
        X_test: Test documents handed to ``vectorizer.transform``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        A ``(train_features, test_features)`` tuple holding whatever the two
        vectorizer calls returned.
    """
    # Fitting must happen before the test split is transformed.
    train_features = vectorizer.fit_transform(X_train)
    return train_features, vectorizer.transform(X_test)

# Function to get Word2Vec features
def get_w2v_features(tokens, model, vector_size):
    """Turn tokenized sentences into fixed-size averaged word-vector features.

    Each sentence is represented by the mean of the vectors of its words that
    are present in the model vocabulary. Words missing from the vocabulary are
    ignored; a sentence with no known words (including an empty sentence)
    becomes an all-zero row.

    Args:
        tokens: Iterable of sentences, each an iterable of word strings.
        model: Object exposing ``model.wv.key_to_index`` (vocabulary lookup)
            and ``model.wv[word]`` (vector lookup).
        vector_size: Length of each output row; used for the zero rows.

    Returns:
        A 2-D float array with one row per sentence and ``vector_size``
        columns. An empty ``tokens`` yields shape ``(0, vector_size)`` so the
        result is always 2-D.
    """
    # Hoist the attribute lookups out of the per-word loop.
    word_vectors = model.wv
    vocabulary = word_vectors.key_to_index

    rows = []
    for sentence in tokens:
        known = [word_vectors[word] for word in sentence if word in vocabulary]
        if known:
            # Average in float64, matching the float64 zero rows below.
            rows.append(np.mean(known, axis=0, dtype=np.float64))
        else:
            rows.append(np.zeros(vector_size))

    if not rows:
        # np.array([]) would be 1-D with shape (0,); keep the column count.
        return np.zeros((0, vector_size))
    return np.array(rows)

# Function to evaluate model
def evaluate_model(model, X_test_vec, y_test_enc):
    """Print a short evaluation report for ``model`` and return its accuracy.

    Predictions are obtained with ``model.predict(X_test_vec)`` and compared
    against ``y_test_enc``. The accuracy (two decimals), the classification
    report, the confusion matrix and a 50-dash separator are printed in that
    order.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_vec: Test features passed to ``model.predict``.
        y_test_enc: True labels for the test features.

    Returns:
        The value returned by ``accuracy_score`` for the predictions.
    """
    predicted = model.predict(X_test_vec)

    score = accuracy_score(y_test_enc, predicted)
    print(f"Accuracy: {score:.2f}\n")

    report = classification_report(y_test_enc, predicted)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_test_enc, predicted)
    print("Confusion Matrix:\n", matrix)

    separator = "-" * 50
    print(separator)
    return score

# Step 1: Create Dummy Data
# Each sample keeps a review text next to its hand-assigned sentiment label so
# the two columns cannot drift out of alignment.
samples = [
    ("I love this product, it's amazing!", "positive"),
    ("This is the worst experience I've ever had.", "negative"),
    ("Absolutely fantastic! Highly recommend.", "positive"),
    ("Not great, would not buy again.", "negative"),
    ("The quality is superb, really satisfied.", "positive"),
    ("Terrible, broke after one use.", "negative"),
    ("Decent product for the price.", "neutral"),
    ("Awful customer service, very disappointed.", "negative"),
    ("Excellent value for money, very happy.", "positive"),
    ("It's okay, nothing special but works fine.", "neutral"),
]

# Column-oriented view of the samples: one list of texts, one list of labels.
data = {
    "text": [text for text, _ in samples],
    "sentiment": [label for _, label in samples],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Preprocessing
X = df['text']
y = df['sentiment']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encoding labels for consistency.
# The encoder is fitted on the full label column rather than on y_train alone:
# with a dataset this small the split can leave a class out of the training
# fold, and transforming a label the encoder has never seen raises ValueError.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))  # Using unigrams and bigrams
X_train_tfidf, X_test_tfidf = preprocess_and_vectorize(X_train, X_test, tfidf_vectorizer)

# Bag of Words (BoW) Vectorization
bow_vectorizer = CountVectorizer(max_features=500, ngram_range=(1, 2))
X_train_bow, X_test_bow = preprocess_and_vectorize(X_train, X_test, bow_vectorizer)

# Word2Vec Vectorization
# Plain whitespace tokenization; the same split() must be applied to any text
# that is later fed to get_w2v_features with this model.
X_train_tokens = [sentence.split() for sentence in X_train]
X_test_tokens = [sentence.split() for sentence in X_test]

# Single source of truth for the embedding width used by the model and by the
# feature extraction below.
W2V_VECTOR_SIZE = 100

# The model is trained on the training fold only. A fixed seed with a single
# worker keeps the embeddings repeatable between runs, in line with the fixed
# random_state used for the split.
# NOTE(review): gensim documents that full reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED - confirm if needed.
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=W2V_VECTOR_SIZE,
    window=5,
    min_count=1,
    workers=1,
    sg=1,
    seed=42,
)
X_train_w2v = get_w2v_features(X_train_tokens, w2v_model, vector_size=W2V_VECTOR_SIZE)
X_test_w2v = get_w2v_features(X_test_tokens, w2v_model, vector_size=W2V_VECTOR_SIZE)

# Step 3: Model Training and Evaluation
# (train, test) feature pairs keyed by the vectorization method that built them.
methods = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Bag of Words": (X_train_bow, X_test_bow),
    "Word2Vec": (X_train_w2v, X_test_w2v)
}

models = {}   # fitted classifier per method name
results = {}  # test accuracy per method name

# Train one logistic-regression classifier per feature representation and
# report its performance on the held-out split.
for method_name, feature_pair in methods.items():
    train_features, test_features = feature_pair
    print(f"Evaluating with {method_name}...")
    classifier = LogisticRegression(max_iter=200)
    classifier.fit(train_features, y_train_enc)
    results[method_name] = evaluate_model(classifier, test_features, y_test_enc)
    models[method_name] = classifier

# Compare Results
print("\nComparison of Accuracy:")
for method_name in results:
    score = results[method_name]
    print(f"{method_name}: {score:.2f}")

# Step 4: Predict Sentiment for User Input
# Interactive loop: read a sentence, vectorize it with each fitted
# representation and print the label predicted by the matching classifier.
print("\nEnter sentences to predict their sentiment:")
while True:
    try:
        sentence = input("Enter a sentence (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: leave the loop
        # cleanly instead of ending the script with a traceback.
        print()
        break

    # Surrounding whitespace carries no information for any of the
    # vectorizers, and stripping lets " exit " terminate the loop as well.
    sentence = sentence.strip()
    if sentence.lower() == 'exit':
        break
    if not sentence:
        # Blank input would become an all-zero vector for every method, so the
        # "prediction" would say nothing about the text; ask again instead.
        continue

    # The Word2Vec path must use the same whitespace tokenization as training,
    # and the vector width is read from the trained model so it cannot drift.
    sentence_vecs = {
        "TF-IDF": tfidf_vectorizer.transform([sentence]),
        "Bag of Words": bow_vectorizer.transform([sentence]),
        "Word2Vec": get_w2v_features(
            [sentence.split()], w2v_model, vector_size=w2v_model.wv.vector_size
        ),
    }

    print("Predictions:")
    for method, vec in sentence_vecs.items():
        pred_label = models[method].predict(vec)
        # Map the encoded class index back to its original string label.
        sentiment = label_encoder.inverse_transform(pred_label)
        print(f"{method}: {sentiment[0]}")
    print("-" * 50)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject