# In [None]:
# feature_engineering.py
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import spacy
import gensim
import pandas as pd

# Load the spaCy pipeline a single time at import so every helper in this
# module shares the same instance (the "lg" package is used for word vectors).
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # An OSError from the load is treated as "model not installed":
    # download the package once and retry the load.
    print("Downloading SpaCy language model 'en_core_web_lg'...")
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
def create_tfidf_features(texts, max_features=5000, ngram_range=(1, 1)):
    """Fit a TF-IDF vectorizer on ``texts`` and vectorize them.

    Args:
        texts: Documents to fit on and transform.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the output of ``fit_transform``
        on ``texts`` and the vectorizer instance that produced it.
    """
    tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    features = tfidf.fit_transform(texts)
    return features, tfidf

def create_average_word_embeddings(texts, embedding_dim=300):
    """Create one averaged word-vector feature row per document.

    Each text is run through the module-level spaCy pipeline ``nlp``.
    Tokens flagged as stop words, or that have no vector, are skipped; the
    remaining token vectors are averaged. A document with no usable token
    gets an all-zero row of length ``embedding_dim``.

    Args:
        texts: Iterable of document strings. It is materialized into a list,
            so one-shot iterables are accepted.
        embedding_dim: Length of the zero fallback row and of the rows of an
            empty result. NOTE(review): this should match the vector width
            of the loaded spaCy model, otherwise rows of different lengths
            are mixed -- confirm against the model in use.

    Returns:
        ``np.ndarray`` with one row per input text. For empty input the
        result has shape ``(0, embedding_dim)`` so downstream code can still
        rely on a 2-D array.
    """
    texts = list(texts)
    if not texts:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.zeros((0, embedding_dim))

    all_vectors = []
    # nlp.pipe processes the texts as a batched stream instead of making
    # one pipeline call per document.
    for doc in nlp.pipe(texts):
        vectors = [
            token.vector
            for token in doc
            if not token.is_stop and token.has_vector
        ]
        if vectors:
            all_vectors.append(np.mean(vectors, axis=0))
        else:
            # No usable token: fall back to a zero vector.
            all_vectors.append(np.zeros(embedding_dim))
    return np.array(all_vectors)

def create_word2vec_features(texts, embedding_dim=300):
    """Train a Word2Vec model on ``texts`` and average its vectors per document.

    Each text is tokenized with ``str.split()``. A Word2Vec model is trained
    on the resulting token lists, and every document is represented by the
    mean of its tokens' vectors. A document with no token found in the model
    gets an all-zero row of length ``embedding_dim``.

    Args:
        texts: Iterable of document strings. It is consumed exactly once, so
            one-shot iterables are accepted.
        embedding_dim: Dimensionality of the trained vectors
            (``vector_size``) and of the zero fallback row.

    Returns:
        A ``(features, model)`` tuple: an ``np.ndarray`` with one row per
        input text, and the trained ``gensim.models.Word2Vec`` model.
    """
    # Tokenize once and reuse the token lists for training and averaging.
    sentences = [text.split() for text in texts]

    # Handing the corpus to the constructor already builds the vocabulary and
    # trains the model, so the epoch count is set here. A further
    # model.train(...) call would train on the same corpus a second time.
    model = gensim.models.Word2Vec(
        sentences,
        vector_size=embedding_dim,
        window=5,
        min_count=1,
        workers=4,
        epochs=10,
    )

    keyed_vectors = model.wv
    all_vectors = []
    for tokens in sentences:
        vectors = [
            keyed_vectors[token] for token in tokens if token in keyed_vectors
        ]
        if vectors:
            all_vectors.append(np.mean(vectors, axis=0))
        else:
            # Empty or whitespace-only document: fall back to a zero vector.
            all_vectors.append(np.zeros(embedding_dim))

    return np.array(all_vectors), model

def split_data(features, labels, test_size=0.2, random_state=42):
    """Split features and labels into train and test partitions.

    Args:
        features: Feature matrix, forwarded to ``train_test_split``.
        labels: Labels aligned with ``features``.
        test_size: Forwarded to ``train_test_split(test_size=...)``.
        random_state: Forwarded to ``train_test_split(random_state=...)``.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(
        train_test_split(
            features,
            labels,
            test_size=test_size,
            random_state=random_state,
        )
    )

def main():
    """Run the feature-engineering pipeline on ``reviews.csv``.

    Reads the CSV, drops rows missing ``text`` or ``sentiment``, runs
    ``preprocess_dataframe`` from ``data_preprocessing`` on the ``text``
    column, then builds TF-IDF, averaged-embedding and Word2Vec features.
    Each feature set is split with ``split_data`` and its shapes are printed.
    Failures are reported with ``print`` instead of being raised.
    """
    try:
        # The CSV is expected to carry a 'text' and a 'sentiment' column.
        reviews = pd.read_csv("reviews.csv")
        reviews.dropna(subset=["text", "sentiment"], inplace=True)
        from data_preprocessing import preprocess_dataframe

        prepared = preprocess_dataframe(reviews, "text")
        documents = prepared["processed_text"].tolist()
        targets = prepared["sentiment"].tolist()

        # --- TF-IDF ---
        tfidf_features, _tfidf_vectorizer = create_tfidf_features(documents)
        print("TF-IDF matrix shape:", tfidf_features.shape)
        tfidf_train, tfidf_test, _, _ = split_data(tfidf_features, targets)
        print("TF-IDF Train Data Shape:", tfidf_train.shape)
        print("TF-IDF Test Data Shape:", tfidf_test.shape)

        # --- Averaged word embeddings ---
        avg_features = create_average_word_embeddings(documents)
        print("Average Word Embeddings shape:", avg_features.shape)
        avg_train, avg_test, _, _ = split_data(avg_features, targets)
        print("Average Embeddings Train Data Shape:", avg_train.shape)
        print("Average Embeddings Test Data Shape:", avg_test.shape)

        # --- Word2Vec ---
        w2v_features, _w2v_model = create_word2vec_features(documents)
        print("Word2Vec embeddings shape:", w2v_features.shape)
        w2v_train, w2v_test, _, _ = split_data(w2v_features, targets)
        print("Word2Vec Train Data Shape:", w2v_train.shape)
        print("Word2Vec Test Data Shape:", w2v_test.shape)
    except FileNotFoundError:
        print("Error: 'reviews.csv' not found. Please provide the data in a 'reviews.csv' file with 'text' and 'sentiment' columns.")
    except KeyError as e:
        print(f"Error: {e} column not found. Ensure your 'reviews.csv' has 'text' and 'sentiment' columns.")
    except Exception as e:
        # Top-level boundary: report anything unexpected rather than crash.
        print(f"An unexpected error occurred: {e}")

# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()