<a href="https://colab.research.google.com/github/thatswhatmeetcoded/Sentiment-Classification/blob/main/decision_tree/3_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# feature_extraction.ipynb

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib
from scipy import sparse
import os

# Read the cleaned dataset (path is relative to the working directory).
data_path = 'clean_data.csv'
df = pd.read_csv(data_path)

# Output folders: one for the feature matrices and labels, one for the
# fitted transformers. Each is created only if it is not already present.
features_dir = 'features'
vectorizers_dir = 'vectorizers'
for out_dir in (features_dir, vectorizers_dir):
    os.makedirs(out_dir, exist_ok=True)

# Prepare labels: encode each distinct value of the 'sentiment' column as an
# integer class id.
le = LabelEncoder()

y = le.fit_transform(df['sentiment'])

# Build the output paths from the configured directories rather than
# repeating their names, so the labels and encoder always land next to the
# other saved artifacts even if the directories are changed.
label_path = f"{features_dir}/y.npy"
np.save(label_path, y)

# Persist the fitted encoder so integer ids can be mapped back to the
# original sentiment labels later (via le.inverse_transform).
joblib.dump(le, f"{vectorizers_dir}/label_encoder.pkl")


# Text to vectorize. Rows whose cleaned text is empty come back from
# pd.read_csv as NaN, and the scikit-learn vectorizers raise ValueError on
# NaN documents. Replace them with empty strings (keeping row alignment with
# the labels) and make sure every document is a str.
texts = df['clean_text'].fillna('').astype(str)

# Both vectorizers share the same vocabulary limits: the 5000 most frequent
# unigrams and bigrams.
vectorizer_params = {'max_features': 5000, 'ngram_range': (1, 2)}

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(**vectorizer_params)
X_tfidf = tfidf_vectorizer.fit_transform(texts)
joblib.dump(tfidf_vectorizer, f"{vectorizers_dir}/tfidf_vectorizer.pkl")

# Bag of Words Vectorization (raw term counts)
bow_vectorizer = CountVectorizer(**vectorizer_params)
X_bow = bow_vectorizer.fit_transform(texts)
joblib.dump(bow_vectorizer, f"{vectorizers_dir}/bow_vectorizer.pkl")

# Persist both feature matrices in SciPy's compressed sparse .npz format.
tfidf_out = f"{features_dir}/X_tfidf.npz"
bow_out = f"{features_dir}/X_bow.npz"
sparse.save_npz(tfidf_out, X_tfidf)
sparse.save_npz(bow_out, X_bow)

# Report the matrix dimensions as a quick sanity check.
print(f"TF-IDF shape: {X_tfidf.shape}")
print(f"BoW shape: {X_bow.shape}")
# NOTE(review): the message mentions Google Drive, but every path above is
# relative to the current working directory -- confirm the notebook runs
# from a Drive folder.
print("Vectorizers and Features saved to Google Drive successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
