<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/NLP_classwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Load the small English spaCy pipeline once at module level; preprocess_text_spacy
# below reuses this `nlp` object for every document it processes.
nlp = spacy.load("en_core_web_sm")

# Text preprocessing with spaCy (the same library is used in my capstone project).
def preprocess_text_spacy(text):
    """Lowercase, lemmatize and filter *text* using the module-level ``nlp`` pipeline.

    Args:
        text: Raw document. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as an empty
            document instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        A single space-separated string of lemmas for the tokens that are
        purely alphabetic and are not stop words; ``""`` when nothing is kept.
    """
    # Missing / non-text cells carry no words, so map them to an empty document.
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower())  # Lowercase before tokenizing
    # Keep alphabetic, non-stop-word tokens and reduce each one to its lemma.
    return " ".join(
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    )

# Read only the first 1000 rows; the full dataset is too large to process here.
df = pd.read_csv("Ecommerce_data.csv", nrows=1000, encoding="ISO-8859-1")
df["processed_text"] = df["Text"].apply(preprocess_text_spacy)

# Turn the cleaned text into a dense TF-IDF matrix with a vocabulary capped
# at 500 terms.
vectorizer = TfidfVectorizer(max_features=500)
X_text = vectorizer.fit_transform(df["processed_text"]).toarray()
y = df["label"]  # Target variable

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy after spaCy preprocessing: {accuracy:.4f}")

In [None]:
# Baseline: build TF-IDF features from the raw text (no spaCy preprocessing).
vectorizer = TfidfVectorizer(max_features=500)
raw = pd.read_csv("Ecommerce_data.csv", encoding="ISO-8859-1", nrows=1000)
X_raw = vectorizer.fit_transform(raw["Text"]).toarray()  # Use raw text directly

# Take the labels from the same frame the features were built from, so rows
# and targets stay aligned without depending on `y` from the previous cell.
y_raw = raw["label"]

# Split dataset with the same test size and seed as the preprocessed run
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

# Train Random Forest on raw text
model_raw = RandomForestClassifier(n_estimators=100, random_state=42)
model_raw.fit(X_train_raw, y_train_raw)

# Predict and evaluate accuracy for raw data
y_pred_raw = model_raw.predict(X_test_raw)
accuracy_raw = accuracy_score(y_test_raw, y_pred_raw)

# Print accuracy comparison.
# NOTE: `accuracy` is produced by the spaCy-preprocessing cell above, which
# must have been run before this one.
print(f"Raw text model accuracy: {accuracy_raw:.4f}")
print(f"Preprocessed text model accuracy: {accuracy:.4f}")

# Relative improvement over the raw baseline, in percent. A baseline accuracy
# of 0 makes the ratio undefined, so report that instead of dividing by zero.
if accuracy_raw > 0:
    accuracy_improvement = (accuracy - accuracy_raw) / accuracy_raw * 100
    print(f"Accuracy improvement: {accuracy_improvement:.2f}%")
else:
    accuracy_improvement = float("nan")
    print("Accuracy improvement: undefined (raw accuracy is 0)")