# Import Statements

In [None]:
import pandas as pd
import numpy as np
import re
import string
import joblib 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Data Extraction from Excel


In [None]:
# Load the review dataset from the Excel workbook (relative path).
df = pd.read_excel(io="final_data_2.xlsx")
# Print the loaded frame as a quick visual sanity check.
print(df)

# Model Code

In [None]:
# Shared NLP resources, built once and reused by preprocess_text for every review.
# English stopwords are held in a set so membership checks stay cheap.
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer used to reduce tokens to their base form.
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip ASCII punctuation, drop digit runs,
    tokenize, remove English stopwords, lemmatize.

    Args:
        text: The raw review. Missing values (``None`` or float NaN, which is
            what pandas yields for empty cells) produce an empty string
            instead of raising; any other non-string value (e.g. a numeric
            cell) is converted with ``str()`` before cleaning.

    Returns:
        The cleaned review as a single string; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Nothing left to tokenize: skip the tokenizer/lemmatizer entirely.
    if not text.strip():
        return ''

    tokens = word_tokenize(text)  # Tokenize text
    # Drop stopwords and lemmatize the survivors in a single pass.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)  # Rejoin tokens

# Clean every raw review and store the result next to the original text
df['Processed_Review'] = df['reviews.text'].map(preprocess_text)

# TF-IDF over unigrams, bigrams and trigrams (ngram_range 1..3).
# The fitted vectorizer is saved to disk.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tfidf_features = tfidf_vectorizer.fit_transform(df['Processed_Review'])
joblib.dump(value=tfidf_vectorizer, filename="tfidf_vectorizer.pkl")

# Sentiment Analysis
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    The score lies between -1 (negative) and 1 (positive).
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Polarity score of each cleaned review
df['Sentiment'] = df['Processed_Review'].map(get_sentiment)

# Length features of the cleaned review: whitespace-separated tokens and characters
df['Word_Count'] = df['Processed_Review'].str.split().str.len()
df['Char_Count'] = df['Processed_Review'].str.len()

# Dimensionality Reduction
# Project the sparse TF-IDF matrix onto at most 100 components
# (fewer when the vocabulary itself is smaller than that).
n_components = min(100, tfidf_features.shape[1])
svd = TruncatedSVD(random_state=42, n_components=n_components)
reduced_features = svd.fit_transform(tfidf_features)
joblib.dump(value=svd, filename="truncated_svd.pkl")

# Wrap the SVD output in a DataFrame with 1-based component column names
reduced_df = pd.DataFrame(
    reduced_features,
    columns=[f'SVD_Component_{idx}' for idx in range(1, n_components + 1)],
)

# Hand-crafted features placed side by side with the SVD components
df_select = df.loc[:, ['Sentiment', 'Word_Count', 'Char_Count']]
model_final_features = pd.concat([df_select, reduced_df], axis='columns')

# Define features and target
X = model_final_features
y = df['Label']  # Ensure 'Label' column exists

# Encode the class labels as integers. Fitting on the original labels gives
# the same classes as fitting on resampled ones, since SMOTE adds no classes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

# Train-test split FIRST, on the real (un-resampled) rows.
# Resampling, scaling and feature selection must only ever see the training
# portion; otherwise information from the test rows leaks into training and
# the test set itself ends up containing synthetic samples, which inflates
# every reported metric.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Handle class imbalance using SMOTE -- on the training rows only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Scale to [0, 1] before feature selection: chi2 requires non-negative inputs
scaler = MinMaxScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
joblib.dump(scaler, "scaler_MinMax.pkl")

# Feature selection; k is capped so a small vocabulary (fewer than 80
# total features) does not make SelectKBest fail.
selector = SelectKBest(chi2, k=min(80, X_resampled_scaled.shape[1]))
X_selected = selector.fit_transform(X_resampled_scaled, y_resampled)
joblib.dump(selector, "Select_features.pkl")

# Final training matrices, plus the held-out test rows pushed through the
# already-fitted scaler and selector (transform only, never fit).
X_train, y_train = X_selected, y_resampled
X_test = selector.transform(scaler.transform(X_test_raw))

# NOTE(review): the TF-IDF vectorizer and SVD upstream are still fitted on the
# full corpus; move them behind the split as well for a fully leak-free estimate.

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_xgb = xgb_model.predict(X_test)


# Evaluate the model
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

print("\nConfusion Matrix (XGB):")
print(confusion_matrix(y_test, y_pred_xgb))