In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import xgboost as xgb

# Load the dataset
url = 'https://raw.githubusercontent.com/USD-502-FinalProject/ADS599-CapstoneProject/main/reviews.xlsx'
data = pd.read_excel(url)

data.dropna(subset=['Age range', 'Review Text'], inplace=True)

# Function to check if the 'Review Text' contains specific words
def contains_word(text, words):
    for word in words:
        if word in text:
            return 1
    return 0

# Define the word lists for each column
word_lists = {
    'fatigue': ['tired', 'tire', 'fatigue'],
    'headaches': ['headache', 'headaches', 'head'],
    'joint pain': ['pain', 'joint', 'joint pain'],
    'dizzy': ['dizz', 'dizziness', 'dizzy', 'light', 'lightheadedness', 'lightheaded'],
    'nausea': ['nausea', 'naseous'],
    'hair loss': ['hair'],
    'impaired vision': ['vision'],
    'vomit': ['vomit'],
    'side effects yes': ['side effect', 'side effects']
}

# Create separate columns for each word list
for col, words in word_lists.items():
    data[col] = data['Review Text'].apply(lambda x: contains_word(x.lower(), words))

# Tokenization, removing stop words, and lemmatization
# (assuming you have NLTK library installed for stopwords and WordNetLemmatizer)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    return tokens


data['tokens'] = data['Review Text'].apply(preprocess_text)

# Define features and target
X = data['Review Text']
y = data[list(word_lists.keys())]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer(analyzer=preprocess_text)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Support Vector Machine
"""
svm = SVC(kernel='linear')
svm.fit(X_train_vec, y_train)
svm_pred = svm.predict(X_test_vec)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')
print("SVM F1 Score:", svm_f1)
"""
# XGBoost
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_vec, y_train)
xgb_pred = xgb_model.predict(X_test_vec)
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted')
print("XGBoost F1 Score:", xgb_f1)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vec, y_train)
rf_pred = rf.predict(X_test_vec)
rf_f1 = f1_score(y_test, rf_pred, average='weighted')
print("Random Forest F1 Score:", rf_f1)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_vec, y_train)
dt_pred = dt.predict(X_test_vec)
dt_f1 = f1_score(y_test, dt_pred, average='weighted')
print("Decision Tree F1 Score:", dt_f1)

# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train_vec, y_train)
knn_pred = knn.predict(X_test_vec)
knn_f1 = f1_score(y_test, knn_pred, average='weighted')
print("KNN F1 Score:", knn_f1)

# Determine the best model based on F1 scores
best_model = max(xgb_f1, dt_f1, knn_f1)
if best_model == xgb_f1:
    print("Best model: XGBoost")
if best_model == rf_f1:
    print("Best model: Random Forest")
elif best_model == dt_f1:
    print("Best model: Decision Tree")
else:
    print("Best model: K-Nearest Neighbors")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Connie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Connie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


XGBoost F1 Score: 0.730406746031746
Random Forest F1 Score: 0.4757173382173382
Decision Tree F1 Score: 0.6795176820077782
KNN F1 Score: 0.0
Best model: XGBoost
Best model: K-Nearest Neighbors
