In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
import warnings
import nltk
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

warnings.filterwarnings("ignore")
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

df = pd.read_csv('37000_reviews_of_thread_app.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mspur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mspur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Function to handle negations
def handle_negations(text):
    negations = {"not", "no", "never", "n't", "cannot", "neither", "nor"}
    words = text.split()
    negated_text = []
    negate = False
    for word in words:
        if negate and word.isalpha():
            negated_text.append(f"not_{word}")
        else:
            negated_text.append(word)
        if word in negations or word.endswith("n't"):
            negate = True
        if word.endswith(('.', '!', '?')):
            negate = False
    return ' '.join(negated_text)

# Enhanced Preprocessing Function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Handle negations
    text = handle_negations(text)

    # Remove punctuation and numbers
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])

    # Tokenization and Lemmatization (not using Spacy)
    words = text.split()
    lemmatized = [word for word in words]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    without_stopwords = [word for word in lemmatized if word not in stop_words]

    # Rejoin words
    return ' '.join(without_stopwords)

# Apply the preprocessing to each review
df['preprocessed_reviews'] = df['review_description'].apply(preprocess_text)

In [3]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the preprocessed text data into TF-IDF vectors
X = tfidf_vectorizer.fit_transform(df['preprocessed_reviews'])

from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'source' column
onehot_encoder = OneHotEncoder()
source_encoded = onehot_encoder.fit_transform(df[['source']])

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the preprocessed text data into TF-IDF vectors
X_text = tfidf_vectorizer.fit_transform(df['preprocessed_reviews'])

# Combine the TF-IDF vectors with the one-hot encoded 'source' column
X = hstack((X_text, source_encoded))

# Assuming your target variable is in df['rating']
y = df['rating']

# Map numerical ratings to textual labels if that's not already done
df['rating_label'] = df['rating'].map({1: 'Low', 2: 'Low', 3: 'Medium', 4: 'High', 5: 'High'})

# Update y to use these new textual labels
y = df['rating_label']


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [5]:
# Initialize and fit the Decision Tree Classifier model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Metrics:")
print("Accuracy:", accuracy_dt)

class_report_dt = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:\n", class_report_dt)

Decision Tree Metrics:
Accuracy: 0.7266985473247316
Decision Tree Classification Report:
               precision    recall  f1-score   support

        High       0.77      0.86      0.81      6104
         Low       0.73      0.66      0.70      4073
      Medium       0.20      0.12      0.15       906

    accuracy                           0.73     11083
   macro avg       0.57      0.55      0.55     11083
weighted avg       0.71      0.73      0.71     11083



In [6]:
# Initialize and fit the Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)


accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Accuracy:", accuracy_rf)
class_report = classification_report(y_test, y_pred_rf)
print("Classification Report:\n", class_report)


Accuracy: 0.7756925020301363
Classification Report:
               precision    recall  f1-score   support

        High       0.80      0.90      0.84      6104
         Low       0.75      0.76      0.76      4073
      Medium       0.30      0.02      0.04       906

    accuracy                           0.78     11083
   macro avg       0.61      0.56      0.54     11083
weighted avg       0.74      0.78      0.75     11083



In [7]:
# Initialize and fit the Multinomial Naive Bayes (MultinomialNB) model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


y_pred_nb = nb_model.predict(X_test)


accuracy_nb = accuracy_score(y_test, y_pred_nb)

print("\nMultinomial Naive Bayes Metrics:")
print("Accuracy:", accuracy_nb)


class_report_nb = classification_report(y_test, y_pred_nb)
print("Multinomial Naive Bayes Classification Report:\n", class_report_nb)


Multinomial Naive Bayes Metrics:
Accuracy: 0.7757827303076784
Multinomial Naive Bayes Classification Report:
               precision    recall  f1-score   support

        High       0.79      0.91      0.84      6104
         Low       0.78      0.73      0.75      4073
      Medium       0.36      0.08      0.13       906

    accuracy                           0.78     11083
   macro avg       0.64      0.57      0.58     11083
weighted avg       0.75      0.78      0.75     11083



In [8]:
# Initialize and fit the Linear Support Vector Classifier (LinearSVC) model
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train, y_train)

y_pred_svc = svc_model.predict(X_test)


accuracy_svc = accuracy_score(y_test, y_pred_svc)
print("\nLinear Support Vector Classifier (LinearSVC) Metrics:")
print("Accuracy:", accuracy_svc)

class_report_svc = classification_report(y_test, y_pred_svc)
print("Linear Support Vector Classifier (LinearSVC) Classification Report:\n", class_report_svc)


Linear Support Vector Classifier (LinearSVC) Metrics:
Accuracy: 0.7798430027970766
Linear Support Vector Classifier (LinearSVC) Classification Report:
               precision    recall  f1-score   support

        High       0.78      0.92      0.85      6104
         Low       0.79      0.73      0.76      4073
      Medium       0.34      0.05      0.08       906

    accuracy                           0.78     11083
   macro avg       0.64      0.57      0.56     11083
weighted avg       0.75      0.78      0.75     11083

