## **Using the merged dataset to train the Logistic Model**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
merged_data = pd.read_csv('../data/clean/merged-labeled/final_dataset.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    merged_data['cleaned_text'], merged_data['sentiment'], test_size=0.3, random_state=42
)

In [4]:
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 8287
Testing set size: 3552


In [5]:
print(X_train.isnull().sum())  # Check for NaN values in the training set
print(X_test.isnull().sum())

2
0


In [6]:
X_train = X_train.dropna()
X_test = X_test.dropna()

In [7]:
print(X_train.isnull().sum())  # Check for NaN values in the training set
print(X_test.isnull().sum())

0
0


### **Use TF-IDF Vectorization to convert text into numerical features for model training**

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [9]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

In [10]:
print("TF-IDF matrix shape:", X_train_tfidf.shape)

TF-IDF matrix shape: (8285, 5000)


In [11]:
print(X_train_tfidf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### **Compute GloVe Embeddings**

In [12]:
import numpy as np
from gensim.models import KeyedVectors

In [13]:
glove_file = "../data/glove/glove.6B.100d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [14]:
def get_embedding(text):
    words = text.split()
    word_vectors = [glove_model[word] for word in words if word in glove_model]
    if word_vectors:
        return sum(word_vectors) / len(word_vectors)  # Average vector
    return np.zeros(100)  # Return a zero vector if no words are in GloVe

In [15]:
X_train_glove = np.array([get_embedding(text) for text in X_train])
X_test_glove = np.array([get_embedding(text) for text in X_test])

### **Normalize Features**

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Normalize TF-IDF features
scaler_tfidf = StandardScaler()
X_train_tfidf_scaled = scaler_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_scaled = scaler_tfidf.transform(X_test_tfidf)

In [18]:
# Normalize GloVe features
scaler_glove = StandardScaler()
X_train_glove_scaled = scaler_glove.fit_transform(X_train_glove)
X_test_glove_scaled = scaler_glove.transform(X_test_glove)

Combine Features with Weights

In [19]:
alpha = 0.7  # Weight for TF-IDF
beta = 0.3   # Weight for GloVe

In [20]:
X_train_combined = np.hstack([alpha * X_train_tfidf_scaled, beta * X_train_glove_scaled])
X_test_combined = np.hstack([alpha * X_test_tfidf_scaled, beta * X_test_glove_scaled])

### **Use SMOTE to oversample the minority classes in the training data.**

In [21]:
from imblearn.over_sampling import SMOTE

In [22]:
smote = SMOTE(random_state=42)

In [23]:
y_train_aligned = y_train[:X_train_combined.shape[0]]

In [24]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train_aligned)

In [25]:
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution after SMOTE:
sentiment
Positive    6825
Neutral     6825
Negative    6825
Name: count, dtype: int64


In [26]:
model = LogisticRegression()

In [None]:
model.fit(X_train_tfidf, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [8285, 8287]

: 

In [None]:
y_pred = model.predict(X_test_tfidf)

In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

### **Fine-Tune the Logistic Regression Model**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced']
}


In [None]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring='f1_macro', cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

In [None]:
print("Best parameters:", grid_search.best_params_)

In [None]:
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test_tfidf)

In [None]:
print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_tuned))

# **Test**

In [None]:
def preprocess_input(text, vectorizer):
    # Transform the input text using the TF-IDF vectorizer
    return vectorizer.transform([text])  # Transform expects a list of strings


In [None]:
def predict_sentiment(text, model, vectorizer):
    # Preprocess the input text
    processed_text = preprocess_input(text, vectorizer)
    # Predict sentiment using the Logistic Regression model
    predicted_class = model.predict(processed_text)[0]
    return predicted_class


In [None]:
# X_train: Feature matrix (e.g., TF-IDF or embeddings)
# y_train: Target labels

# Check class distribution
from collections import Counter
print("Class distribution before oversampling:", Counter(y_train))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Use a maximum number of features to avoid memory issues

# Transform the text data
X_train_tfidf = vectorizer.fit_transform(X_train)


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the TF-IDF-transformed data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Check the class distribution after oversampling
print("Class distribution after SMOTE:", Counter(y_train_resampled))


In [None]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)


In [None]:
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
import joblib

# Save the model
joblib.dump(model, '../models/logistic_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")


In [None]:
import joblib

# Load the trained Logistic Regression model
model = joblib.load('../models/logistic_model.pkl')

# Load the trained TF-IDF vectorizer
vectorizer = joblib.load('../models/tfidf_vectorizer.pkl')


In [None]:
class_labels = {'Negative': 'Negative', 'Neutral': 'Neutral', 'Positive': 'Positive'}



# Input string to test
input_text =  "decent"

# Predict sentiment
predicted_class = predict_sentiment(input_text, model, vectorizer)
print(f"Predicted Sentiment: {class_labels[predicted_class]}")


In [None]:

# Test the model
test_reviews = ["The bottle looks decent and weight wise also looks fine however there is a major design flaw in the bottle which makes it extremely vulnerable to cracks. The bottom base seems to be very loosely joint from the upper body and even if a very small fall is there, we see joints opening causing bottle to leak."]
test_vectors = vectorizer.transform(test_reviews)
predictions = model.predict(test_vectors)

print("Predictions:", predictions)  # Output should be labels (e.g., 0 for negative, 1 for positive)
