## **Using the merged dataset to train the Logistic Model**

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [16]:
merged_data = pd.read_csv('../data/clean/merged-labeled/merged_labeled_dataset.csv')

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    merged_data['cleaned_text'], merged_data['Sentiment'], test_size=0.3, random_state=42
)

In [18]:
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 5361
Testing set size: 2298


### **Use TF-IDF Vectorization to convert text into numerical features for model training**

In [19]:
vectorizer = TfidfVectorizer(max_features=5000)

In [20]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [21]:
print("TF-IDF matrix shape:", X_train_tfidf.shape)

TF-IDF matrix shape: (5361, 5000)


In [22]:
print(X_train_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 254699 stored elements and shape (5361, 5000)>
  Coords	Values
  (0, 487)	0.12507101479186025
  (0, 4423)	0.29708779096138177
  (0, 4497)	0.26655687536655615
  (0, 4914)	0.07739083357013853
  (0, 4410)	0.1180629774066455
  (0, 3833)	0.13668469644756756
  (0, 157)	0.10994206037371324
  (0, 969)	0.2168866737884808
  (0, 1687)	0.1829111969214513
  (0, 171)	0.1304572481811051
  (0, 405)	0.15853247669262374
  (0, 3314)	0.28856890167997534
  (0, 4403)	0.1323921472773326
  (0, 2300)	0.12338095201936983
  (0, 1233)	0.1391518628206721
  (0, 2337)	0.17804247128521675
  (0, 4878)	0.20017743941398908
  (0, 4895)	0.2447147257378281
  (0, 357)	0.10980936987746787
  (0, 4713)	0.12464224431838161
  (0, 2252)	0.12759938433910809
  (0, 1666)	0.13520223416412872
  (0, 568)	0.0927171011849157
  (0, 813)	0.19206527301652634
  (0, 2131)	0.07552573143213122
  :	:
  (5358, 3520)	0.09567541496434996
  (5358, 4907)	0.09462009282092182
  (5358, 4225)	

### **Use SMOTE to oversample the minority classes in the training data.**

In [23]:
from imblearn.over_sampling import SMOTE

In [24]:
smote = SMOTE(random_state=42)

In [25]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [26]:
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution after SMOTE:
Sentiment
Positive    4420
Neutral     4420
Negative    4420
Name: count, dtype: int64


In [27]:
model = LogisticRegression()

In [28]:
model.fit(X_train_tfidf, y_train)

In [29]:
y_pred = model.predict(X_test_tfidf)

In [30]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.79      0.45      0.57       148
     Neutral       0.65      0.13      0.21       219
    Positive       0.88      0.99      0.93      1931

    accuracy                           0.87      2298
   macro avg       0.77      0.52      0.57      2298
weighted avg       0.85      0.87      0.84      2298

Accuracy Score: 0.8746736292428199


### **Fine-Tune the Logistic Regression Model**

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced']
}


In [33]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring='f1_macro', cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

In [34]:
print("Best parameters:", grid_search.best_params_)

Best parameters: {'C': 10, 'class_weight': 'balanced', 'solver': 'lbfgs'}


In [35]:
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test_tfidf)

In [36]:
print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_tuned))

Tuned Classification Report:
              precision    recall  f1-score   support

    Negative       0.48      0.81      0.60       148
     Neutral       0.36      0.41      0.38       219
    Positive       0.94      0.88      0.91      1931

    accuracy                           0.83      2298
   macro avg       0.59      0.70      0.63      2298
weighted avg       0.86      0.83      0.84      2298



# **Test**

In [37]:
def preprocess_input(text, vectorizer):
    # Transform the input text using the TF-IDF vectorizer
    return vectorizer.transform([text])  # Transform expects a list of strings


In [38]:
def predict_sentiment(text, model, vectorizer):
    # Preprocess the input text
    processed_text = preprocess_input(text, vectorizer)
    # Predict sentiment using the Logistic Regression model
    predicted_class = model.predict(processed_text)[0]
    return predicted_class


In [39]:
# X_train: Feature matrix (e.g., TF-IDF or embeddings)
# y_train: Target labels

# Check class distribution
from collections import Counter
print("Class distribution before oversampling:", Counter(y_train))


Class distribution before oversampling: Counter({'Positive': 4420, 'Neutral': 480, 'Negative': 461})


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Use a maximum number of features to avoid memory issues

# Transform the text data
X_train_tfidf = vectorizer.fit_transform(X_train)


In [41]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the TF-IDF-transformed data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Check the class distribution after oversampling
print("Class distribution after SMOTE:", Counter(y_train_resampled))


Class distribution after SMOTE: Counter({'Positive': 4420, 'Neutral': 4420, 'Negative': 4420})


In [42]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)


In [43]:
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.43      0.86      0.57       148
     Neutral       0.35      0.50      0.41       219
    Positive       0.96      0.84      0.90      1931

    accuracy                           0.81      2298
   macro avg       0.58      0.73      0.63      2298
weighted avg       0.87      0.81      0.83      2298



In [49]:
import joblib

# Save the model
joblib.dump(model, '../models/logistic_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [50]:
import joblib

# Load the trained Logistic Regression model
model = joblib.load('../models/logistic_model.pkl')

# Load the trained TF-IDF vectorizer
vectorizer = joblib.load('../models/tfidf_vectorizer.pkl')


In [63]:
class_labels = {'Negative': 'Negative', 'Neutral': 'Neutral', 'Positive': 'Positive'}



# Input string to test
input_text =  "decent"

# Predict sentiment
predicted_class = predict_sentiment(input_text, model, vectorizer)
print(f"Predicted Sentiment: {class_labels[predicted_class]}")


Predicted Sentiment: Negative


In [66]:

# Test the model
test_reviews = ["The product is great!", "Terrible quality, not worth the money."]
test_vectors = vectorizer.transform(test_reviews)
predictions = model.predict(test_vectors)

print("Predictions:", predictions)  # Output should be labels (e.g., 0 for negative, 1 for positive)


Predictions: ['Positive' 'Negative']
