# Importing libraries

In [None]:
import nltk
import pandas as pd
import re
import string
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Loading and preprocessing dataset

In [19]:

# Load the dataset
df = pd.read_json("C:/project-II/NLP task/archive/Cell_Phones_and_Accessories_5.json", lines = True)
print(df.columns)
# Display the first few rows
df.head()

df = df[['reviewText', 'overall']].dropna()

#df = pd.DataFrame(documents, columns=['review', 'sentiment'])

# Convert sentiment labels to numerical values
df['sentiment'] = df['overall'].apply(lambda x: 1 if x >= 3 else 0)

# Rename 'reviewText' to 'review' for consistency
df.rename(columns={'reviewText': 'review'}, inplace=True)

# Text preprocessing function
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = text.split()  # Tokenize
    stop_words = set(stopwords.words('english'))  # Get stopwords
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

#nltk.download('stopwords')

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

df[['review', 'cleaned_review']].head()


Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')


Unnamed: 0,review,cleaned_review
0,They look good and stick good! I just don't li...,look good stick good dont like rounded shape a...
1,These stickers work like the review says they ...,stickers work like review says stick great sta...
2,These are awesome and make my phone look so st...,awesome make phone look stylish used one far a...
3,Item arrived in great time and was in perfect ...,item arrived great time perfect condition howe...
4,"awesome! stays on, and looks great. can be use...",awesome stays looks great used multiple apple ...


# Feature Extraction

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Convert text data into TF-IDF features
X = tfidf_vectorizer.fit_transform(df['cleaned_review'])

# Convert labels to NumPy array
y = df['sentiment'].values

# Print shape of TF-IDF matrix
print("TF-IDF Feature Matrix Shape:", X.shape)


TF-IDF Feature Matrix Shape: (194439, 5000)


# Train test splitting

In [21]:
from sklearn.model_selection import train_test_split

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print sizes of train and test sets
print(f"Training data: {X_train.shape}, Testing data: {X_test.shape}")


Training data: (155551, 5000), Testing data: (38888, 5000)


# Implementing logistic regression model

In [22]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)


# Metrices calculation

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9157
Precision: 0.9274
Recall: 0.9803

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.47      0.58      4890
           1       0.93      0.98      0.95     33998

    accuracy                           0.92     38888
   macro avg       0.85      0.72      0.77     38888
weighted avg       0.91      0.92      0.91     38888



# Tuning hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters
param_grid = {'C': [0.1, 1, 10, 100]}

# Run Grid Search
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

# Train best model
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)

print("Tuned Logistic Regression Performance:")
print(classification_report(y_test, y_pred_best))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Tuned Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.72      0.51      0.60      4890
           1       0.93      0.97      0.95     33998

    accuracy                           0.91     38888
   macro avg       0.83      0.74      0.78     38888
weighted avg       0.91      0.91      0.91     38888



# testing with curated reviews

In [25]:
curated_reviews = [
    "The mobile phone was fantastic, I loved it!",  # Positive
    "waste of money,  waste of time.",  # Negative
    "Not bad, but could have been better.",  # Neutral
]

curated_labels = [1, 0, 0]  # 1=positive, 0=negative

# Preprocess and extract features
curated_reviews_cleaned = [preprocess_text(review) for review in curated_reviews]
curated_reviews_tfidf = tfidf_vectorizer.transform(curated_reviews_cleaned)

# Predictions
curated_predictions = model.predict(curated_reviews_tfidf)
print(f"Curated Sample Predictions: {curated_predictions}")


Curated Sample Predictions: [1 0 0]
