In [1]:
import pandas as pd

# Load the datasets
data1 = pd.read_csv('C:/Users/Vaibhav/Desktop/reviews_data_dump/reviews_badminton/data1.csv')
data2 = pd.read_csv('C:/Users/Vaibhav/Desktop/reviews_data_dump/reviews_tawa/data2.csv')
data3 = pd.read_csv('C:/Users/Vaibhav/Desktop/reviews_data_dump/reviews_tea/data3.csv')


data1.head(), data2.head(), data3.head()


(            Reviewer Name               Review Title  \
 0            Kamal Suresh               Nice product   
 1       Flipkart Customer     Don't waste your money   
 2  A. S. Raja Srinivasan   Did not meet expectations   
 3     Suresh Narayanasamy                       Fair   
 4               ASHIK P A                Over priced   
 
                Place of Review  Up Votes  Down Votes     Month  \
 0   Certified Buyer, Chirakkal     889.0        64.0  Feb 2021   
 1   Certified Buyer, Hyderabad     109.0         6.0  Feb 2021   
 2  Certified Buyer, Dharmapuri      42.0         3.0  Apr 2021   
 3     Certified Buyer, Chennai      25.0         1.0       NaN   
 4                          NaN     147.0        24.0  Apr 2016   
 
                                          Review text  Ratings  
 0  Nice product, good quality, but price is now r...        4  
 1  They didn't supplied Yonex Mavis 350. Outside ...        1  
 2  Worst product. Damaged shuttlecocks packed in ...    

In [2]:
# Standardize column names across all datasets
standard_columns = ['reviewer_name', 'review_title', 'review_text', 'review_rating', 'place_of_review', 'up_votes', 'down_votes', 'date_of_review']

# Rename columns to align with the standard we've defined
data1.columns = ['reviewer_name', 'review_title', 'place_of_review', 'up_votes', 'down_votes', 'date_of_review', 'review_text', 'review_rating']
data2.columns = standard_columns
data3.columns = standard_columns

# Merge all datasets into a single DataFrame
combined_data = pd.concat([data1, data2, data3], ignore_index=True)


In [3]:

combined_data.dropna(subset=['review_text', 'review_rating'], inplace=True)


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Function for text cleaning and normalization
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Apply the preprocessing function to the review texts
combined_data['processed_review_text'] = combined_data['review_text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the processed review texts
tfidf_features = tfidf_vectorizer.fit_transform(combined_data['processed_review_text'])


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score


# Re-generate TF-IDF features AFTER cleaning
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(combined_data['processed_review_text'])

# Convert ratings into binary sentiment indicator (positive: 1, negative: 0)
combined_data['sentiment'] = combined_data['review_rating'].apply(lambda x: 1 if x > 3 else 0)

# Split the dataset into training and test sets AFTER TF-IDF features are generated
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, combined_data['sentiment'], test_size=0.2, random_state=42)

# Model Training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Model Evaluation
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
f1 = f1_score(y_test, predictions, average='weighted')
print(f"F1-Score: {f1}")


              precision    recall  f1-score   support

           0       0.85      0.42      0.56       337
           1       0.87      0.98      0.92      1365

    accuracy                           0.87      1702
   macro avg       0.86      0.70      0.74      1702
weighted avg       0.87      0.87      0.85      1702

F1-Score: 0.8516030539181589


In [11]:
from joblib import dump

# Save the trained model
dump(model, 'C:/Users/Vaibhav/Desktop/reviews_data_dump/logistic_regression_model.joblib')

# Save the TF-IDF vectorizer
dump(tfidf_vectorizer, 'C:/Users/Vaibhav/Desktop/reviews_data_dump/tfidf_vectorizer.joblib')


['C:/Users/Vaibhav/Desktop/reviews_data_dump/tfidf_vectorizer.joblib']