In [5]:
import json
import pandas as pd

# Path to your JSON file
json_file_path = 'C:\\Users\\USER\\Dropbox\\PC\\Downloads\\reviews.json'

# Read JSON file
with open(json_file_path, 'r') as json_file:
    json_data = json.load(json_file)

# Convert JSON data to a Pandas DataFrame
df = pd.DataFrame(json_data)

# Display the DataFrame
print(df)


             Clothing ID Age  \
0          0         767  33   
1          1        1080  34   
2          2        1077  60   
3          3        1049  50   
4          4         847  47   
...      ...         ...  ..   
23481  23481        1104  34   
23482  23482         862  48   
23483  23483        1104  31   
23484  23484        1084  28   
23485  23485        1104  52   

                                                   Title  \
0                                                          
1                                                          
2                                Some major design flaws   
3                                       My favorite buy!   
4                                       Flattering shirt   
...                                                  ...   
23481                     Great dress for many occasions   
23482                         Wish it was made of cotton   
23483                              Cute, but see through   
23484  Very cut

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


True

In [7]:
# Preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

# Apply preprocessing function to 'Review Text' column in the existing DataFrame 'df'
df['Review Text Cleaned'] = df['Review Text'].apply(preprocess_text)

# Display the DataFrame with the cleaned text
print(df[['Review Text', 'Review Text Cleaned']])

                                             Review Text  \
0      Absolutely wonderful - silky and sexy and comf...   
1      Love this dress!  it's sooo pretty.  i happene...   
2      I had such high hopes for this dress and reall...   
3      I love, love, love this jumpsuit. it's fun, fl...   
4      This shirt is very flattering to all due to th...   
...                                                  ...   
23481  I was very happy to snag this dress at such a ...   
23482  It reminds me of maternity clothes. soft, stre...   
23483  This fit well, but the top was very see throug...   
23484  I bought this dress for a wedding i have this ...   
23485  This dress in a lovely platinum is feminine an...   

                                     Review Text Cleaned  
0            absolutely wonderful silky sexy comfortable  
1      love dress sooo pretty happened find store im ...  
2      high hope dress really wanted work initially o...  
3      love love love jumpsuit fun flirty f

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Convert 'Rating' column to numeric values
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')  # Convert non-convertible values to NaN

# Drop rows with NaN values in 'Rating' column
df.dropna(subset=['Rating'], inplace=True)

# Data Labeling based on 'Rating' column
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x >= 3 else 'negative')

# Data Splitting
X = df['Review Text']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
# Bag of Words Vectorization
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model Selection and Training (using Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Model Evaluation
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

    negative       0.53      0.37      0.43       470
    positive       0.93      0.96      0.95      4228

    accuracy                           0.90      4698
   macro avg       0.73      0.67      0.69      4698
weighted avg       0.89      0.90      0.90      4698



In [33]:
# Sample text review for testing
test_review = "This jumpsuit runs large so i sized down perfectly one full size down. i have a size b chest and was very comfortable with no bra, as the built in cups gave me the perfect coverage. it looked like a dress standing, but had such a flattering and fun shape when you move. everyone loved it!"

# Preprocess the input review
processed_review = preprocess_text(test_review)

# Vectorize the preprocessed review
processed_review_vec = vectorizer.transform([processed_review])

# Make predictions using the trained model
predicted_sentiment = model.predict(processed_review_vec)

# Print the predicted sentiment
print(f"Predicted Sentiment: {predicted_sentiment[0]}")


Predicted Sentiment: positive
