In [None]:
import json
import pandas as pd
import nltk
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load a capped sample of Yelp reviews. Each line of the file is parsed as one
# JSON object, so it is read line by line instead of loading everything at once.
REVIEWS_PATH = "/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json"
MAX_REVIEWS = 50000

data = []
# `with` closes the handle even if a line fails to parse; the encoding is made
# explicit so the result does not depend on the platform default.
with open(REVIEWS_PATH, encoding="utf-8") as data_file:
    for line in data_file:
        # Test the cap BEFORE appending so exactly MAX_REVIEWS rows are kept
        # (checking after the append kept one row too many).
        if len(data) >= MAX_REVIEWS:
            break
        data.append(json.loads(line))

# Create a DataFrame from the 'data' list
review_df = pd.DataFrame(data)

stratify_column = 'stars'

# Single 80/20 split that preserves the star-rating proportions in both parts.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(stratified_split.split(review_df, review_df[stratify_column]))

# split() yields positional indices, hence .iloc; .copy() makes the subsets
# independent frames so adding columns to them later is unambiguous.
stratified_train_df = review_df.iloc[train_index].copy()
stratified_test_df = review_df.iloc[test_index].copy()
print(stratified_test_df.shape)

In [None]:
# Textual overview of both subsets: preview rows, then shapes, then the
# summary statistics of the 'stars' column.
subsets = (("Train", stratified_train_df), ("Test", stratified_test_df))

# First few rows of each subset
for subset_name, subset_df in subsets:
    print(f"{subset_name} Subset:")
    print(subset_df.head())

# Row/column counts of each subset
for subset_name, subset_df in subsets:
    print(f"{subset_name} Subset Shape:", subset_df.shape)

# describe() of the star ratings in each subset
for subset_name, subset_df in subsets:
    print(f"{subset_name} Subset Statistics:")
    print(subset_df['stars'].describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Star-rating counts for the train and test subsets, drawn side by side
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 5))

sns.countplot(x='stars', data=stratified_train_df, ax=ax_train)
ax_train.set_title('Train Subset - Distribution of Stars')

sns.countplot(x='stars', data=stratified_test_df, ax=ax_test)
ax_test.set_title('Test Subset - Distribution of Stars')

fig.tight_layout()
plt.show()

# Numeric columns are picked from the train subset and reused for the test
# subset so both correlation matrices cover the same columns.
numeric_columns = list(stratified_train_df.select_dtypes(include='number').columns)
train_corr = stratified_train_df[numeric_columns].corr()
test_corr = stratified_test_df[numeric_columns].corr()

# Annotated correlation heatmaps, one panel per subset
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(train_corr, annot=True, cmap='coolwarm', ax=ax_train)
ax_train.set_title('Train Subset - Correlation Heatmap')

sns.heatmap(test_corr, annot=True, cmap='coolwarm', ax=ax_test)
ax_test.set_title('Test Subset - Correlation Heatmap')

fig.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud

# Concatenate every review of a subset into one space-separated string
train_text = ' '.join(stratified_train_df['text'])
test_text = ' '.join(stratified_test_df['text'])

# Both clouds share the same rendering options
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud_train = WordCloud(**cloud_options).generate(train_text)
wordcloud_test = WordCloud(**cloud_options).generate(test_text)

# Show the two clouds next to each other with the axes hidden
fig, cloud_axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (wordcloud_train, 'Word Cloud - Train Subset'),
    (wordcloud_test, 'Word Cloud - Test Subset'),
)
for ax, (cloud, panel_title) in zip(cloud_axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: the stopword list and the Punkt
# tokenizer data that word_tokenize relies on.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.8.2+ as far as I know) load the Punkt
# model from 'punkt_tab' rather than 'punkt'; without it word_tokenize raises
# LookupError. On older releases this id is expected to be unknown, in which
# case download() should only report the failure and return False -- confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

# A set gives O(1) membership tests when filtering tokens
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw review string into a list of lowercase, non-stopword tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, and drop tokens found in the module-level
    ``stop_words`` set.

    Args:
        text: The review text as a string.

    Returns:
        A list of token strings (possibly empty).
    """
    # Punctuation is deleted (not replaced by a space) before tokenizing
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return [tok for tok in word_tokenize(cleaned) if tok not in stop_words]

# Add a 'processed_text' column (token lists) to both subsets
for subset_df in (stratified_train_df, stratified_test_df):
    subset_df['processed_text'] = subset_df['text'].apply(preprocess_text)

In [None]:
from gensim.models import Word2Vec

# Train one Word2Vec model per subset on the tokenized reviews.
# NOTE(review): the two models are fitted independently, so their vector
# spaces are not aligned -- vectors from model_test should not be compared
# with, or fed to anything fitted on, vectors from model_train.
model_train = Word2Vec(sentences=stratified_train_df['processed_text'], vector_size=100, window=5, min_count=1, workers=4)
model_test = Word2Vec(sentences=stratified_test_df['processed_text'], vector_size=100, window=5, min_count=1, workers=4)

# Accessing word vectors
word_vectors_train = model_train.wv
word_vectors_test = model_test.wv

# Sanity-check the training embeddings with a probe word. Looking up a token
# that is not in the vocabulary raises KeyError, so guard the lookup instead
# of letting the whole cell fail.
probe_word = 'word'
if probe_word in word_vectors_train:
    # Getting word vector for a specific word
    vector = word_vectors_train[probe_word]
    # Finding similar words
    similar_words = word_vectors_train.most_similar(probe_word, topn=5)
else:
    vector = None
    similar_words = []


In [None]:
# Show the nearest-neighbour list computed for the probe word above
print(similar_words)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

def transform_text_to_vectors(text, word_vectors):
    """Embed each tokenized document as the mean of its known word vectors.

    Args:
        text: Iterable of documents, each an iterable of token strings.
        word_vectors: Lookup object supporting ``token in word_vectors``,
            ``word_vectors[token]`` and a ``vector_size`` attribute
            (e.g. the ``.wv`` of a trained Word2Vec model).

    Returns:
        A 2-D array with one row per document and ``vector_size`` columns.
        A document with no in-vocabulary token gets an all-zero row. Empty
        input yields an array of shape ``(0, vector_size)`` so the result is
        always 2-D.
    """
    vectorized_text = []
    for sentence in text:
        # Tokens missing from the vocabulary are skipped
        sentence_vectors = [word_vectors[word] for word in sentence if word in word_vectors]
        if sentence_vectors:
            vectorized_text.append(np.mean(sentence_vectors, axis=0))
        else:
            # If no word in the sentence is in the word_vectors, append zeros
            vectorized_text.append(np.zeros(word_vectors.vector_size))
    if not vectorized_text:
        # np.array([]) would be 1-D with shape (0,); keep the column dimension
        return np.empty((0, word_vectors.vector_size))
    return np.array(vectorized_text)

# Transforming text data to numerical vectors.
# Both subsets are embedded with the vectors learned from the TRAINING text:
# the classifier is fitted on that embedding space, so test reviews must be
# mapped into the same space. Vectors from a separately trained model are not
# comparable with it, and fitting anything on test text would leak test data.
X_train = transform_text_to_vectors(stratified_train_df['processed_text'], word_vectors_train)
X_test = transform_text_to_vectors(stratified_test_df['processed_text'], word_vectors_train)

y_train = stratified_train_df['stars']
y_test = stratified_test_df['stars']

# Initialize Decision Tree Classifier (fixed seed for repeatable tie-breaking)
clf = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree model
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy}")


In [None]:
# # Decision Tree with parameters tuned

# from sklearn.model_selection import GridSearchCV

# # Define the hyperparameters grid to search
# param_grid = {
#     'max_depth': [None, 5, 10, 15],  # Adjust these values as needed
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
#     # Add more hyperparameters and their values to explore
# }

# # Initialize Decision Tree Classifier
# clf = DecisionTreeClassifier(random_state=42)

# # Initialize GridSearchCV with the defined hyperparameters and cross-validation
# grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Get the best parameters and the best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print(f"Best Parameters: {best_params}")
# print(f"Best Accuracy Score: {best_score}")

# # Use the best estimator found by GridSearchCV to make predictions
# best_clf = grid_search.best_estimator_
# y_pred = best_clf.predict(X_test)

# # Calculate accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Decision Tree Accuracy after Hyperparameter Tuning: {accuracy}")

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest with a fixed seed and fit it in one expression;
# fit() returns the estimator itself, so rf_clf is the trained model.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out reviews and report plain accuracy
rf_y_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, title):
    """Draw a confusion-matrix heatmap whose ticks show the real class labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        title: Title placed above the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Sorted union of the labels seen in either input. Passing it explicitly
    # fixes the row/column order, and the same list labels the ticks so the
    # axes read as class values instead of positional indices 0..k-1.
    labels = np.union1d(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)
    plt.show()

# Evaluate both classifiers the same way: header, confusion-matrix figure,
# then the per-class classification report. The second header is preceded by
# a blank line to separate it from the first report.
evaluations = (
    ("Decision Tree", y_pred, ""),
    ("Random Forest", rf_y_pred, "\n"),
)
for model_name, predictions, leading in evaluations:
    print(f"{leading}Evaluation metrics for {model_name} Classifier:")
    print("------------------------------------------------")

    # Confusion matrix and metrics
    plot_confusion_matrix(y_test, predictions, title=f"{model_name} Confusion Matrix")
    print("Classification Report:")
    # zero_division=0 reports 0 for classes that were never predicted
    print(classification_report(y_test, predictions, zero_division=0))