# Google Play Reviews - Sentiment Classification & App Recommendation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the two input tables from CSV. Both paths are relative, so keep the
# files in the same folder as this notebook.
info_path = "apps_info.csv"
reviews_path = "apps_reviews.csv"

apps_info = pd.read_csv(info_path)
apps_reviews = pd.read_csv(reviews_path)

In [None]:
# Preprocess: drop rows that lack either the review text or the score, then
# keep only reviews whose text is longer than 10 characters.
apps_reviews = apps_reviews.dropna(subset=['review_text', 'review_score'])

# .copy() makes the filtered result an independent frame. Columns are added to
# it further down; without the copy that is a chained assignment on a filtered
# slice (pandas' SettingWithCopyWarning, which the blanket warnings filter at
# the top of the notebook would silently hide).
long_enough = apps_reviews['review_text'].str.len() > 10
apps_reviews = apps_reviews[long_enough].copy()

# Define sentiment based on review_score
def label_sentiment(score):
    """Map a numeric review score to a sentiment label.

    Args:
        score: The review score as a number.

    Returns:
        'positive' when the score is 4 or more, 'neutral' when it is at
        least 3 but below 4, and 'negative' otherwise (this includes values
        for which both comparisons are false, such as NaN).
    """
    if score >= 4:
        return 'positive'
    # A range check instead of ``== 3``: an exact-equality test would file
    # fractional scores such as 3.5 under 'negative'.
    if score >= 3:
        return 'neutral'
    return 'negative'

# Label every review: run label_sentiment over the review_score column and
# store the result in a new 'sentiment' column.
review_scores = apps_reviews['review_score']
apps_reviews['sentiment'] = review_scores.apply(label_sentiment)

In [None]:
# Features are the raw review strings, targets are the sentiment labels.
X, y = apps_reviews['review_text'], apps_reviews['sentiment']

# TF-IDF settings: cap the vocabulary at 5000 terms and drop English stop words.
tfidf_options = {'max_features': 5000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from every review in X and encode them in one pass.
X_vec = vectorizer.fit_transform(X)

In [None]:
# Train/test split and model training
#
# Split the raw review text first and fit TF-IDF on the training portion only.
# A vectorizer fitted on every review before the split lets the held-out
# reviews shape the vocabulary and IDF weights, which leaks test information
# into the features and makes the evaluation look better than it is.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# (Re)fit on training text; the test text is only transformed, never fitted on.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

In [None]:
# Evaluate model: predict on the held-out features and print per-class metrics.
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
# Pin the row/column order to model.classes_ and show those names on both
# axes. Without explicit tick labels the heatmap only shows positional indices
# (0, 1, 2), so the cells cannot be matched to sentiment classes.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")  # columns of the matrix
plt.ylabel("Actual")     # rows of the matrix
plt.show()

In [None]:
# Recommend top apps based on positive sentiment
is_positive = apps_reviews['sentiment'] == 'positive'
positive_reviews = apps_reviews[is_positive]

# Per app: number of positive reviews and the sum of their helpful_count.
per_app_stats = positive_reviews.groupby('app_id').agg(
    positive_review_count=('review_text', 'count'),
    helpful_count=('helpful_count', 'sum'),
)
recommendation_df = per_app_stats.reset_index()

# Join the app table on app_id, then rank by positive review count with
# helpful_count as the secondary key (both descending) and keep the first 10.
recommendation_df = recommendation_df.merge(apps_info, on='app_id')
ranking_keys = ['positive_review_count', 'helpful_count']
ranked_apps = recommendation_df.sort_values(by=ranking_keys, ascending=False)
top_apps = ranked_apps.head(10)

print("Top Recommended Apps Based on Positive Sentiment:")
display_columns = ['app_id', 'app_name', 'positive_review_count', 'helpful_count']
print(top_apps[display_columns])

In [None]:
# Write the top_apps table to a CSV file, leaving out the DataFrame index.
output_path = "recommended_apps.csv"
top_apps.to_csv(output_path, index=False)