In [31]:


import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("/googleplaystore_user_reviews.csv")
df = df[['Translated_Review', 'Sentiment']]
df = df.dropna()

# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['clean_review'] = df['Translated_Review'].apply(clean_text)

# Encode labels
df['Sentiment'] = df['Sentiment'].map({'Negative':0, 'Neutral':1, 'Positive':2})
df = df.dropna()
df['Sentiment'] = df['Sentiment'].astype(int)

print("Total samples:", len(df))

# Build X and y
X = df['clean_review']
y = df['Sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate model
y_pred = model.predict(X_test_tfidf)
print("\nCLASSIFICATION REPORT:\n")
print(classification_report(y_test, y_pred))

# Create output dataset for dashboard
label_map = {0:'Negative', 1:'Neutral', 2:'Positive'}

results = pd.DataFrame({
    'Review': X_test.values,
    'Actual_Sentiment': [label_map[i] for i in y_test],
    'Predicted_Sentiment': [label_map[i] for i in y_pred]
})

results.to_csv("/content/sentiment_analysis_output.csv", index=False)
print("\nSaved: sentiment_analysis_output.csv")

# Download
from google.colab import files
files.download("/googleplaystore_user_reviews.csv")


Total samples: 37427

CLASSIFICATION REPORT:

              precision    recall  f1-score   support

           0       0.90      0.79      0.84      1653
           1       0.84      0.82      0.83      1049
           2       0.91      0.95      0.93      4784

    accuracy                           0.90      7486
   macro avg       0.88      0.85      0.87      7486
weighted avg       0.90      0.90      0.90      7486


Saved: sentiment_analysis_output.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>