In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset file is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Location of the training CSV on the mounted drive (hard-coded; adjust when running elsewhere).
path = "/content/drive/MyDrive/google_play_store_apps_reviews_training.csv"
# Load the reviews; later cells read the 'package_name', 'review' and 'polarity' columns.
data = pd.read_csv(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def preprocess_data(data):
    """Return a cleaned copy of the reviews frame.

    The 'package_name' column is removed when present, and every entry of the
    'review' column is stripped of surrounding whitespace and lower-cased.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string 'review' column and, optionally, 'package_name'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If the 'review' column is missing.
    """
    # errors='ignore' keeps the function idempotent: feeding an already
    # preprocessed frame back in (e.g. when a notebook cell is re-run) no
    # longer fails on the missing 'package_name' column.
    without_package = data.drop(columns='package_name', errors='ignore')
    # assign() builds a new frame, so neither the input nor the intermediate
    # frame is modified in place.
    return without_package.assign(
        review=lambda frame: frame['review'].str.strip().str.lower()
    )

In [None]:
# Clean the raw frame. `data` is rebound to the cleaned result, so re-running
# this cell passes already-preprocessed data back through preprocess_data.
data = preprocess_data(data)
# Hold out 25% of the reviews for testing; stratifying on polarity keeps the
# class balance equal in both splits, and the fixed seed makes the split repeatable.
x, x_test, y, y_test = train_test_split(
    data['review'],
    data['polarity'],
    stratify=data['polarity'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Bag-of-words encoder; stop_words='english' filters scikit-learn's built-in English stop words.
vec = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training reviews only, then densify with
# .toarray() (presumably because the hand-written classifier works on plain NumPy arrays).
x = vec.fit_transform(x).toarray()
# Encode the held-out reviews with the vocabulary learned above (no re-fitting).
x_test = vec.transform(x_test).toarray()
# NOTE(review): x and x_test are rebound from text to count matrices here, so this
# cell presumably cannot be re-run on its own without re-running the split cell first.


In [None]:
# Peek at the dense training count matrix (NumPy abbreviates large arrays when displayed).
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
class MultinomialNB:
    """Multinomial naive Bayes classifier with additive smoothing.

    Parameters
    ----------
    alpha : float, default=1.0
        Smoothing constant added to every per-class feature count.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels found in ``y``.
    class_prior_ : ndarray of shape (n_classes,)
        Fraction of training samples that belong to each class.
    feature_counts : ndarray of shape (n_classes, n_features)
        Per-class sum of every feature column.
    feature_totals : ndarray of shape (n_classes,)
        Per-class sum over all features (row sums of ``feature_counts``).
    feature_log_prob_ : ndarray of shape (n_classes, n_features)
        Smoothed log P(feature | class).
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature log-probabilities.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Non-negative feature counts (dense array).
        y : array-like of shape (n_samples,)
            Class label of each row of ``X``; any labels ``np.unique`` can sort.
        """
        # Work on a plain array so `y == c` is always an element-wise,
        # positional mask (a Python list would compare to a single bool).
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        self.class_prior_ = np.zeros(n_classes)
        self.feature_counts = np.zeros((n_classes, n_features))
        for i, c in enumerate(self.classes_):
            in_class = y == c
            # Prior: share of the training samples labelled c.
            self.class_prior_[i] = np.sum(in_class) / n_samples
            # How often each feature occurs across the samples of class c.
            self.feature_counts[i, :] = np.sum(X[in_class, :], axis=0)

        # Total count of all features within each class (one value per class).
        self.feature_totals = np.sum(self.feature_counts, axis=1)

        # log((count + alpha) / (class_total + alpha * n_features))
        self.feature_log_prob_ = (np.log(self.feature_counts + self.alpha)
                                  - np.log(self.feature_totals.reshape(-1, 1)
                                           + self.alpha * n_features))

    def predict_log_proba(self, X):
        """Return the joint log-likelihood of every sample under every class.

        Column ``i`` holds ``log P(class i) + sum_j X[:, j] * log P(feature j | class i)``.
        The rows are NOT normalised, so they are log-probabilities only up to
        a per-row additive constant; their argmax is the predicted class.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
        """
        # One matrix product replaces the per-class multiply-and-sum loop;
        # the log-priors broadcast across the rows.
        return np.log(self.class_prior_) + X @ self.feature_log_prob_.T

    def predict(self, X):
        """Return the most likely class label for each row of ``X``."""
        best_column = np.argmax(self.predict_log_proba(X), axis=1)
        # argmax yields column positions; map them through classes_ so the
        # original labels come back even when they are not 0..n_classes-1.
        return self.classes_[best_column]
    
# Train the from-scratch classifier on the dense training counts, using the default alpha=1.0 smoothing.
model = MultinomialNB()
model.fit(x, y)

In [None]:
# Test-set accuracy: share of held-out reviews whose predicted polarity equals the true label.
score = (model.predict(x_test) == y_test).mean()
score

0.8565022421524664

In [None]:
# Classify one hand-written review with the trained model; a prediction of 1 is reported as positive.
predict = model.predict(vec.transform(['very smooth interface']).toarray())

print("positive review" if predict[0] == 1 else "negative review")

positive review
