# Plan of Action

1. Load Play Store App Reviews dataset (12,000 reviews)
2. Pre-process dataset by removing special characters, numbers, etc. from user reviews + convert sentiment labels positive & negative to numbers 1 & 0, respectively
3. Import GloVe Word Embedding to build Embedding Dictionary + Use this to build Embedding Matrix for our Corpus
4. Model Training using Deep Learning in Keras for separate: Simple Neural Net, CNN and LSTM Models and analyse model performance and results
5. Last, perform predictions on real App Store reviews

# New Section

**Installing Libraries**

In [None]:
!pip install google_trans_new
!pip install textblob
!pip install googletrans==3.1.0a0
!pip install transformers
!pip install contractions
!pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Importing Libraries**

In [None]:
from textblob import TextBlob
from  nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import contractions
import pandas as pd
import numpy as np
import re
import nltk.corpus
import string
import googletrans
from nltk.corpus import words
from googletrans import Translator
from nltk.corpus import stopwords
#Visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
#model library
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from collections import Counter
import lightgbm as lgb  

 **Creating Objects**

In [None]:
from nltk.tokenize import WhitespaceTokenizer
tk = WhitespaceTokenizer()
#nlp = English()
strin = string.punctuation
translator = googletrans.Translator()
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
stop_words.remove('not')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Loading File**

In [None]:
#rest will convert the file into dataframes
rest = pd.read_csv('/content/sample_data/reviews.csv')

In [None]:
#validating the first 5 record in the dataframes
rest.shape

(12495, 12)

In [None]:
rest['pred'] = rest.score.apply(lambda x: 0 if x in [1, 2,3] else 1)

MODEL IMPLEMENTATION

In [None]:
from sklearn.model_selection import StratifiedKFold
X_train, X_test, y_train, y_test = train_test_split(rest.content,rest.pred, test_size=0.2, random_state=1,stratify= rest.pred)

**Data Pre-Processing**

In [None]:
def clean_text(Reviews):
  new_reviews = translator.translate(Reviews,dest='en', src='auto').text
  #src.append(translator.translate(Reviews).src)
  Reviews = ''.join([contractions.fix(new_reviews)])
  text = Reviews.lower()
  punct = ''.join([ch for ch in text if ch not in string.punctuation])
  punct = re.sub(r'[^\w\s]+',"",punct)
  remove_number = re.sub('\d+','',punct)
  repeat_text = re.sub(r'(.)\1+', r'\1\1', remove_number)
  text = repeat_text.strip()
  text = ' '.join(text.split())
  text = word_tokenize(text)
  text= [word.lower() for word in text if word.lower() not in stop_words]
  lemmatized_string = ' '.join([lemmatizer.lemmatize(words) for words in text])
  return lemmatized_string 

In [None]:
X_train = X_train.apply(lambda x : clean_text(x))

**Model Implementation**

In [None]:
# Creating Model
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1,3),)
tfidf.fit(list(X_train))
tf_idf_vector_train = tfidf.transform(X_train)
tf_idf_vector_test = tfidf.transform(X_test)


In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_smt, y_train_smt = sm.fit_resample(tf_idf_vector_train, y_train.ravel())
print(X_train_smt.shape,y_train_smt.shape)


(10946, 206558) (10946,)


**Word EMBEDDING**

In [None]:
'''
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X_train)
train_text_vec = tokenizer.texts_to_sequences(X_train)
tokenizer.fit_on_texts(X_test)
test_text_vec = tokenizer.texts_to_sequences(X_test)
test_text_vec
'''

In [None]:
rf = RandomForestClassifier(max_depth=60, min_samples_leaf=5,
                            min_samples_split=6, n_estimators=60, oob_score=True,
                            max_leaf_nodes=55, random_state=13)
rf.fit(X_train_smt, y_train_smt)
predictions_rf_test = pd.Series(rf.predict(tf_idf_vector_test))

train_scoreRF = rf.score(X_train_smt, y_train_smt)
test_scoreRF = rf.score(tf_idf_vector_test, y_test)

# Train and Test R2 Scores
print(f"RF Train R2 Score: {round(train_scoreRF, 3)}")
print(f"RF Test R2 Score: {round(test_scoreRF, 3)}")

RF Train R2 Score: 0.808
RF Test R2 Score: 0.725


In [None]:
lr = LogisticRegression(random_state=13, max_iter=2000).fit(X_train_smt, y_train_smt)

predictions_LR_test = pd.Series(lr.predict(tf_idf_vector_test))

train_scoreLR = lr.score(X_train_smt, y_train_smt)
test_scoreLR = lr.score(tf_idf_vector_test, y_test)

# Train and Test R2 Scores
print(f"LR Train R2 Score: {round(train_scoreLR, 3)}")
print(f"LR Test R2 Score: {round(test_scoreLR, 3)}")

LR Train R2 Score: 0.93
LR Test R2 Score: 0.812


In [None]:
import lightgbm as lgb 
lgb = lgb.LGBMClassifier().fit(tf_idf_vector_train, y_train)

predictions_LGB_test = pd.Series(lgb.predict(tf_idf_vector_test))

train_scoreLGB = lgb.score(tf_idf_vector_train, y_train)
test_scoreLGB = lgb.score(tf_idf_vector_test, y_test)

# Train and Test R2 Scores
print(f"LGB Train R2 Score: {round(train_scoreLGB, 3)}")
print(f"LGB Test R2 Score: {round(test_scoreLGB, 3)}")

LGB Train R2 Score: 0.889
LGB Test R2 Score: 0.783


In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(tf_idf_vector_train, y_train)


train_scoreCNB = CNB.score(tf_idf_vector_train, y_train)
test_scoreCNB = CNB.score(tf_idf_vector_test, y_test)

predicted = CNB.predict(tf_idf_vector_test)

# Train and Test R2 Scores
print(f"LGB Train R2 Score: {round(train_scoreCNB, 3)}")
print(f"LGB Test R2 Score: {round(test_scoreCNB, 3)}")


LGB Train R2 Score: 0.959
LGB Test R2 Score: 0.814


In [None]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(tf_idf_vector_train, y_train)
predictions_MNB_test = pd.Series(MNB.predict(tf_idf_vector_test))

train_scoreMNB = MNB.score(tf_idf_vector_train, y_train)
test_scoreMNB = MNB.score(tf_idf_vector_test, y_test)

# Train and Test R2 Scores
print(f"LGB Train R2 Score: {round(train_scoreMNB, 3)}")
print(f"LGB Test R2 Score: {round(test_scoreMNB, 3)}")

LGB Train R2 Score: 0.948
LGB Test R2 Score: 0.792


In [None]:
test_data= "nothing missing"
clean_data = clean_text(test_data)
print(clean_data)
tf_idf_vector_test = tfidf.transform([clean_data])
res=lr.predict(tf_idf_vector_test)
res = np.array(res[0])
if res == 1:
  print('Sentiment type:Positive',res)
elif res == 0:
  print('Sentiment type:Negative',res)
  print('==================================================\n')

nothing missing
Sentiment type:Negative 0



In [None]:
from textblob import TextBlob
 
 

In [None]:
def trans(new):

  gfg = TextBlob(new).correct
 
  # Reviews = ''.join([contractions.fix(new)])
  # new_reviews = translator.translate(Reviews,dest='en', src='auto').text
  # #src.append(translator.translate(Reviews).src)
  
  return gfg

In [None]:
testing = "y cudn't enuf alone "

In [None]:
clean_new_data_file = trans(testing)
clean_new_data_file

<bound method BaseBlob.correct of TextBlob("y cudn't enuf alone ")>