In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,KFold
from sklearn.pipeline import Pipeline,FeatureUnion
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK resources the preprocessing cell relies on:
#   - punkt / punkt_tab : sentence/word tokenizer models used by word_tokenize
#   - stopwords         : English stop-word list
#   - wordnet           : lexical database used by WordNetLemmatizer
# Recent NLTK releases load 'punkt_tab' instead of the older pickled 'punkt'
# models, so both are requested to keep the notebook runnable on either.
# nltk.download() skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chungkaichou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chungkaichou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chungkaichou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [4]:
lemmatizer = WordNetLemmatizer()

# Built once rather than on every call: the function is applied row-by-row to
# the whole DataFrame, and the stop-word list never changes between rows.
_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCT_CHARS = frozenset(string.punctuation)


def text_preprocessor(text):
    """Normalise one raw review into a space-joined string of lemmas.

    Steps, in order:
      1. remove http/https URLs, @mentions, '#' characters and digit runs;
      2. tokenise with NLTK's ``word_tokenize`` and lower-case each token;
      3. drop tokens made up entirely of ASCII punctuation characters;
      4. drop English stop words (NLTK list);
      5. lemmatise the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: Raw review text. Any non-string value (e.g. the float NaN that
            pandas produces for an empty CSV cell) is treated as an empty
            review instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing
        survives the filters or the input is not a string).
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'\d+', '', text)

    tokens = [token.lower() for token in word_tokenize(text)]

    # A token counts as punctuation when *every* character is punctuation.
    # The previous `token not in string.punctuation` was a substring test
    # against one long string, so multi-character tokens such as "..." or the
    # "``" / "''" quote tokens were not recognised as punctuation.
    tokens = [
        token for token in tokens
        if not all(char in _PUNCT_CHARS for char in token)
    ]

    filtered_tokens = [token for token in tokens if token not in _STOP_WORDS]

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return ' '.join(lemmatized_tokens)

In [5]:
# Clean every review, then hold out 20% of the rows for evaluation
# (fixed seed so the split is the same on every run).
df['preprocessed_text'] = df['review'].apply(text_preprocessor)
X, y = df['preprocessed_text'], df['Recommended']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature union that concatenates TF-IDF and raw-count features, both built
# from word 1- to 3-grams that occur in at least 3 documents.
combined_vectorizer = FeatureUnion([
    (
        'tfidf',
        TfidfVectorizer(
            min_df=3,
            max_features=None,
            strip_accents='unicode',
            analyzer='word',
            ngram_range=(1, 3),
            use_idf=True,
            sublinear_tf=True,
            smooth_idf=True,
            stop_words='english',
        ),
    ),
    (
        'CountVectorizer',
        CountVectorizer(
            min_df=3,
            max_features=None,
            strip_accents='unicode',
            analyzer='word',
            ngram_range=(1, 3),
            stop_words='english',
        ),
    ),
])

In [6]:
# Candidate classifiers, keyed by the label shown in the results table.
classifier = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded (same value as the train/test split) so the forest's bootstrap
    # and feature sampling, and therefore its accuracy, are reproducible.
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Options shared by both text vectorizers: word 1- to 3-grams that appear in
# at least 3 documents, accents stripped, sklearn's English stop words removed.
ngram_options = dict(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
)

# Feature extractors to compare, keyed by the label shown in the results table.
vectorizer = {
    'tfidf': TfidfVectorizer(
        use_idf=True, sublinear_tf=True, smooth_idf=True, **ngram_options
    ),
    'CountVectorizer': CountVectorizer(**ngram_options),
    'Combined': combined_vectorizer,
}

# Fit every vectorizer/classifier pair on the training split and record its
# accuracy on the held-out split. Pipeline.fit refits both steps each time,
# so reusing the same estimator objects across iterations is safe.
records = []
for vec_name, vec in vectorizer.items():
    for clf_name, clf in classifier.items():
        pipe = Pipeline([
            (vec_name, vec),
            (clf_name, clf),
        ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        records.append({
            'Classifier': clf_name,
            'Vectorizer': vec_name,
            'Accuracy': accuracy,
        })

# One row per (classifier, vectorizer) combination.
MLresult = pd.DataFrame(records)
MLresult

Unnamed: 0,Classifier,Vectorizer,Accuracy
0,Naive Bayes,tfidf,0.885452
1,Logistic Regression,tfidf,0.921954
2,Random Forest,tfidf,0.893033
3,Naive Bayes,CountVectorizer,0.869911
4,Logistic Regression,CountVectorizer,0.923357
5,Random Forest,CountVectorizer,0.893223
6,Naive Bayes,Combined,0.873323
7,Logistic Regression,Combined,0.923357
8,Random Forest,Combined,0.893336


In [None]:
#advanced NLP
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization