In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
#for request website 
import requests
import urllib.parse

#for Nlp
import re, nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk import ngrams


# NLTK resource downloads, left disabled. Presumably each one has to be run once
# on an environment where the corresponding corpus/model is not installed yet.
# nltk.download('stopwords')
# nltk.download('vader_lexicon')
# nltk.download('averaged_perceptron_tagger')
# English stop words, de-duplicated through a set and stored as a list
# (the element order of a list built from a set is not guaranteed).
stop_words = list(set(stopwords.words('english')))
# Shared lemmatizer instance; clean_text() calls its lemmatize() method.
wordnet_lemmatizer = WordNetLemmatizer()
# nltk.download('wordnet')
%matplotlib inline


from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Load the pre-cleaned hotel review table produced by the preprocessing/EDA notebook.
reviews_df = pd.read_csv(
    filepath_or_buffer='../input/data-preprocessing-eda-hotel-reviews/data_hotel_reviews_clean.csv'
)

In [3]:
# Merge both free-text fields into one document per review.
# - Missing values become empty strings first: str + NaN yields NaN, and the
#   string-based cleaning applied to this column afterwards would fail on it.
# - A space separator keeps the last word of the negative part from fusing
#   with the first word of the positive part into a single token.
reviews_df["review"] = (
    reviews_df["Negative_Review"].fillna("")
    + " "
    + reviews_df["Positive_Review"].fillna("")
)


In [4]:
# Binary target: 1 when the reviewer score is strictly above 7.5, otherwise 0.
score_above_threshold = reviews_df["Reviewer_Score"] > 7.5
reviews_df["satisfied"] = score_above_threshold.astype("int64")
# Relative frequency of each class (shown as the cell output).
reviews_df["satisfied"].value_counts(normalize=True)


In [5]:
# Keep only the merged review text and the binary target.
reviews_df = reviews_df.loc[:, ["review", "satisfied"]]

In [6]:
# Work on a reproducible 10% random subset, drawn without replacement.
reviews_df = reviews_df.sample(
    frac=0.1,
    replace=False,
    random_state=42,
)

In [7]:
# Class balance of the sampled reviews (0 = not satisfied, 1 = satisfied).
# plt and sns come from the notebook's first cell, so they are not re-imported
# here; the unused "bright" palette assignment was dropped because the plot
# uses the "Set2" palette.
sns.countplot(x='satisfied', data=reviews_df, palette="Set2")
plt.title("ratio of satisfied");

In [8]:
# Blank out the placeholder phrases that stand in for an empty negative or
# positive part, one phrase at a time.
review_text = reviews_df["review"]
for placeholder in ("No Negative", "No Positive"):
    review_text = review_text.apply(lambda raw, marker=placeholder: raw.replace(marker, ""))
reviews_df["review"] = review_text

In [9]:
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Translate a part-of-speech tag string into a WordNet POS constant.

    The tag is matched on its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    (Presumably the tags are the ones produced by nltk.pos_tag -- confirm
    against the caller.)
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

# Stop-word lookup built once at definition time. Previously the list was
# rebuilt inside every clean_text() call and searched linearly per token.
_CLEAN_TEXT_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Processing order:
      1. lower-case and split on single spaces;
      2. strip leading/trailing punctuation from every token;
      3. drop tokens that contain a digit;
      4. drop English stop words and empty tokens;
      5. lemmatize each token with the shared ``wordnet_lemmatizer``;
      6. drop tokens of one character or less and join with spaces.

    Parameters
    ----------
    text : str
        Raw review text. A non-string value (e.g. a missing value read from
        the CSV) is treated as an empty review.

    Returns
    -------
    str
        The cleaned review; may be an empty string.
    """
    # Missing reviews arrive as non-string values; treat them as empty text
    # instead of failing on .lower().
    if not isinstance(text, str):
        return ""

    # Tokenize on single spaces and strip surrounding punctuation.
    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    # Remove tokens that contain any digit.
    tokens = [word for word in tokens if not any(ch.isdigit() for ch in word)]
    # Remove stop words and the empty tokens left over from stripping.
    tokens = [word for word in tokens if word and word not in _CLEAN_TEXT_STOP_WORDS]
    # NOTE: no POS tag is passed, so lemmatize() falls back to its default part
    # of speech; get_wordnet_pos() and nltk.pos_tag are not used here.
    lemmas = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    # Keep only tokens longer than one character.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

# Store a normalized copy of every review next to the raw text.
reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)


In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
# Score the raw (uncleaned) review text. polarity_scores() yields a mapping of
# scores per review; all of them are collected into one frame in a single step
# (one column per key), which avoids the per-row Series construction of
# `.apply(pd.Series)`.
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(review) for review in reviews_df["review"]],
    index=reviews_df.index,
)
# Append the score columns; the index is shared, so rows stay aligned.
reviews_df = pd.concat([reviews_df, sentiment_scores], axis=1)


In [11]:
# Length of the raw review in characters.
reviews_df["nb_chars"] = reviews_df["review"].map(len)

# Number of pieces obtained by splitting the raw review on single spaces.
reviews_df["nb_words"] = reviews_df["review"].str.split(" ").map(len)

In [12]:
# add tf-idfs columns
from sklearn.feature_extraction.text import TfidfVectorizer
# Ignore terms that occur in fewer than 10 reviews.
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews_df["review_clean"]).toarray()
# get_feature_names() was removed from scikit-learn in 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()
# One "word_<term>" column per vocabulary term, aligned on the review index so
# the concat below matches rows by label rather than by position.
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(term) for term in tfidf_terms],
    index=reviews_df.index,
)
reviews_df = pd.concat([reviews_df, tfidf_df], axis=1)

In [None]:
# Persist the feature table (text, label and engineered columns) without the index.
reviews_df.to_csv(path_or_buf="data_for_train_model.csv", index=False)

In [None]:
# Read the exported feature table back from disk.
data_out = pd.read_csv(filepath_or_buffer='data_for_train_model.csv')

In [None]:
# Display the column labels of the frame read back from data_for_train_model.csv.
data_out.columns

In [13]:
# Target column name.
label = "satisfied"
# Columns that must not be fed to the model: the target and both text columns.
ignore_cols = [label, "review", "review_clean"]
# Every remaining column, in its original order, is a model input.
features = list(filter(lambda column: column not in ignore_cols, reviews_df.columns))

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation, stratified on the target column.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df[features],
    reviews_df["satisfied"],
    test_size=0.25,
    random_state=42,
    stratify=reviews_df["satisfied"],
)

In [15]:
from imblearn.over_sampling import SMOTE
# SMOTE configured to resample the minority class; change sampling_strategy
# if a different balancing scheme is required.
sm = SMOTE(
    sampling_strategy='minority',
    random_state=42,
)
# Only the training split is resampled; X_test / y_test are not passed in.
oversampled_X, oversampled_Y = sm.fit_resample(X_train, y_train)


In [18]:
# Put the resampled labels next to the resampled features. The label column is
# named explicitly so the plot below finds a 'satisfied' column even when the
# resampler hands the labels back without a name.
oversampled = pd.concat(
    [pd.Series(oversampled_Y, name="satisfied"), pd.DataFrame(oversampled_X)],
    axis=1,
)
print(oversampled.columns)
# Class balance after oversampling.
sns.countplot(x='satisfied', data=oversampled, palette="Set2")
plt.title("ratio of satisfied");

In [19]:
# max_features="sqrt" is what "auto" meant for classifiers; the "auto" alias
# was deprecated and later removed from scikit-learn, so the explicit value
# keeps the same model on old versions and keeps the cell working on new ones.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    max_features="sqrt",
    random_state=42,
)
# Train on the oversampled training split.
rf.fit(oversampled_X, oversampled_Y)

In [20]:
# Predict labels for the untouched (not oversampled) test split.
y_pred_ = rf.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix , classification_report
from mlxtend.plotting import plot_confusion_matrix
# Confusion matrix of test labels against predictions, drawn with mlxtend.
conf = confusion_matrix(y_true=y_test, y_pred=y_pred_)
plot_confusion_matrix(conf)
# Per-class text report for the same predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_))

Based on the classification report, the results are not satisfactory because of the class-imbalance problem. Although we addressed it with SMOTE, there is still a problem with accuracy, so we decided to try another approach:
1. Select random samples from the majority class (here satisfied = 1), equal in number to the samples in the other class (not satisfied = 0).
2. Train classifiers on positive reviews labeled positive (= 1) and negative reviews labeled negative (= 0).

Link to the new notebook: [Sentiment Analysis](https://www.kaggle.com/nourchocharah/modeling-with-other-approach/edit)