In [1]:
import pandas as pd
df = pd.read_csv("/media/vedic/FEBE11DFBE11916F/STUDY/Summer Internship 19/Hotel_Reviews.csv")

In [2]:
df["review"] = df["Negative_Review"] + df["Positive_Review"]

In [3]:
df = df[["review", "Reviewer_Score"]]

In [4]:
df = df.sample(frac = 0.1, replace = False, random_state = 42)

In [5]:
df["review"] = df["review"].apply(lambda x: x.replace("No Negative", " ").replace("No Positive", " "))

In [6]:
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

In [7]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [9]:
df["review_clean"] = df["review"].apply(lambda x: clean_text(x))

In [10]:
df["nb_chars"] = df["review"].apply(lambda x: len(x))
df["nb_words"] = df["review"].apply(lambda x: len(x.split(" ")))

In [11]:
del string, wordnet, pos_tag, stopwords, WhitespaceTokenizer, WordNetLemmatizer

In [12]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [13]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df["review_clean"].apply(lambda x: x.split(" ")))]

In [14]:
model = Doc2Vec(documents, vector_size = 8, window = 2, min_count = 1, workers = 4)
del documents

In [15]:
doc2vec_df = df["review_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
del model

In [16]:
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]

In [17]:
df = pd.concat([df, doc2vec_df], axis = 1)
del doc2vec_df

In [18]:
df

Unnamed: 0,review,Reviewer_Score,review_clean,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,doc2vec_vector_5,doc2vec_vector_6,doc2vec_vector_7
488440,Would have appreciated a shop in the hotel th...,9.6,would appreciate shop hotel sell drinking wate...,599,113,0.176638,0.173414,0.081300,-0.191280,-0.100201,0.140569,0.090222,-0.191968
274649,No tissue paper box was present at the room,8.8,tissue paper box present room,45,11,0.009119,0.045774,-0.002120,-0.013836,-0.064593,-0.064496,0.017815,0.046757
374688,Pillows Nice welcoming and service,7.9,pillow nice welcome service,36,7,-0.000269,0.019718,0.019541,0.024709,0.031438,-0.085529,0.011390,-0.043415
404352,Everything including the nice upgrade The Ho...,10.0,everything include nice upgrade hotel revamp s...,156,28,0.029042,-0.075240,0.102950,0.033232,0.021515,0.027849,0.098392,-0.030668
451596,Lovely hotel v welcoming staff,9.6,lovely hotel welcome staff,33,8,-0.072975,-0.006961,0.072773,0.064139,-0.067125,-0.061341,-0.081730,0.041908
302161,They don t have free wifi The location is per...,8.3,free wifi location perfect lot time want look ...,130,32,-0.091530,0.091847,-0.012669,-0.040641,-0.028602,-0.036500,0.052346,-0.064717
317079,Room generally a bit shabby with some lack of...,7.5,room generally bit shabby lack maintenance cru...,318,57,-0.095494,-0.063278,-0.193582,-0.192262,-0.110890,-0.054439,-0.084652,-0.149004
13963,Executive rooms 9th Floor don t have a bath T...,7.9,executive room floor bath website make look li...,483,93,0.008801,0.083625,-0.000762,-0.067136,-0.002113,-0.024241,-0.109468,0.015955
159785,Pity about the two days of rain Its centralit...,9.2,pity two day rain centrality proximity destina...,76,14,-0.118347,0.108801,-0.037990,-0.066863,-0.209010,-0.058322,-0.048550,0.043798
195089,Didn t like it at all construction was in pro...,2.5,like construction progress stuff lie vacancy l...,186,42,-0.081757,-0.049232,-0.131912,0.057514,-0.192138,0.086055,-0.019454,0.015116


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tfidf = TfidfVectorizer(min_df = 10)

In [21]:
tfidf_result = tfidf.fit_transform(df["review_clean"]).toarray()

In [22]:
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
del tfidf_result
del tfidf

In [23]:
tfidf_df.index = df.index

In [24]:
df = pd.concat([df , tfidf_df], axis = 1)
del tfidf_df

In [25]:
df

Unnamed: 0,review,Reviewer_Score,review_clean,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,...,word_yet,word_yoghurt,word_yogurt,word_young,word_yr,word_yummy,word_zero,word_ziggo,word_zone,word_zuid
488440,Would have appreciated a shop in the hotel th...,9.6,would appreciate shop hotel sell drinking wate...,599,113,0.176638,0.173414,0.081300,-0.191280,-0.100201,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
274649,No tissue paper box was present at the room,8.8,tissue paper box present room,45,11,0.009119,0.045774,-0.002120,-0.013836,-0.064593,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
374688,Pillows Nice welcoming and service,7.9,pillow nice welcome service,36,7,-0.000269,0.019718,0.019541,0.024709,0.031438,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
404352,Everything including the nice upgrade The Ho...,10.0,everything include nice upgrade hotel revamp s...,156,28,0.029042,-0.075240,0.102950,0.033232,0.021515,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
451596,Lovely hotel v welcoming staff,9.6,lovely hotel welcome staff,33,8,-0.072975,-0.006961,0.072773,0.064139,-0.067125,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
302161,They don t have free wifi The location is per...,8.3,free wifi location perfect lot time want look ...,130,32,-0.091530,0.091847,-0.012669,-0.040641,-0.028602,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
317079,Room generally a bit shabby with some lack of...,7.5,room generally bit shabby lack maintenance cru...,318,57,-0.095494,-0.063278,-0.193582,-0.192262,-0.110890,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
13963,Executive rooms 9th Floor don t have a bath T...,7.9,executive room floor bath website make look li...,483,93,0.008801,0.083625,-0.000762,-0.067136,-0.002113,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
159785,Pity about the two days of rain Its centralit...,9.2,pity two day rain centrality proximity destina...,76,14,-0.118347,0.108801,-0.037990,-0.066863,-0.209010,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
195089,Didn t like it at all construction was in pro...,2.5,like construction progress stuff lie vacancy l...,186,42,-0.081757,-0.049232,-0.131912,0.057514,-0.192138,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
label = "Reviewer_Score"
ignore_cols = ["Reviewer_Score", "review", "review_clean"]

In [30]:
features = [c for c in df.columns if c not in ignore_cols]

In [31]:
from sklearn.linear_model import Lasso

In [32]:
#X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size = 0.2, random_state = 42)
X = df[features]
y = df[label]

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
parameters = {'alpha' : [0.001, 0.0001, 0.00001]}

lasso = Lasso()
lasso_regressor = GridSearchCV(lasso, parameters, cv=5)
lasso_regressor.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
print(lasso_regressor.best_params_)

{'alpha': 0.0001}


In [36]:
print(lasso_regressor.best_score_)

0.4472362109073895


In [38]:
import pickle
pickle.dump(lasso_regressor, open("Rating Generation Model.sav", "wb"))

In [40]:
model = pickle.load(open("Rating Generation Model.sav", "rb"))

In [58]:
lasso_regressor.predict(X)

array([9.1424648 , 7.49765332, 9.15060496, ..., 8.38282224, 7.50513355,
       7.29572051])