<a href="https://colab.research.google.com/github/OmniaOsama03/Artificial_Intelligence_Programming_Labs/blob/main/Lab%207.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name: Omnia Osama Ahmed




In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import re

# Fetch the NLTK resources this notebook relies on (tokenizer models, stop-word
# list, WordNet data). 'punkt_tab' is requested alongside 'punkt' because recent
# NLTK releases load the tokenizer models used by nltk.word_tokenize from it;
# downloading both keeps the notebook runnable on old and new NLTK versions.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Read the hotel reviews CSV (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="Hotel_review.csv")

# Show the final 8 rows as the cell's output
data.tail(n=8)

Unnamed: 0,Review,Rating
141,"terrible experience awful night stay, planned ...",1
142,good location inside room moore old place good...,3
143,"loved moore hotel, just got night stay moore h...",4
144,great location great price stayed moore nights...,3
145,"best budget hotel seattle, reviews right best ...",4
146,"surprising enjoyable stay, 3 hotels stayed tri...",4
147,moore hotel greatest place charming quaint fri...,5
148,"stay, stayed moore 3 nights- n't better time, ...",5


In [None]:
# Preprocessing functions
# Shared NLTK resources, filled on first use so they are built once instead of
# once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached stop-word set, stemmer and lemmatizer, creating them on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalize one review into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, drop English stop words, delete punctuation,
    drop tokens containing digits, drop 1-2 character tokens, collapse
    whitespace, Porter-stem each token, then WordNet-lemmatize each token.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example a NaN from an empty
        CSV cell) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing remains.
    """
    # Empty cells arrive as float NaN, which has no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']
    lemmatizer = resources['lemmatizer']

    text = text.lower()

    # Remove stopwords. Splitting on whitespace leaves punctuation attached
    # ("not," / "the."), so each token is checked both as written and with its
    # surrounding punctuation stripped; otherwise such stop words slip through.
    text = ' '.join(
        word for word in text.split()
        if word not in stop_words
        and word.strip(string.punctuation) not in stop_words
    )

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Regex processing
    text = re.sub(r'\b\w*\d+\w*\b', '', text)   # tokens that contain a digit
    text = re.sub(r'[^\w\s]', '', text)         # symbols outside string.punctuation
    text = re.sub(r'\b\w{1,2}\b', '', text)     # one- and two-character tokens
    text = re.sub(r'\s+', ' ', text).strip()    # collapse the gaps left behind

    # Stemming
    text = ' '.join(stemmer.stem(word) for word in text.split())

    # Lemmatization
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    return text


# Clean every review with preprocess_text and store the result in a new column
processed_reviews = data['Review'].apply(preprocess_text)
data['Processed_Review'] = processed_reviews

# Print the raw and the cleaned reviews, with a rule between the two listings
review_listings = (
    ('Before preprocessing\n', 'Review'),
    ('After preprocessing\n', 'Processed_Review'),
)
for position, (heading, column) in enumerate(review_listings):
    if position:
        print('----------------------------------------------')
    print(heading, data[column])

Before preprocessing
 0      nice hotel expensive parking got good deal sta...
1      ok nothing special charge diamond member hilto...
2      nice rooms not 4* experience hotel monaco seat...
3      unique, great stay, wonderful time hotel monac...
4      great stay great stay, went seahawk game aweso...
                             ...                        
144    great location great price stayed moore nights...
145    best budget hotel seattle, reviews right best ...
146    surprising enjoyable stay, 3 hotels stayed tri...
147    moore hotel greatest place charming quaint fri...
148    stay, stayed moore 3 nights- n't better time, ...
Name: Review, Length: 149, dtype: object
----------------------------------------------
After preprocessing
 0      nice hotel expens park got good deal stay hote...
1      noth special charg diamond member hilton decid...
2      nice room experi hotel monaco seattl good hote...
3      uniqu great stay wonder time hotel monaco loca...
4      great s

In [None]:
# Bag of Words: term-count matrix built from the processed reviews
vectorizer = CountVectorizer(binary=False, min_df=0.01, max_df=0.70)
data_vectorized = vectorizer.fit_transform(data['Processed_Review'])

# Vocabulary terms in column order, reused for the DataFrame header and the clustering
word_features = vectorizer.get_feature_names_out()

# Dense DataFrame view of the counts
bow_df = pd.DataFrame(data_vectorized.toarray(), columns=word_features)

# Visualizing
print('Bag of Words Dataframe: \n')
print(bow_df)

# K-means clustering on the transposed count matrix, so each term is one sample
word_vectors = data_vectorized.T.tocsr()

num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=20, max_iter=400).fit(word_vectors)
labels = kmeans.labels_

# Term -> cluster label lookup
word_cluster = dict(zip(word_features, labels))

# Group the terms by cluster label and print the clusters in ascending label order
clustered_words = {}
for term, cluster_id in word_cluster.items():
    clustered_words.setdefault(cluster_id, []).append(term)

for cluster_id in sorted(clustered_words):
    print(f"Cluster {cluster_id}: {clustered_words[cluster_id]}")


# Total number of occurrences of a single term across all reviews
word = 'good'
word_index = vectorizer.vocabulary_.get(word)

if word_index is None:
    print(f"'{word}' not found in the dataset.")
else:
    word_frequency = data_vectorized[:, word_index].sum()
    print(f"Frequency of '{word}':", word_frequency)


Bag of Words Dataframe: 

     aaa  abl  absolut  accept  access  accommod  accomod  ace  actual  ad  \
0      0    0        0       0       0         0        0    0       0   0   
1      0    0        0       0       0         0        0    0       0   0   
2      0    0        0       0       0         0        0    0       0   0   
3      0    0        0       0       0         0        0    0       0   0   
4      0    0        0       1       0         0        0    0       0   0   
..   ...  ...      ...     ...     ...       ...      ...  ...     ...  ..   
144    0    0        0       0       0         0        0    0       0   0   
145    0    0        0       0       0         0        0    0       0   0   
146    0    0        0       0       0         0        0    0       0   0   
147    0    0        0       0       0         0        0    0       0   0   
148    0    0        0       0       0         0        0    0       0   0   

     ...  worth  write  written  wron

In [None]:

# Tokenizing the data: one list of tokens per processed review
tokenized_data = [nltk.word_tokenize(text) for text in data['Processed_Review']]

# Displaying the tokens (first few reviews only; printing the whole corpus
# floods the cell output)
print('Tokens:')
print(tokenized_data[:3])
print('-------------------------------------------------------------------------------- \n')

# Creating the Word2Vec model. No corpus is passed to the constructor: passing
# `sentences=` there already trains the model, so the explicit train() call
# below would have trained it a second time.
# seed + a single worker thread make repeated runs as reproducible as gensim
# allows (NOTE(review): gensim documents that PYTHONHASHSEED may also matter).
word2vec_model = Word2Vec(vector_size=500, window=6, min_count=2, workers=1, seed=0)

# Building the vocabulary from the tokenized reviews (required before train())
word2vec_model.build_vocab(tokenized_data)

# Training the Word2Vec model
word2vec_model.train(tokenized_data, total_examples=word2vec_model.corpus_count, epochs=10)

# Prepare word vectors for clustering; row i of the matrix belongs to words[i]
word_vectors = word2vec_model.wv.vectors

words = list(word2vec_model.wv.index_to_key)

# K-means clustering. n_init is set explicitly because its default value
# differs between scikit-learn versions, which would change the clusters.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
kmeans.fit(word_vectors)
labels = kmeans.labels_

# Mapping words to their cluster labels
word_cluster = dict(zip(words, labels))

# Sorting and displaying words grouped by their clusters
clustered_words = {}
for clustered_word, cluster in word_cluster.items():
    clustered_words.setdefault(cluster, []).append(clustered_word)

for cluster in sorted(clustered_words):
    print(f"Cluster {cluster}: {clustered_words[cluster]}")

# Exploring the model
word = 'trip'

# most_similar() raises KeyError for a word that did not make it into the
# vocabulary (min_count filtering), so check membership first.
if word in word2vec_model.wv.key_to_index:
    similar_words = word2vec_model.wv.most_similar(word)

    # Printing each similar word on a new line
    print(f"Words most similar to '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")
else:
    print(f"'{word}' not found in the dataset.")


Tokens:
[['nice', 'hotel', 'expens', 'park', 'got', 'good', 'deal', 'stay', 'hotel', 'anniversari', 'arriv', 'late', 'even', 'took', 'advic', 'previou', 'review', 'valet', 'park', 'check', 'quick', 'easi', 'littl', 'disappoint', 'nonexist', 'view', 'room', 'room', 'clean', 'nice', 'size', 'bed', 'comfort', 'woke', 'stiff', 'neck', 'high', 'pillow', 'soundproof', 'like', 'heard', 'music', 'room', 'night', 'morn', 'loud', 'bang', 'door', 'open', 'close', 'hear', 'peopl', 'talk', 'hallway', 'mayb', 'noisi', 'neighbor', 'aveda', 'bath', 'product', 'nice', 'goldfish', 'stay', 'nice', 'touch', 'taken', 'advantag', 'stay', 'longer', 'locat', 'great', 'walk', 'distanc', 'shop', 'overal', 'nice', 'experi', 'pay', 'park', 'night'], ['noth', 'special', 'charg', 'diamond', 'member', 'hilton', 'decid', 'chain', 'shot', 'anniversari', 'seattl', 'start', 'book', 'suit', 'paid', 'extra', 'websit', 'descript', 'not', 'suit', 'bedroom', 'bathroom', 'standard', 'hotel', 'room', 'took', 'print', 'reserv',



Cluster 0: ['quiet', 'choic', 'mayflow', 'housekeep', 'near', 'key', 'conveni', 'bu', 'site', 'late', 'spent', 'coupl', 'reason', 'checkin', 'budget', 'waterfront', 'neighborhood', 'quick', 'dinner', 'money', 'fit', 'inform', 'rude', 'fabul', 'extra', 'doubl', 'eat', 'amen', 'total', 'super', 'read', 'dine', 'comfi', 'care', 'convent', 'option', 'airport', 'beat', 'nonsmok', 'complaint', 'let', 'hope', 'base', 'met', 'disappoint', 'basic', 'music', 'bottl', 'furnish', 'condit', 'chain', 'kitchenett', 'live', 'sorri', 'champagn', 'advanc', 'cancel', 'wireless', 'not', 'hall', 'paid', 'prefer', 'pictur', 'avoid', 'pricey', 'posit', 'outsid', 'tast', 'son', 'squar', 'fresh', 'separ', 'take', 'famili', 'receiv', 'suggest', 'knew', 'amaz', 'apart', 'awesom', 'public', 'mention', 'goldfish', 'run', 'arena', 'luxuri', 'funki', 'smell', 'given', 'hot', 'mayb', 'walkin', 'hous', 'accommod', 'complimentari', 'show', 'state', 'fridg', 'warm', 'spoke', 'issu', 'iron', 'twice', 'concern', 'dingi', 