In [382]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [383]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ganes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ganes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ganes\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [384]:
docs = [
    "The quick brown fox jumps over the lazy dog",
    "I loved the movie, it was Amazing. 10/10 :).",
    "The Movie was terrible. waste of time!!",
    "A masterpeiece of storytelling. Would watch again."
]

In [385]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocess a given text by converting it to lowercase, removing punctuation and digits,
    tokenizing it, removing stopwords, and lemmatizing the tokens.
    Args:
        text (str): The text to preprocess.
    Returns:
        str: The preprocessed text.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(rf"[{string.punctuation}]", " ", text)  # Remove punctuation
    text = re.sub(r'\d+', " ", text)  # Remove digits
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize the tokens
    return " ".join(tokens)  # Join the tokens back into a string

processed_docs = [preprocess_text(text) for text in docs]

for i, doc in enumerate(processed_docs):
    print(f"Original: {docs[i]}")
    print(f"Processed: {doc}")
    print("." * 50)

Original: The quick brown fox jumps over the lazy dog
Processed: quick brown fox jump lazy dog
..................................................
Original: I loved the movie, it was Amazing. 10/10 :).
Processed: loved movie amazing
..................................................
Original: The Movie was terrible. waste of time!!
Processed: movie terrible waste time
..................................................
Original: A masterpeiece of storytelling. Would watch again.
Processed: masterpeiece storytelling would watch
..................................................


In [386]:
from sklearn.feature_extraction.text import CountVectorizer  # type: ignore



In [387]:
docs = ["I love programming.", "programming is fun", "I love fun activities","Programming is just Amazing"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Encoded Matrix:\n", X.toarray())


Vocabulary: ['activities' 'amazing' 'fun' 'is' 'just' 'love' 'programming']
Encoded Matrix:
 [[0 0 0 0 0 1 1]
 [0 0 1 1 0 0 1]
 [1 0 1 0 0 1 0]
 [0 1 0 1 1 0 1]]


In [388]:
import pandas as pd
from nltk.stem.porter import PorterStemmer

In [389]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ganes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [390]:
df=pd.read_csv('Restaurant_Reviews .tsv',delimiter='\t',quoting=3)

In [391]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [392]:
len(df)

1000

In [393]:
df.shape

(1000, 2)

In [394]:
df["Review"][0]

'Wow... Loved this place.'

In [395]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [396]:
df.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [397]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [398]:
import re

In [399]:
import string
import re
corpus=[]

for i in range(1,1000):
   
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=df['Review'][i])
    review = review.lower()
    review_words = review.split()
    review_words = [word for word in review_words if word not in set(stopwords.words('english'))]
    ps=PorterStemmer()
    review1 = ' '.join([ps.stem(word) for word in review_words])  # Join the words back into a string
    corpus.append(review1)  # Append as a string

In [400]:
print(corpus)

['crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could happier', 'seem like good quick place gra

In [401]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [402]:
df.shape

(1000, 2)

In [403]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=10000)

In [404]:
df["Review"]

0                               Wow... Loved this place.
1                                     Crust is not good.
2              Not tasty and the texture was just nasty.
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
                             ...                        
995    I think food should have flavor and texture an...
996                             Appetite instantly gone.
997    Overall I was not impressed and would not go b...
998    The whole experience was underwhelming, and I ...
999    Then, as if I hadn't wasted enough of my life ...
Name: Review, Length: 1000, dtype: object

In [405]:
X=cv.fit_transform(corpus).toarray()

In [406]:
X.shape

(999, 1565)

In [407]:
X[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(1565,))

In [408]:
X[0].max()

np.int64(1)

In [409]:
y=df.iloc[:,-1].values

In [410]:
y.shape

(1000,)

In [411]:
print("Shape of X:", X.shape)
print("Length of y:", len(y))


Shape of X: (999, 1565)
Length of y: 1000


In [412]:
# Check shapes
print(X.shape)  # Should print (999, n_features)
print(y.shape)  # Should print (1000,)

# Fix the mismatch by removing the extra sample from y
y = y[:999]

# Verify shapes again
print(X.shape)  # Should print (999, n_features)
print(y.shape)  # Should print (999,)

# Now split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9, stratify=y, shuffle=True)

(999, 1565)
(1000,)
(999, 1565)
(999,)


In [413]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB # type: ignore

In [414]:
from sklearn.ensemble import RandomForestClassifier

clf3 = RandomForestClassifier()


In [415]:
clf3 = RandomForestClassifier()  # Corrected to define clf3 properly
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [416]:
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [417]:
y_predG=clf1.predict(X_test)
y_predM=clf2.predict(X_test)
y_predB=clf3.predict(X_test)



In [418]:
from sklearn.metrics import accuracy_score # type: ignore

In [419]:
accuracy_score(y_test,y_predG)

0.49

In [420]:
accuracy_score(y_test,y_predM)

0.505

In [421]:
accuracy_score(y_test,y_predB)

0.505

In [422]:
print("Gaussian Accuracy Score is:",accuracy_score(y_test,y_predG))
print("Multinomail Accuracy Score is:",accuracy_score(y_test,y_predM))
print("Bernoulli Accuracy Score is:",accuracy_score(y_test,y_predB))


Gaussian Accuracy Score is: 0.49
Multinomail Accuracy Score is: 0.505
Bernoulli Accuracy Score is: 0.505


In [423]:
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.metrics import accuracy_score # type: ignore
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.54

In [424]:
from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
y_pred1=xgb.predict(X_test)
accuracy_score=(y_test,y_pred1)

In [425]:
from sklearn.metrics import confusion_matrix # type: ignore

In [426]:
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Random Forest Confusion Matrix:
[[55 45]
 [47 53]]


In [427]:
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred1))

XGBoost Confusion Matrix:
[[51 49]
 [49 51]]


In [428]:
from sklearn.metrics import accuracy_score

# Assuming y_test and y_pred are defined and valid
accuracy_percentage = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy of Random Forest : {accuracy_percentage:.2f}%")




Accuracy of Random Forest : 54.00%


In [429]:
accuracy_percentage = accuracy_score(y_test, y_pred1) * 100
print(f"Accuracy of XGBoost: {accuracy_percentage:.2f}%")


Accuracy of XGBoost: 51.00%
