In [1]:
import pandas as pd
import spacy

In [2]:
# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the CSV file into a pandas DataFrame
data = pd.read_csv("/content/Restaurant_Reviews.csv")
data.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Preprocess the text column
def preprocess_text(Review):
    # Process the text
    doc = nlp(Review)

    # Tokenization
    tokens = [token.text for token in doc]

    # Lemmatization
    lemmas = [token.lemma_ for token in doc]

    # Join the lemmas back into a preprocessed text
    preprocessed_text = " ".join(lemmas)

    return preprocessed_text

# Apply the preprocessing function to the text column
data["Review"] = data["Review"].apply(preprocess_text)

# Save the preprocessed data to a new CSV file
data.to_csv("preprocessed_data.csv", index=False)


In [9]:
data1 = pd.read_csv("preprocessed_data.csv")
data1.head()

Unnamed: 0,Review,Liked
0,wow ... love this place .,1
1,crust be not good .,0
2,not tasty and the texture be just nasty .,0
3,stop by during the late May bank holiday off R...,1
4,the selection on the menu be great and so be t...,1


In [19]:
pos = pd.DataFrame()

In [23]:
# Preprocess the text column
def preprocess_text(Review):
    # Process the text
    doc = nlp(Review)

    # Perform linguistic feature preprocessing (e.g., POS tagging, dependency parsing)
    pos = [(token.text, token.pos_) for token in doc]
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]

    # Return the preprocessed text and linguistic features
    return pos, dependencies

# Apply the preprocessing function to the text column
data[["pos", "dependencies"]] = data["Review"].apply(preprocess_text).apply(pd.Series)

# Save the preprocessed data to a new CSV file
data.to_csv("ling_pp_data.csv", index=False)


In [26]:
data2 = pd.read_csv("ling_pp_data.csv")
data2.head()

Unnamed: 0,Review,Liked,pos_tags,dependencies,pos
0,wow ... love this place .,1,Empty DataFrame\nColumns: []\nIndex: [],"[('wow', 'ROOT', 'wow'), ('...', 'punct', 'wow...","[('wow', 'INTJ'), ('...', 'PUNCT'), ('love', '..."
1,crust be not good .,0,Empty DataFrame\nColumns: []\nIndex: [],"[('crust', 'nsubj', 'be'), ('be', 'ROOT', 'be'...","[('crust', 'NOUN'), ('be', 'AUX'), ('not', 'PA..."
2,not tasty and the texture be just nasty .,0,Empty DataFrame\nColumns: []\nIndex: [],"[('not', 'neg', 'tasty'), ('tasty', 'ROOT', 't...","[('not', 'PART'), ('tasty', 'ADJ'), ('and', 'C..."
3,stop by during the late May bank holiday off R...,1,Empty DataFrame\nColumns: []\nIndex: [],"[('stop', 'ROOT', 'stop'), ('by', 'prt', 'stop...","[('stop', 'VERB'), ('by', 'ADP'), ('during', '..."
4,the selection on the menu be great and so be t...,1,Empty DataFrame\nColumns: []\nIndex: [],"[('the', 'det', 'selection'), ('selection', 'n...","[('the', 'DET'), ('selection', 'NOUN'), ('on',..."


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [29]:


# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder on the POS tags and transform them into a one-hot encoded representation
pos_encoded = encoder.fit_transform([[pos1] for pos1 in pos])

# Print the one-hot encoded representation
print(pos_encoded)


[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.

In [None]:
# Create a dictionary mapping each POS tag to its integer index
pos_dict = {pos_tag: index for index, pos_tag in enumerate(pos)}

# Perform integer encoding by replacing each POS tag with its integer index
pos_encoded1 = [pos_dict[pos_tag] for pos_tag in pos]

# Print the integer encoded representation
print(pos_encoded1)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 904, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 904, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 904, 220, 22

In [None]:
pos_tags = []
for text in data2['Review']:
    doc = nlp(text)
    tags = [token.pos_ for token in doc]
    pos_tags.append(tags)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [11]:
# One-hot encode POS tags
encoder = OneHotEncoder(sparse_output=False)
pos_encoded = encoder.fit_transform(data1)

In [13]:
print(pos_encoded)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


Model for Basic Text processing features

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(data['Review']).toarray()
#X = pd.DataFrame(X,columns=cv.get_feature_names_out())
y = data1.iloc[:, 1]

In [14]:
pos=data1['pos_tags']

KeyError: ignored

ModelforLinguistic feature

In [31]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(data2['Review'])

In [32]:
from scipy.sparse import hstack
# Concatenate BoW features with POS tag encoding
concatenated_features = hstack((X, pos_encoded))
# Print the concatenated features
print(concatenated_features.toarray())

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 20)

In [None]:
# Fitting any algo to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=2022,n_estimators=25)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76       102
           1       0.74      0.79      0.76        98

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200

0.76


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features = 1500)
X = cv.fit_transform(data1['Review']).toarray()
#X = pd.DataFrame(X,columns=cv.get_feature_names_out())
y = data1.iloc[:, 1]

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 2022,
                                                    stratify=y)

In [None]:
# Fitting any algo to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=2022,n_estimators=25)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

print(accuracy_score(y_test, y_pred))
####################################################

              precision    recall  f1-score   support

           0       0.74      0.84      0.79       100
           1       0.81      0.70      0.75       100

    accuracy                           0.77       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.78      0.77      0.77       200

0.77
