#  Implementation of Machine Learning Models in NLP

# Importing libraries

In [19]:

# 1. Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset (TSV format → tab-separated values)
df = pd.read_csv(r"C:\Ds & AI ( my work)\Artificial_Intelligence _(AI)\Natural_Language_Processing_( NLP )\ML_in_NLP\Restaurant_Reviews.tsv", 
                 delimiter='\t', quoting=3)

# Display first few rows
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Text Cleaning & Preprocessing

- At this stage, we have converted unstructured text into clean, structured tokens (words).
- These will be used to create numerical features for the ML model

In [20]:

# Text Cleaning & Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []

for i in range(0, 1000):
    # Keep only letters
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
    # Lowercase
    review = review.lower()
    # Tokenize
    review = review.split()
    # Stemming + Stopword Removal
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    # Join back into string
    review = ' '.join(review)
    corpus.append(review)

# Show few samples
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

# Feature Extraction (Bag of Words model)

- Each review → converted into a vector of word counts.
- X = independent features (word frequencies).
- y = target labels (positive/negative).

In [21]:
# Feature Extraction (Bag of Words model)
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)   # limit features for efficiency
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

# Train/Test Split

In [22]:
# Train/Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

# Train a Machine Learning Model (Decision Tree)

In [23]:
# Train a Machine Learning Model (Decision Tree)
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predictions

In [24]:
y_pred = classifier.predict(X_test)

# Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy:", ac)

# Bias & Variance (train vs test performance)
bias = classifier.score(X_train, y_train)
variance = classifier.score(X_test, y_test)

print("Bias (Training Score):", bias)
print("Variance (Test Score):", variance)

Confusion Matrix:
 [[71 26]
 [44 59]]
Accuracy: 0.65
Bias (Training Score): 0.99625
Variance (Test Score): 0.65


# Improving Model Accuracy
Our Decision Tree model achieved only **65.5% accuracy**, which is relatively low.

To improve performance, we will:

1. Apply multiple classification models.
2. Use the same train/test split for fair comparison.
3. Compare their accuracy and confusion matrices.
4. Tune hyperparameters where possible.\

The goal: Achieve at least **80% accuracy**


# Import different classifiers

In [25]:
# Import different classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

# Store models in dictionary
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=0, max_depth=10),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=0),
    "SVM (Linear)": SVC(kernel='linear'),
    "SVM (RBF)": SVC(kernel='rbf'),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

results = {}

# Train and evaluate each model

In [26]:
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    results[name] = acc
    
    print(f"\n {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", cm)


 Decision Tree
Accuracy: 0.69
Confusion Matrix:
 [[94  3]
 [59 44]]

 Naive Bayes
Accuracy: 0.765
Confusion Matrix:
 [[72 25]
 [22 81]]

 Logistic Regression
Accuracy: 0.71
Confusion Matrix:
 [[76 21]
 [37 66]]

 Random Forest
Accuracy: 0.71
Confusion Matrix:
 [[86 11]
 [47 56]]

 SVM (Linear)
Accuracy: 0.72
Confusion Matrix:
 [[74 23]
 [33 70]]

 SVM (RBF)
Accuracy: 0.735
Confusion Matrix:
 [[90  7]
 [46 57]]

 KNN
Accuracy: 0.585
Confusion Matrix:
 [[70 27]
 [56 47]]


# Results Comparison

In [27]:
# Compare results
results_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
results_df = results_df.sort_values(by="Accuracy", ascending=False)
results_df

Unnamed: 0,Model,Accuracy
1,Naive Bayes,0.765
5,SVM (RBF),0.735
4,SVM (Linear),0.72
2,Logistic Regression,0.71
3,Random Forest,0.71
0,Decision Tree,0.69
6,KNN,0.585


- Our initial Decision Tree model (65.5%) improved significantly by testing other algorithms.
- Naive Bayes (76.5%) is currently the best performer.
- However, we still didn’t reach 80% accuracy.

# Build the model with TF-IDF Vectorizer

In [28]:
# Import models again
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
# Define models
models_tfidf = {
    "Decision Tree": DecisionTreeClassifier(random_state=0, max_depth=10),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=0),
    "SVM (Linear)": SVC(kernel='linear'),
    "SVM (RBF)": SVC(kernel='rbf'),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

results_tfidf = {}

# Train and evaluate
for name, model in models_tfidf.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results_tfidf[name] = acc
    
    print(f"\n {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Decision Tree
Accuracy: 0.69
Confusion Matrix:
 [[94  3]
 [59 44]]

 Naive Bayes
Accuracy: 0.765
Confusion Matrix:
 [[72 25]
 [22 81]]

 Logistic Regression
Accuracy: 0.71
Confusion Matrix:
 [[76 21]
 [37 66]]

 Random Forest
Accuracy: 0.71
Confusion Matrix:
 [[86 11]
 [47 56]]

 SVM (Linear)
Accuracy: 0.72
Confusion Matrix:
 [[74 23]
 [33 70]]

 SVM (RBF)
Accuracy: 0.735
Confusion Matrix:
 [[90  7]
 [46 57]]

 KNN
Accuracy: 0.585
Confusion Matrix:
 [[70 27]
 [56 47]]


In [29]:
# Compare TF-IDF results
results_df_tfidf = pd.DataFrame(list(results_tfidf.items()), columns=["Model", "Accuracy"])
results_df_tfidf = results_df_tfidf.sort_values(by="Accuracy", ascending=False)
results_df_tfidf

Unnamed: 0,Model,Accuracy
1,Naive Bayes,0.765
5,SVM (RBF),0.735
4,SVM (Linear),0.72
2,Logistic Regression,0.71
3,Random Forest,0.71
0,Decision Tree,0.69
6,KNN,0.585


# Increasing Dataset Size by Duplication
Our dataset currently has 1000 reviews.\
To experiment with a larger dataset, we can **duplicate it 3 times** (1000 → 3000 samples).\

This does not add new information, but it can help models average better during training.

In [30]:

df.shape

(1000, 2)

In [31]:
# Duplicate dataset 3 times (1000 -> 3000)
df_expanded = pd.concat([df]*3, ignore_index=True)

print("Original size:", len(df))
print("Expanded size:", len(df_expanded))

df_expanded.head()

Original size: 1000
Expanded size: 3000


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Experiment: Apply All ML Algorithms with TF-IDF on Expanded Dataset

- Preprocess the expanded dataset.
- Convert reviews into TF-IDF features.
- Train multiple ML classifiers.
- Compare their accuracy results.


In [32]:
# 1. Expand dataset (duplicate 3 times)
df_expanded = pd.concat([df]*3, ignore_index=True)

print("Original size:", len(df))
print("Expanded size:", len(df_expanded))

Original size: 1000
Expanded size: 3000


In [33]:
# 2. Text Cleaning & Preprocessing on expanded dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus_expanded = []

ps = PorterStemmer()
for i in range(len(df_expanded)):
    review = re.sub('[^a-zA-Z]', ' ', df_expanded['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_expanded.append(review)

In [34]:
# 3. TF-IDF Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))  # using unigrams + bigrams
X = tfidf.fit_transform(corpus_expanded).toarray()
y = df_expanded.iloc[:, 1].values

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [35]:
# 4. Train Multiple ML Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=0),
    "SVM (Linear)": SVC(kernel='linear'),
    "SVM (RBF)": SVC(kernel='rbf'),
    "Decision Tree": DecisionTreeClassifier(max_depth=20, random_state=0),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

results_expanded = {}

# Train & evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results_expanded[name] = acc
    print(f"\n {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Naive Bayes
Accuracy: 0.9383333333333334
Confusion Matrix:
 [[269  18]
 [ 19 294]]

 Logistic Regression
Accuracy: 0.9566666666666667
Confusion Matrix:
 [[280   7]
 [ 19 294]]

 Random Forest
Accuracy: 0.985
Confusion Matrix:
 [[283   4]
 [  5 308]]

 SVM (Linear)
Accuracy: 0.9683333333333334
Confusion Matrix:
 [[280   7]
 [ 12 301]]

 SVM (RBF)
Accuracy: 0.9833333333333333
Confusion Matrix:
 [[283   4]
 [  6 307]]

 Decision Tree
Accuracy: 0.795
Confusion Matrix:
 [[286   1]
 [122 191]]

 KNN
Accuracy: 0.5633333333333334
Confusion Matrix:
 [[284   3]
 [259  54]]


In [36]:
# 5. Compare Results
results_df_expanded = pd.DataFrame(list(results_expanded.items()), columns=["Model", "Accuracy"])
results_df_expanded = results_df_expanded.sort_values(by="Accuracy", ascending=False)
results_df_expanded

Unnamed: 0,Model,Accuracy
2,Random Forest,0.985
4,SVM (RBF),0.983333
3,SVM (Linear),0.968333
1,Logistic Regression,0.956667
0,Naive Bayes,0.938333
5,Decision Tree,0.795
6,KNN,0.563333




- Switching to TF-IDF with bigrams and expanding the dataset gave a huge accuracy boost (from ~76% → ~98%).
- For practical use, SVM (RBF) and Random Forest are the most reliable.
- Logistic Regression and Naive Bayes remain excellent fast baselines.