Importing and formatting dataset

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are readable.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from ast import literal_eval

In [None]:
import pandas as pd
# Read the NER dataset from the mounted Drive into a DataFrame.
df=pd.read_csv("/content/drive/MyDrive/NER_Dataset.csv")

In [None]:
df.head()

In [None]:
# Convert string representations of lists into actual lists.
# The isinstance guard keeps this cell idempotent: re-running it after the
# columns have already been parsed would otherwise hand a list to
# literal_eval, which raises ValueError.
for column in ('Word', 'POS', 'Tag'):
    df[column] = df[column].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

df.head()

In [None]:
# Rebuild each of the first 10,000 sentences as one space-separated string.
# str.join yields the same text as concatenating word by word, and it also
# copes with an empty word list (the previous row['Word'][-1] lookup raised
# IndexError in that case).
text_from_df = [" ".join(words) for words in df['Word'].head(10000)]

In [None]:
# Preview the first ten reconstructed sentences.
for sentence_no in range(10):
  sentence = text_from_df[sentence_no]
  print(f"Text {sentence_no} : {sentence}")

In [None]:
# Show the gold POS tag list of each of the first ten sentences.
for pos_sequence in df['POS'].head(10):
  print(str(pos_sequence))

POS and NER Tagging Using spaCy

In [None]:
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

import seaborn as sns
import matplotlib.pyplot as plt

# Load spaCy's small English pipeline; its POS tags (token.tag_) and entity
# labels (token.ent_type_) are what the cells below evaluate.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Map the dataset's BIO entity tags onto flat labels so the gold tags can be
# compared with the token.ent_type_ strings collected from spaCy. The B-/I-
# prefixes collapse to the same label and "O" maps to the empty string.
# NOTE(review): "ART" does not look like a label en_core_web_sm emits (its
# label set has WORK_OF_ART), and spaCy tends to label dates as DATE rather
# than TIME — confirm these targets, otherwise those classes can never match.
# NOTE(review): "nat" -> "NORP" presumes "nat" means nationality; verify
# against the dataset's tag definitions.
tag_mapping = {
    "B-art": "ART",
    "B-eve": "EVENT",
    "B-geo": "GPE",
    "B-gpe": "GPE",
    "B-nat": "NORP",
    "B-org": "ORG",
    "B-per": "PERSON",
    "B-tim": "TIME",
    "I-art": "ART",
    "I-eve": "EVENT",
    "I-geo": "GPE",
    "I-gpe": "GPE",
    "I-nat": "NORP",
    "I-org": "ORG",
    "I-per": "PERSON",
    "I-tim": "TIME",
    "O": ""
}

In [None]:
import re

# Function to preprocess words
def preprocess_words(words):
    """Strip hyphens, tildes and apostrophes from every word.

    Returns a new list holding one cleaned string per input word, in the same
    order. A word made up solely of those characters becomes an empty string.
    """
    strip_pattern = r'[-~\']'
    return [re.sub(strip_pattern, '', token) for token in words]

In [None]:
# Number of sentences to visualise (a small sample used as a sanity check)
n = 10

for i in range(n):
    # Preprocess words before joining
    processed_words = preprocess_words(df['Word'][i])
    text = ' '.join(processed_words)
    doc = nlp(text)
    pos_tags = [token.tag_ for token in doc]
    # Only render sentences whose spaCy token count equals the dataset's word count.
    if len(df['Word'][i]) == len(pos_tags):
        # Dependency-parse diagram
        displacy.render(doc, style="dep", jupyter=True, options={'distance':100})
        # Highlighted named entities
        displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Number of records in the dataset
n = len(df)

# Accumulators for gold and predicted tags across every aligned sentence
pos_predictions = []
ner_predictions = []
pos_true = []
ner_true = []

for i in range(n):
    words = df['Word'][i]
    doc = nlp(' '.join(preprocess_words(words)))
    # Skip sentences where spaCy's token count differs from the dataset's
    # word count: their tags cannot be compared position by position.
    if len(doc) != len(words):
        continue
    pos_predictions += [token.tag_ for token in doc]
    pos_true += df['POS'][i]
    # Entity label of each token (empty string when spaCy assigns none)
    ner_predictions += [token.ent_type_ for token in doc]
    ner_true += df['Tag'][i]


In [None]:
# Translate every gold tag through tag_mapping so it is comparable with the
# spaCy predictions; a tag missing from the dictionary raises KeyError.
mapped_ner_true = [tag_mapping[tag] for tag in ner_true]

In [None]:
# Per-class precision / recall / F1 of spaCy's POS tags against the gold tags
pos_report = classification_report(pos_true, pos_predictions)
print("POS Tagging Classification Report:", pos_report, sep="\n")

In [None]:
# Per-class precision / recall / F1 of spaCy's entity labels against the mapped gold tags
tag_report = classification_report(mapped_ner_true, ner_predictions)
print("Tag Classification Report:", tag_report, sep="\n")

In [None]:
# Token-level accuracy of spaCy's POS tags
accuracy_pos = accuracy_score(pos_true, pos_predictions)

# F1 averaged over classes, weighted by each class's support
f1_pos = f1_score(pos_true, pos_predictions, average='weighted')

In [None]:
# Token-level accuracy of spaCy's entity labels against the mapped gold tags
accuracy_tag = accuracy_score(mapped_ner_true, ner_predictions)

# F1 averaged over classes, weighted by each class's support
f1_tag = f1_score(mapped_ner_true, ner_predictions, average='weighted')

In [None]:
# Summary of the spaCy POS scores computed above
print(f"Accuracy of Spacy model on pos is {accuracy_pos}")
print(f"F1 Score of Spacy model on pos is {f1_pos}")

# Summary of the spaCy entity-tag scores computed above
print(f"Accuracy of Spacy model on tag is {accuracy_tag}")
print(f"F1 Score of Spacy model on tag is {f1_tag}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [None]:
# Confusion matrix of gold vs. predicted POS tags (spaCy), shown as raw counts
conf_matrix_pos = confusion_matrix(pos_true, pos_predictions)
print("Confusion Matrix for Spacy model on pos is :", conf_matrix_pos, sep="\n")

In [None]:
# Confusion matrix of mapped gold vs. predicted entity labels (spaCy), shown as raw counts
conf_matrix_tag = confusion_matrix(mapped_ner_true, ner_predictions)
print("Confusion Matrix for Spacy model on tag is :", conf_matrix_tag, sep="\n")

POS and NER Tagging Using NLTK

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk import pos_tag, word_tokenize, ne_chunk
from nltk.tree import Tree

In [None]:
# Fetch the NLTK resources used below: the POS tagger model, the named-entity
# chunker with its word list, and the punkt tokenizer data.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')

In [None]:
# Original to NLTK Tags Mapping
# Gold BIO tags are collapsed onto the chunk labels expected from ne_chunk;
# tags judged to have no NLTK counterpart are treated as "O" (no entity).
# NOTE(review): confirm that ne_chunk actually emits LOCATION and TIME for
# this data; labels it emits that are absent here can never match a gold tag.
tag_mapping_nltk = {
    "B-art": "O",  # Assuming no direct equivalent for B-art in NLTK tags
    "B-eve": "O",  # Assuming no direct equivalent for B-eve in NLTK tags
    "B-geo": "LOCATION",
    "B-gpe": "GPE",
    "B-nat": "O",  # Assuming no direct equivalent for B-nat in NLTK tags
    "B-org": "ORGANIZATION",
    "B-per": "PERSON",
    "B-tim": "TIME",
    "I-art": "O",  # Assuming no direct equivalent for I-art in NLTK tags
    "I-eve": "O",  # Assuming no direct equivalent for I-eve in NLTK tags
    "I-geo": "LOCATION",
    "I-gpe": "GPE",
    "I-nat": "O",  # Assuming no direct equivalent for I-nat in NLTK tags
    "I-org": "ORGANIZATION",
    "I-per": "PERSON",
    "I-tim": "TIME",
    "O": "O"
}

In [None]:
# Number of records in the dataset
n = 10

# Function to convert NLTK tree structure to flat list
def extract_ner_tags(tree):
    """Flatten an ``ne_chunk`` result into one tag string per token.

    Args:
        tree: The tree returned by ``ne_chunk``; its children are either
            entity subtrees or plain ``(word, pos)`` leaves.

    Returns:
        list[str]: ``"<LABEL>-<word>"`` for every token inside an entity
        chunk and ``"O"`` for every token outside one, in sentence order.
    """
    ner_tags = []
    for subtree in tree:
        if isinstance(subtree, Tree):
            # A chunk may span several tokens. Emit one entry per leaf so the
            # output has exactly as many tags as the sentence has tokens
            # (previously only the chunk's first token was reported).
            label = subtree.label()
            ner_tags.extend(f"{label}-{word}" for word, _pos in subtree.leaves())
        else:
            ner_tags.append("O")  # Outside named entities
    return ner_tags


for i in range(n):
    words = df['Word'][i]
    # Clean the words, rebuild the sentence and let NLTK tokenize it again
    tokens = word_tokenize(' '.join(preprocess_words(words)))

    # Predict POS tags
    pos_tags = pos_tag(tokens)

    # Ignore sentences whose NLTK token count differs from the dataset's word count
    if len(pos_tags) != len(words):
        continue
    print(pos_tags)

    # Predict NER tags using NLTK's ne_chunk and show them flattened
    print(extract_ner_tags(ne_chunk(pos_tags)))

In [None]:
# Number of records in the dataset
n = len(df)

# Function to convert NLTK tree structure to flat list
def extract_ner_tags(tree):
    """Flatten an ``ne_chunk`` result into one tag string per token.

    Args:
        tree: The tree returned by ``ne_chunk``; its children are either
            entity subtrees or plain ``(word, pos)`` leaves.

    Returns:
        list[str]: ``"<LABEL>-<pos>"`` for every token inside an entity
        chunk and ``"O"`` for every token outside one, in sentence order.
    """
    ner_tags = []
    for subtree in tree:
        if isinstance(subtree, Tree):
            # A chunk may span several tokens. Emit one entry per leaf so the
            # output length equals the token count; with a single entry per
            # chunk, every sentence containing a multi-token entity failed
            # the length comparison against the gold tags and was dropped.
            label = subtree.label()
            ner_tags.extend(f"{label}-{pos}" for _word, pos in subtree.leaves())
        else:
            ner_tags.append("O")  # Outside named entities
    return ner_tags

# Gold and predicted POS / NER tags, accumulated over every usable sentence
pos_predictions_nltk, pos_true_nltk = [], []
ner_predictions_nltk, ner_true_nltk = [], []

for i in range(n):
    words = df['Word'][i]
    # Clean the words, rebuild the sentence and let NLTK tokenize it again
    tokens = word_tokenize(' '.join(preprocess_words(words)))

    # Predict POS tags
    pos_tags = pos_tag(tokens)

    # Sentences whose token count differs from the dataset's are skipped entirely
    if len(pos_tags) != len(words):
        continue
    pos_predictions_nltk += [tag for _, tag in pos_tags]
    pos_true_nltk += df['POS'][i]

    # Predict NER tags using NLTK's ne_chunk
    chunked = ne_chunk(pos_tags)

    # Keep only the part before the first "-" (the entity label); "O" is unchanged
    ner_tags_flat = [tag.split("-")[0] for tag in extract_ner_tags(chunked)]
    gold_tags = df['Tag'][i]
    # NER tags are recorded only when prediction and gold lists line up
    if len(ner_tags_flat) == len(gold_tags):
        ner_predictions_nltk += ner_tags_flat
        ner_true_nltk += gold_tags

In [None]:
# Translate every gold tag through tag_mapping_nltk so it is comparable with
# the NLTK predictions; a tag missing from the dictionary raises KeyError.
mapped_ner_true_nltk = [tag_mapping_nltk[tag] for tag in ner_true_nltk]

In [None]:
# Per-class precision / recall / F1 of NLTK's POS tags against the gold tags
pos_report_nltk = classification_report(pos_true_nltk, pos_predictions_nltk)
print("POS Tagging Classification Report for nltk:", pos_report_nltk, sep="\n")

In [None]:
# Per-class precision / recall / F1 of NLTK's entity labels against the mapped gold tags
tag_report_nltk = classification_report(mapped_ner_true_nltk, ner_predictions_nltk)
print("Tag Classification Report for nltk:", tag_report_nltk, sep="\n")

In [None]:
# POS: token-level accuracy of NLTK's tags
accuracy_pos_nltk = accuracy_score(pos_true_nltk, pos_predictions_nltk)
# POS: F1 averaged over classes, weighted by support
f1_pos_nltk = f1_score(pos_true_nltk, pos_predictions_nltk, average='weighted')

# NER: token-level accuracy against the mapped gold tags
accuracy_tag_nltk = accuracy_score(mapped_ner_true_nltk, ner_predictions_nltk)
# NER: F1 averaged over classes, weighted by support
f1_tag_nltk = f1_score(mapped_ner_true_nltk, ner_predictions_nltk, average='weighted')

# Print the four scores computed above
print(f"Accuracy of NLTK model on POS is {accuracy_pos_nltk}")
print(f"F1 Score of NLTK model on POS is {f1_pos_nltk}")
print(f"Accuracy of NLTK model on tag is {accuracy_tag_nltk}")
print(f"F1 Score of NLTK model on tag is {f1_tag_nltk}")

In [None]:
# Confusion matrix of gold vs. predicted POS tags (NLTK), shown as raw counts
conf_matrix_pos_nltk = confusion_matrix(pos_true_nltk, pos_predictions_nltk)
print("Confusion Matrix for nltk model on pos is :", conf_matrix_pos_nltk, sep="\n")

In [None]:
# Confusion matrix of mapped gold vs. predicted entity labels (NLTK), shown as raw counts
conf_matrix_tag_nltk = confusion_matrix(mapped_ner_true_nltk, ner_predictions_nltk)
print("Confusion Matrix for nltk model on tag is :", conf_matrix_tag_nltk, sep="\n")

Importing and formatting dataset (for sentiment analysis)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import nltk

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [None]:
nltk.download('punkt')

In [None]:
import json
import gzip
import random
import pandas as pd
from google.colab import files
from google.colab import drive

In [None]:
drive.mount('/content/drive')

In [None]:
import json
import gzip
import random
import pandas as pd
from google.colab import files

# Upload the gzip compressed JSON files
# (interactive Colab file picker; the result is keyed by file name)
uploaded_files = files.upload()

# List of file names
file_names = list(uploaded_files.keys())

# Function to load and process the data
def load_and_process_data(file_name):
    """Read a gzip-compressed JSON-lines file into a list of records.

    Args:
        file_name: Path of the gzip file; every non-blank line must hold one
            JSON document.

    Returns:
        list: The decoded JSON value of each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # and skip blank lines, which json.loads would reject.
    with gzip.open(file_name, 'rt', encoding='utf-8') as file:
        reviews = [json.loads(line) for line in file if line.strip()]

    return reviews

# Parse every uploaded file into its own list of review records
datasets = [load_and_process_data(name) for name in file_names]

# Concatenate the per-file lists into one flat list of reviews
combined_dataset = []
for reviews in datasets:
    combined_dataset.extend(reviews)

# Build a DataFrame with one row per review.
# Note: this rebinds `df`, replacing the NER frame used earlier in the notebook.
df = pd.DataFrame(combined_dataset)

df.head()

In [None]:
# Map overall ratings to binary classes (ratings <= 2 are negative)
df['sentiment'] = df['overall'].apply(lambda x: 'negative' if x <= 2 else 'positive')

# Select only the desired columns
# NOTE(review): 'sentiment' computed above is not part of this selection, so
# it does not reach df_1 — confirm whether that is intended.
df_1 = df[['reviewText', 'summary', 'overall']]

# Display the modified DataFrame
df_1

In [None]:
import pandas as pd

# Single location used for BOTH writing and reading the cleaned data.
# The file used to be written to the working directory but read back from
# Drive, so the frame that was loaded was not necessarily the one just built.
DF_2_CSV_PATH = "/content/drive/MyDrive/df_2.csv"

# Remove rows with a missing value in any of the selected columns
df_2 = df_1.dropna()
df_2.to_csv(DF_2_CSV_PATH, index=False)

# Reload from disk so later sessions can start from this file.
# Note: read_csv may turn text such as "NA" or an empty field into NaN again.
df_2 = pd.read_csv(DF_2_CSV_PATH)
df_2

In [None]:
from sklearn.model_selection import train_test_split

# Features are the two text columns; the target is the raw rating value.
X = df_2[['reviewText', 'summary']]
y = df_2['overall']

# Define the train-val split with stratification
# (80/20 split, fixed seed, class proportions of y preserved in both parts)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Download NLTK stopwords (needed by stopwords.words('english') below)
nltk.download('stopwords')

# Initialize PorterStemmer (shared by every preprocess_text call)
porter = PorterStemmer()

# Translation table that deletes every ASCII punctuation character; built once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache of the English stop-word set.
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one review string for bag-of-words vectorisation.

    Steps: lowercase, delete punctuation, tokenize, drop English stop words,
    Porter-stem the remaining tokens and join them with single spaces.

    Args:
        text: The raw text; a missing value (None / NaN) is accepted.

    Returns:
        str: The normalised text, or '' when ``text`` is missing.
    """
    if pd.isnull(text):  # Check if the text is NaN
        return ''

    # str() keeps a non-string cell (e.g. a review parsed as a number) from
    # crashing on .lower(); for real strings it is a no-op.
    text = str(text).lower()

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and apply stemming. The stop-word set is cached because
    # rebuilding it for every review is pure overhead.
    stop_words = _english_stop_words()
    tokens = [porter.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Apply preprocessing to both reviewText and summary.
# Work on explicit copies first: the frames returned by train_test_split are
# derived from another frame, and assigning columns on them directly can
# trigger pandas' SettingWithCopyWarning.
# Note: this cell is not idempotent — running it twice stems the text twice.
X_train = X_train.copy()
X_val = X_val.copy()

for text_column in ('reviewText', 'summary'):
    X_train[text_column] = X_train[text_column].apply(preprocess_text)
    X_val[text_column] = X_val[text_column].apply(preprocess_text)

In [None]:
# Variants of the splits that keep only the review body (summary column removed).
X_train_dr = X_train.drop('summary', axis=1)
X_val_dr = X_val.drop('summary', axis=1)

In [None]:
import matplotlib.pyplot as plt

# How many reviews carry each rating value
class_counts = df_2['overall'].value_counts()

# Bar chart of the counts, ordered by rating
plt.figure(figsize=(10, 6))
rating_ax = class_counts.sort_index().plot(kind='bar', color='skyblue')
rating_ax.set_title('Distribution of Classes in df_2')
rating_ax.set_xlabel('Overall Rating')
rating_ax.set_ylabel('Count')
plt.show()

In [None]:
# Binary sentiment label: 1 (positive) for ratings of 3 and above, else 0 (negative)
df_2['sentiment'] = df_2['overall'].apply(lambda x: 1 if x >= 3 else 0)

# Number of reviews per sentiment class
sentiment_counts = df_2['sentiment'].value_counts()

# Bar chart of the two class counts, negative first
plt.figure(figsize=(6, 4))
sentiment_ax = sentiment_counts.sort_index().plot(kind='bar', color='salmon')
sentiment_ax.set_title('Distribution of Sentiments in df_2 (Binary)')
sentiment_ax.set_xlabel('Sentiment (0: Negative, 1: Positive)')
sentiment_ax.set_ylabel('Count')
plt.show()

Amazon Review Sentiment Analysis using CountVectorizer

In [None]:
# Evaluate performance
def evaluate_performance(y_true, y_pred, classifier_name, average='weighted'):
    """Print accuracy, F1, confusion matrix and classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        classifier_name: Name shown in the printed header.
        average: Averaging mode forwarded to ``f1_score``. Defaults to
            ``'weighted'``, the mode this multiclass section has always used.

    Returns:
        None. Everything is written to standard output.
    """
    accuracy = accuracy_score(y_true, y_pred)
    confusion = confusion_matrix(y_true, y_pred)
    classification_rep = classification_report(y_true, y_pred)
    F1_score = f1_score(y_true, y_pred, average=average)

    print(f"Performance for {classifier_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1_score: {F1_score}")
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(classification_rep)
    print("\n")

In [None]:
# Bag-of-words over review body + summary, capped at 5000 features.
vectorizer = CountVectorizer(max_features=5000)

# The vocabulary is learned on the training text only; validation text is
# encoded with that same vocabulary.
X_train_bow = vectorizer.fit_transform(X_train['reviewText'] + ' ' + X_train['summary'])
X_val_bow = vectorizer.transform(X_val['reviewText'] + ' ' + X_val['summary'])

In [None]:
# Bag-of-words over the review body only (summary left out).
# NOTE: this rebinds `vectorizer`; from here on the name refers to the
# vectorizer fitted on reviewText alone, not the one fitted on body + summary.
vectorizer = CountVectorizer(max_features=5000)

X_train_bow_dr = vectorizer.fit_transform(X_train['reviewText'])
X_val_bow_dr = vectorizer.transform(X_val['reviewText'])

In [None]:
# Initialize classifiers
# The tree-based models are randomised, so they get a fixed random_state
# (the same seed the train/validation split uses) to make the reported
# scores reproducible across runs.
nb_classifier = MultinomialNB()
gnb_classifier = GaussianNB()
dt_classifier_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_classifier_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
rf_classifier_20 = RandomForestClassifier(n_estimators=20, random_state=42)
rf_classifier_50 = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier_100 = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train classifiers on the body + summary bag-of-words features
nb_classifier.fit(X_train_bow, y_train)
gnb_classifier.fit(X_train_bow.toarray(), y_train)  # GaussianNB expects dense matrix
dt_classifier_entropy.fit(X_train_bow, y_train)
dt_classifier_gini.fit(X_train_bow, y_train)

In [None]:
# Predictions on the validation features
nb_preds = nb_classifier.predict(X_val_bow)
gnb_preds = gnb_classifier.predict(X_val_bow.toarray())  # dense input, as in training
dt_preds_entropy = dt_classifier_entropy.predict(X_val_bow)
dt_preds_gini = dt_classifier_gini.predict(X_val_bow)

In [None]:
# Print the validation metrics of each classifier, in training order
validation_runs = [
    (nb_preds, 'Naive Bayes'),
    (gnb_preds, 'Gaussian Naive Bayes'),
    (dt_preds_entropy, 'Decision Tree (Entropy)'),
    (dt_preds_gini, 'Decision Tree (Gini)'),
]
for predicted_labels, model_label in validation_runs:
    evaluate_performance(y_val, predicted_labels, model_label)

In [None]:
# Fit the 20-tree forest and predict the validation set in one chained call
# (fit returns the estimator itself)
rf_preds_20 = rf_classifier_20.fit(X_train_bow, y_train).predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_20, 'Random Forest (20 trees)')

In [None]:
import joblib
# 50-tree forest for the reviewText-only features; seeded so that the saved
# model and its reported scores are reproducible.
rf_classifier_50_dr = RandomForestClassifier(n_estimators=50, random_state=42)

In [None]:
# Train the 50-tree forest on the body + summary features
rf_classifier_50.fit(X_train_bow, y_train)

# Save the trained model (written to the current working directory)
joblib.dump(rf_classifier_50, 'rf_classifier_50.pkl')

# Predictions on the validation features
rf_preds_50 = rf_classifier_50.predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_50, 'Random Forest (50 trees)')

In [None]:
# Train the 50-tree forest on the reviewText-only features
rf_classifier_50_dr.fit(X_train_bow_dr, y_train)

# Save the trained model (written to the current working directory)
joblib.dump(rf_classifier_50_dr, 'rf_classifier_50_dr.pkl')

# Predictions on the reviewText-only validation features
rf_preds_50_dr = rf_classifier_50_dr.predict(X_val_bow_dr)

# Print its validation metrics. The label names the feature set so this
# report cannot be confused with the body + summary 50-tree forest.
evaluate_performance(y_val, rf_preds_50_dr, 'Random Forest (50 trees, reviewText only)')

In [None]:
# Load the saved model
loaded_model = joblib.load('rf_classifier_50_dr.pkl')
def generate_predictions(test_file, output_filename='Group_17_cv.csv',
                         model=None, text_vectorizer=None):
    """Predict a rating for every review in a CSV file and save the result.

    Args:
        test_file: Path of a CSV file with a 'review_text' column.
        output_filename: Where the 'review_text' / 'result' CSV is written.
        model: Fitted classifier to use. Defaults to the notebook-level
            ``loaded_model``.
        text_vectorizer: Fitted vectorizer matching ``model``. Defaults to
            the notebook-level ``vectorizer`` as bound when the function is
            called. That name is rebound several times in this notebook, so
            pass the vectorizer explicitly when calling out of order.

    Returns:
        None. The predictions are written to ``output_filename``.
    """
    if model is None:
        model = loaded_model
    if text_vectorizer is None:
        text_vectorizer = vectorizer

    # Load the test data
    test_df = pd.read_csv(test_file)

    # Preprocess a separate Series so the saved file keeps the original text
    cleaned_reviews = test_df['review_text'].apply(preprocess_text)

    # Encode with the already-fitted vocabulary and predict
    X_test = text_vectorizer.transform(cleaned_reviews)
    predictions = model.predict(X_test)

    # Add predictions to the DataFrame
    test_df['result'] = predictions

    # Save the original text together with its predicted label
    test_df.to_csv(output_filename, index=False, columns=['review_text', 'result'])

# Predict for the group's test file; writes Group_17_cv.csv
generate_predictions('Group_17.csv')

In [None]:
# Load the test data
test_df = pd.read_csv('Group_17.csv')

# Preprocess the text data. The column is 'review_text', the name the
# prediction cell uses for this same file ('review' did not match it).
test_df['review_text'] = test_df['review_text'].apply(preprocess_text)

# Inspect the preprocessed test frame.
# The fitted `vectorizer` is deliberately left untouched here: rebinding it
# to a fresh, unfitted CountVectorizer would break any later transform call.
X_test = test_df
X_test

In [None]:
# Fit the 100-tree forest and predict the validation set in one chained call
# (fit returns the estimator itself)
rf_preds_100 = rf_classifier_100.fit(X_train_bow, y_train).predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_100, 'Random Forest (100 trees)')

In [None]:
class_labels = [1, 2, 3, 4, 5]

In [None]:
# Confusion matrix for Naive Bayes (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, nb_preds)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Naive Bayes')
plt.show()

In [None]:
# Confusion matrix for Gaussian Naive Bayes (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, gnb_preds)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Gaussian Naive Bayes')
plt.show()

In [None]:
# Confusion matrix for the entropy decision tree (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, dt_preds_entropy)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Decision Tree Entropy')
plt.show()

In [None]:
# Confusion matrix for the Gini decision tree (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, dt_preds_gini)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Decision Tree Gini')
plt.show()

In [None]:
# Confusion matrix for the 20-tree forest (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, rf_preds_20)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Random Forest 20')
plt.show()

In [None]:
# Confusion matrix for the 50-tree forest (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, rf_preds_50)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Random Forest 50')
plt.show()

In [None]:
# Confusion matrix for the 100-tree forest (rows: true rating, columns: predicted rating)
conf_mat = confusion_matrix(y_val, rf_preds_100)

# Annotated heatmap with the rating classes on both axes
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Multiclass Classification Random Forest 100')
plt.show()

In [None]:
# Evaluate performance
def evaluate_performance(y_true, y_pred, classifier_name, average='binary'):
    """Print accuracy, F1, confusion matrix and classification report.

    This definition replaces the earlier multiclass one for the binary
    sentiment section.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        classifier_name: Name shown in the printed header.
        average: Averaging mode forwarded to ``f1_score``. Defaults to
            ``'binary'``, which is also ``f1_score``'s own default and so
            matches the previous behaviour of this function.

    Returns:
        None. Everything is written to standard output.
    """
    accuracy = accuracy_score(y_true, y_pred)
    confusion = confusion_matrix(y_true, y_pred)
    classification_rep = classification_report(y_true, y_pred)
    F1_score = f1_score(y_true, y_pred, average=average)

    print(f"Performance for {classifier_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1_score: {F1_score}")
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(classification_rep)
    print("\n")

In [None]:
# Define the train-val split with stratification.
# Missing text is replaced by '' first: df_2 comes from read_csv, which may
# parse some text cells as NaN, and CountVectorizer rejects NaN documents.
# NOTE(review): unlike the multiclass section, the text here is not passed
# through preprocess_text and the vocabulary is not capped — confirm that
# this difference is intended.
X = df_2[['reviewText', 'summary']].fillna('')
y = (df_2['overall'] > 2).astype(int)  # 1 for positive (3, 4, 5), 0 for negative (1, 2)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Rebinds `vectorizer` (and the X_*/y_* names above) for the binary task
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train['reviewText'] + ' ' + X_train['summary'])
X_val_bow = vectorizer.transform(X_val['reviewText'] + ' ' + X_val['summary'])

In [None]:
# Initialize classifiers (fresh, unfitted instances for the binary task)
# The tree-based models are randomised, so they get a fixed random_state
# (the same seed the train/validation split uses) to make the reported
# scores reproducible across runs.
nb_classifier = MultinomialNB()
gnb_classifier = GaussianNB()
dt_classifier_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_classifier_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
rf_classifier_20 = RandomForestClassifier(n_estimators=20, random_state=42)
rf_classifier_50 = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier_100 = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train classifiers on the binary labels (gnb_classifier is not trained in this section)
nb_classifier.fit(X_train_bow, y_train)
dt_classifier_entropy.fit(X_train_bow, y_train)
dt_classifier_gini.fit(X_train_bow, y_train)

In [None]:
# Predictions on the binary validation features
nb_preds = nb_classifier.predict(X_val_bow)
dt_preds_entropy = dt_classifier_entropy.predict(X_val_bow)
dt_preds_gini = dt_classifier_gini.predict(X_val_bow)

In [None]:
# Print the validation metrics of each binary classifier, in training order
validation_runs = [
    (nb_preds, 'Naive Bayes'),
    (dt_preds_entropy, 'Decision Tree (Entropy)'),
    (dt_preds_gini, 'Decision Tree (Gini)'),
]
for predicted_labels, model_label in validation_runs:
    evaluate_performance(y_val, predicted_labels, model_label)

In [None]:
# Fit the 20-tree forest on the binary labels and predict the validation set
# in one chained call (fit returns the estimator itself)
rf_preds_20 = rf_classifier_20.fit(X_train_bow, y_train).predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_20, 'Random Forest (20 trees)')


In [None]:
# Fit the 50-tree forest on the binary labels and predict the validation set
# in one chained call (fit returns the estimator itself)
rf_preds_50 = rf_classifier_50.fit(X_train_bow, y_train).predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_50, 'Random Forest (50 trees)')

In [None]:
# Fit the 100-tree forest on the binary labels and predict the validation set
# in one chained call (fit returns the estimator itself)
rf_preds_100 = rf_classifier_100.fit(X_train_bow, y_train).predict(X_val_bow)

# Print its validation metrics
evaluate_performance(y_val, rf_preds_100, 'Random Forest (100 trees)')


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Confusion matrix for Naive Bayes (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, nb_preds)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Naive Bayes')
plt.show()

In [None]:
# Confusion matrix for the entropy decision tree (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, dt_preds_entropy)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Decision Tree Entropy')
plt.show()

In [None]:
# Confusion matrix for the Gini decision tree (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, dt_preds_gini)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Decision Tree Gini')
plt.show()

In [None]:
# Confusion matrix for the 20-tree forest (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, rf_preds_20)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Random Forest 20')
plt.show()

In [None]:
# Confusion matrix for the 50-tree forest (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, rf_preds_50)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Random Forest 50')
plt.show()

In [None]:
# Confusion matrix for the 100-tree forest (rows: true class, columns: predicted class)
conf_mat = confusion_matrix(y_val, rf_preds_100)

# Annotated heatmap with the sentiment names on both axes
sentiment_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix for Binary Classification Random Forest 100')
plt.show()