In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Download the NLTK stopwords corpus (reported as already up to date when it is
# present locally); preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')

# Function to preprocess text
# Built once rather than per token: stopwords.words('english') returns a list,
# so calling it inside the comprehension rebuilt that list and scanned it
# linearly for every single word of every review. A frozenset gives constant-time
# membership tests with identical filtering results.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise a raw document into a space-separated string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase,
      3. split on whitespace,
      4. drop English stop words,
      5. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (empty string if
        nothing survives).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)

### PART 1: SENTIMENT ANALYSIS WITH NAIVE BAYES ###
# Load IMDB Dataset (Assumed CSV format)
df = pd.read_csv("/content/drive/MyDrive/Dataset/IMDB Dataset.csv")
df['clean_text'] = df['review'].apply(preprocess_text)

# Split Data (raw text kept under its own names so it is not shadowed by the
# vectorized matrices below)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['clean_text'], df['sentiment'], test_size=0.2, random_state=42
)

# Vectorization
# Keep the bag-of-words matrices sparse. Calling .toarray() materialises a dense
# (documents x vocabulary) array, which for a full review corpus can exhaust
# memory; MultinomialNB fits and predicts on sparse input directly.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train_text)
X_test = cv.transform(X_test_text)

# Train Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Evaluate Model
y_pred = nb_model.predict(X_test)
# Column 1 of predict_proba is the probability of nb_model.classes_[1];
# this assumes a binary sentiment label -- TODO confirm against the CSV.
y_score = nb_model.predict_proba(X_test)[:, 1]
print("Naive Bayes Sentiment Analysis Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

### PART 2: FEATURE SELECTION WITH RFE ###
# Load Breast Cancer Dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split Data (X_train / X_test are left unscaled for any later consumer)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features, fitting the scaler on the training split only.
# RFE with a linear estimator ranks features by coefficient magnitude, which is
# only comparable across features that share a scale; unscaled inputs also tend
# to stop lbfgs from converging within max_iter=200.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression Model
model = LogisticRegression(max_iter=200)

# Apply Recursive Feature Elimination (RFE)
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)

# Selected Features (boolean mask over the input columns, and per-feature rank)
selected_features = rfe.support_
ranking = rfe.ranking_
print("Selected Features:", list(data.feature_names[selected_features]))

# Transform Data using Selected Features
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)

# Retrain Model on Selected Features
model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)

# Evaluate Model Performance
print("\nFeature Selection with RFE Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfe))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rfe))

### PART 3: PIPELINE IMPLEMENTATION ###
# Pipeline, StandardScaler, RandomForestClassifier and GridSearchCV are imported
# in the notebook's top import cell.

# Define a pipeline: scale -> keep 5 features via RFE -> random forest.
# Wrapping the steps in a Pipeline means the scaler and RFE are re-fitted inside
# every cross-validation fold of the grid search.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', RFE(estimator=LogisticRegression(max_iter=200), n_features_to_select=5)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Hyperparameter tuning (keys use the <step name>__<parameter> convention)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Best model evaluation
y_pred_grid = grid_search.best_estimator_.predict(X_test)
print("\nPipeline with Feature Selection Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_grid))
print("Classification Report:\n", classification_report(y_test, y_pred_grid))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_grid))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download the NLTK stopwords corpus so this cell can run on its own;
# preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')

# Function to preprocess text
# Loaded a single time: stopwords.words('english') hands back a list, and the
# previous per-token call both rebuilt it and searched it linearly for every
# word. A frozenset keeps the same filtering with constant-time lookups.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Clean a raw document and return its stemmed, stop-word-free tokens.

    Non-letter characters are replaced by spaces, the text is lowercased and
    split on whitespace, English stop words are removed, and each remaining
    token is reduced with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces (empty string if none remain).
    """
    cleaned = re.sub('[^a-zA-Z]', ' ', text)
    kept = (word for word in cleaned.lower().split() if word not in _STOP_WORDS)
    return ' '.join(_STEMMER.stem(word) for word in kept)

### PART 1: SENTIMENT ANALYSIS WITH NAIVE BAYES ###
# Load IMDB Dataset (Assumed CSV format)
df = pd.read_csv("/content/drive/MyDrive/Dataset/IMDB Dataset.csv")
df['clean_text'] = df['review'].apply(preprocess_text)

# Split Data (text splits get their own names instead of being overwritten by
# the count matrices)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['clean_text'], df['sentiment'], test_size=0.2, random_state=42
)

# Vectorization
# The count matrices stay sparse: densifying them with .toarray() allocates a
# full (documents x vocabulary) array and can run the kernel out of memory,
# while MultinomialNB works on the sparse form directly.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train_text)
X_test = cv.transform(X_test_text)

# Train Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Evaluate Model
y_pred = nb_model.predict(X_test)
# Score for nb_model.classes_[1]; assumes exactly two sentiment classes -- TODO confirm.
y_score = nb_model.predict_proba(X_test)[:, 1]
print("Naive Bayes Sentiment Analysis Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_score))

### PART 2: FEATURE SELECTION ###
# SelectKBest, f_classif, Lasso and StandardScaler come from the notebook's top
# import cell.

# Load Breast Cancer Dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (scaler fitted on the training split only). Both the
# coefficient-based RFE ranking and the L1 penalty in Lasso depend on feature
# scale, so the selectors below are all fitted on the standardized data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Univariate Feature Selection
univariate_selector = SelectKBest(score_func=f_classif, k=5)
X_train_uni = univariate_selector.fit_transform(X_train_scaled, y_train)
X_test_uni = univariate_selector.transform(X_test_scaled)

# Wrapper Method (RFE)
model = LogisticRegression(max_iter=200)
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)

# Embedded Method (Lasso): keep the columns whose coefficient was not shrunk to zero
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
selected_features_embedded = np.where(lasso.coef_ != 0)[0]
X_train_embedded = X_train_scaled[:, selected_features_embedded]
X_test_embedded = X_test_scaled[:, selected_features_embedded]

# Model Training and Evaluation (only the RFE feature set is evaluated here)
model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)
print("\nFeature Selection with RFE Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfe))

### PART 3: NAIVE BAYES FROM SCRATCH ###
def train_naive_bayes(X, y):
    """Collect the counts a multinomial Naive Bayes text classifier is built from.

    Class keys are taken from the labels actually seen in ``y`` rather than
    being fixed to 0 and 1, so string labels (e.g. 'positive' / 'negative')
    work as well as integer ones.

    Parameters
    ----------
    X : iterable of str
        Documents; each is tokenised with ``str.split()``.
    y : iterable of hashable
        Class label of each document, aligned with ``X``.

    Returns
    -------
    vocab : set of str
        Every distinct token seen in ``X``.
    class_counts : dict
        Maps each label to the number of documents carrying it.
    word_counts : dict
        Maps each label to a ``{token: occurrences}`` dict for that class.
        A label only appears as a key once at least one document has it.
    """
    vocab = set()
    class_counts = {}
    word_counts = {}

    for text, label in zip(X, y):
        class_counts[label] = class_counts.get(label, 0) + 1
        # setdefault so a class whose documents are all empty still gets an entry
        label_words = word_counts.setdefault(label, {})
        for word in text.split():
            vocab.add(word)
            label_words[word] = label_words.get(word, 0) + 1
    return vocab, class_counts, word_counts

# Train Naive Bayes Model from Scratch
# Encode the sentiment labels as integer class ids (0, 1, ...) so the count
# tables are keyed by integers regardless of how the CSV spells the labels.
# sort=True assigns the ids in sorted label order, which keeps the mapping
# deterministic; sentiment_classes[i] is the original label for id i.
# NOTE: this fits on the full dataframe, not just a training split.
sentiment_codes, sentiment_classes = pd.factorize(df['sentiment'], sort=True)
vocab, class_counts, word_counts = train_naive_bayes(df['clean_text'], sentiment_codes)

print("Naive Bayes from Scratch Model Trained Successfully!")
