1. Load the dataset and preprocess the reviews.
a. Convert all text to lowercase.
b. Remove non-alphabetic characters (punctuation).
c. Tokenize the reviews and remove common stopwords.
d. Apply stemming to reduce words to their root form.

In [7]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load and preprocess IMDB dataset
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabet characters
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    return ' '.join(text)

df = pd.read_csv('/content/drive/MyDrive/Copy of IMDB Dataset.csv')
df = df.sample(1000)
df['cleaned_text'] = df['review'].apply(preprocess_text)

# Convert text to numerical format
cv = CountVectorizer(max_features=3000)
X = cv.fit_transform(df['cleaned_text']).toarray()
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
print("Naive Bayes Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

Naive Bayes Model Performance:
Accuracy: 0.755
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.82      0.73        82
           1       0.85      0.71      0.77       118

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.75       200
weighted avg       0.77      0.76      0.76       200

Confusion Matrix:
 [[67 15]
 [34 84]]
ROC-AUC Score: 0.7644687887556842


In [8]:
## Task 2: Feature Selection with RFE (Breast Cancer Dataset)
# # Load dataset
df_bc = pd.read_csv('/content/drive/MyDrive/Concepts and Tech. of AI/DataSet/data 2.csv')
X = df_bc.iloc[:, 2:].values  # Assuming features start from the 3rd column
y = df_bc.iloc[:, 1].values  # Assuming the second column contains labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply RFE with Logistic Regression
model = LogisticRegression(max_iter=200)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X_train, y_train)

# Select features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train model on selected features
model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)

# Evaluation
print("Feature Selection with RFE:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfe))
print("Classification Report:\n", classification_report(y_test, y_pred_rfe))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rfe))


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values