In [None]:
# Task 01

'''
The emails dataset was loaded and the text was cleaned by converting to lowercase and removing non-alphabetic characters. 
The cleaned text was transformed into TF-IDF features using TfidfVectorizer. 
The data was split into training and testing sets. 
A Multinomial Naive Bayes model was trained on the training data. 
The model was then used to predict spam or ham on the test set, and its performance was evaluated using accuracy, classification report, and confusion matrix.
'''


import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


# Load the emails dataset from the working directory.
df = pd.read_csv('emails.csv')

# A bare expression is only rendered when it is the LAST statement of a cell,
# so the preview and the column listing are printed explicitly; otherwise
# their values are computed and silently discarded.
print(df.head())
print(df.columns)


def preprocess_text(text):
    """Normalize one piece of text before it is vectorized.

    The text is lowercased and every character that is not an ASCII letter
    (a-z) or whitespace is deleted, so digits, punctuation and accented
    letters are dropped.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or None) is treated as
        empty text.

    Returns
    -------
    str
        The cleaned text, or '' when the input is not a string.
    """
    # Missing values have no .lower(); without this guard a single empty
    # cell would raise AttributeError inside Series.apply.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Runs after lowercasing, so originally-uppercase letters survive.
    text = re.sub(r'[^a-z\s]', '', text)
    return text


# Clean every email (lowercase, letters and whitespace only).
df['text'] = df['text'].apply(preprocess_text)

print(df['text'].head())

# Features are the cleaned email text; labels come from the 'spam' column
# (1 for spam, 0 for ham).
X = df['text']
y = df['spam']

# Split the raw text BEFORE vectorizing so the held-out emails cannot
# influence the vocabulary or the IDF weights (avoids train/test leakage).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary and weights from the training emails only,
# then apply that same fitted transform to the test emails.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Fit Multinomial Naive Bayes on the training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the held-out emails and report the evaluation metrics.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# Task 02

'''
We loaded the Amazon reviews dataset and removed rows with missing text. 
Each review's rating was mapped to a sentiment label: positive (1), neutral (0), or negative (-1).
The text and sentiment labels were selected, and the data was split into training and testing sets. 
The review text was converted into numerical features using CountVectorizer with English stopwords removed. 
A Multinomial Naive Bayes model was trained on the vectorized training data. 
The model then predicted sentiments for the test set, and its performance was evaluated using accuracy and a classification report. 
Finally, an example review was classified using the trained model.
'''


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Load the Amazon reviews dataset from the working directory.
amazon_reviews = pd.read_csv('amazon_reviews.csv')

# Drop rows that are missing either the review text or the rating. A missing
# rating (NaN) compares False against every threshold, so it would otherwise
# be silently labelled as negative when ratings are mapped to sentiments.
# .copy() gives an independent frame so adding columns later does not touch
# (or warn about) the raw data.
cleaned_reviews = amazon_reviews.dropna(subset=['reviewText', 'overall']).copy()

def map_sentiment(rating):
    """Convert a numeric star rating into a sentiment label.

    Parameters
    ----------
    rating : int or float
        The review's rating.

    Returns
    -------
    int
        1 (positive) when rating >= 4, 0 (neutral) when 3 <= rating < 4,
        and -1 (negative) otherwise.

    Notes
    -----
    A NaN rating fails every comparison and therefore falls through to -1;
    drop rows with missing ratings before applying this function.
    """
    if rating >= 4:
        return 1  # Positive
    # Range check rather than `== 3`, so fractional ratings such as 3.5 are
    # neutral instead of falling through to negative. Whole-star ratings
    # map exactly as before.
    elif rating >= 3:
        return 0  # Neutral
    else:
        return -1  # Negative

# Attach the numeric sentiment label derived from each rating.
cleaned_reviews['sentiment'] = cleaned_reviews['overall'].apply(map_sentiment)

# Only the review text and its label are needed from here on.
final_reviews = cleaned_reviews.loc[:, ['reviewText', 'sentiment']]

X, y = final_reviews['reviewText'], final_reviews['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training reviews
# only and then reused unchanged for the test reviews.
vectorizer = CountVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit the Naive Bayes classifier on the vectorized training reviews.
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)

# Predict sentiment for the held-out reviews.
y_pred = nb_model.predict(X_test_vectorized)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

# Classify one hand-written review with the same fitted vectorizer and model.
example_review = ["This product is amazing, I loved it!"]
example_vectorized = vectorizer.transform(example_review)
example_prediction = nb_model.predict(example_vectorized)
print("\nExample Prediction:", example_prediction)