In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Load the data
data2_chemin = 'data/spam.csv'
data2 = pd.read_csv(data2_chemin)

# Check for missing values
missing_values = data2.isnull().sum()
print("Missing Values:\n", missing_values)

# Class Imbalance Handling
class_weights = class_weight.compute_class_weight('balanced', np.unique(data2['Category']), data2['Category'])
print("Class Weights:", class_weights)


# Text Preprocessing
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Tokenization Techniques
def tokenize_text(text):
    return nltk.word_tokenize(text)

# Stopword Removal
def remove_stopwords(text):
    return [word for word in text if word.lower() not in stop_words]

# Stemming vs. Lemmatization
def stem_text(text):
    return [stemmer.stem(word) for word in text]

def lemmatize_text(text):
    return [lemmatizer.lemmatize(word) for word in text]

# Feature Engineering
# Additional Feature Creation
data2['message_length'] = data2['Message'].apply(len)

# Prepare features and target
X = data2[['Message', 'message_length']]
y = data2['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Feature Extraction
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X['Message'])

# Combine additional features with TF-IDF features
X_combined = np.hstack((X_tfidf.toarray(), X[['message_length']].values))

# Naive Bayes Model Training and Evaluation using K-Fold Cross Validation
model = MultinomialNB()

pipeline = Pipeline([
    ('feature_selection', SelectKBest(chi2, k=1000)),
    ('clf', model)
])

# K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_combined, y, cv=kf, scoring='accuracy')

print("Cross Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Missing Values:
 Category    0
Message     0
dtype: int64


TypeError: compute_class_weight() takes 1 positional argument but 3 were given