In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer model first, then the stopword lists (same order as the log below).
# NOTE(review): newer NLTK releases appear to load tokenizer data from 'punkt_tab'
# rather than 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# English stopwords as a set, used for membership tests during preprocessing
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Toy corpus: one (sentence, class label) pair per row
data = pd.DataFrame(
    [
        ("The sky is blue", 0),
        ("The sun is bright", 1),
        ("The sun in the sky is bright", 1),
        ("We can see the bright sun", 1),
    ],
    columns=['text', 'label'],  # 'label' holds the classification targets
)

data


Unnamed: 0,text,label
0,The sky is blue,0
1,The sun is bright,1
2,The sun in the sky is bright,1
3,We can see the bright sun,1


In [3]:
# Text Preprocessing
def preprocess_text(text):
    """Lowercase, tokenize and filter a single document.

    Parameters
    ----------
    text : str
        Raw document. Anything that is not a ``str`` (for example ``None``
        or the float ``NaN`` pandas uses for missing cells) is treated as
        an empty document instead of raising ``AttributeError``.

    Returns
    -------
    str
        The alphabetic, non-stopword tokens joined by single spaces; ``''``
        when no token survives or the input is not a string.
    """
    # Missing / non-text cells have no .lower(); map them to an empty document.
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())  # Tokenize the lowercased text
    # Keep purely alphabetic tokens that are not in the English stopword set
    # (this drops punctuation and numeric tokens as well).
    filtered_words = [
        word for word in tokens
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(filtered_words)

# Store a normalized copy of every sentence next to the raw text
data['cleaned_text'] = data['text'].map(preprocess_text)

# TF-IDF Vectorization
# The vocabulary and IDF weights are learned from every row of the cleaned corpus,
# and the same rows are encoded in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']  # Classification labels

In [4]:
# K-Means Clustering
# Group the TF-IDF rows into two clusters; random_state is fixed so reruns use the same seed.
kmeans = KMeans(random_state=42, n_clusters=2)
clusters = kmeans.fit_predict(X)

# Record each sentence's cluster id on the frame
data['cluster'] = clusters

print(data.loc[:, ['text', 'cluster']])


                           text  cluster
0               The sky is blue        1
1             The sun is bright        0
2  The sun in the sky is bright        1
3     We can see the bright sun        0


In [5]:
# Train-Test Split
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
# NOTE(review): X comes from a vectorizer that was fitted on all rows before this
# split, so the held-out rows already influenced the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Classification
# Build a multinomial Naive Bayes model and fit it on the training rows
# (fit() returns the estimator, so construction and training are chained).
classifier = MultinomialNB().fit(X_train, y_train)

# Prediction
# Predict labels for the held-out rows and show them as the cell result
y_pred = classifier.predict(X_test)
y_pred

array([1])

In [7]:
# Accuracy
# Fraction of held-out predictions that match the true labels, shown as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
# NOTE(review): no labels= argument is passed, so the matrix only has rows/columns
# for classes that occur in y_test or y_pred -- confirm this is intended.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Classification Report
# Per-class precision / recall / F1 plus the averaged rows
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 100.00%
Confusion Matrix:
[[1]]
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



