In [1]:
 import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import vstack,csr_matrix

In [2]:
# Read the raw CSV, decoding it explicitly as latin-1.
data = pd.read_csv("/content/spam.csv", encoding="latin-1")

# Keep only the label/text columns under descriptive names. `data` itself is
# left untouched so the raw frame can still be inspected in later cells.
df = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Encode the class as an integer (ham -> 0, spam -> 1).
# Series.map turns every value that is missing from the mapping into NaN, so
# unexpected labels are rejected here instead of silently reaching the models.
label_codes = {'ham': 0, 'spam': 1}
raw_labels = df['label']
is_known_label = raw_labels.isin(list(label_codes))
if not is_known_label.all():
    raise ValueError(
        "Unexpected values in column 'v1': "
        f"{raw_labels[~is_known_label].unique().tolist()!r}"
    )
df['label'] = raw_labels.map(label_codes)

In [3]:
# Show the first rows of the raw frame exactly as read from the CSV
# (this is `data`, not the renamed two-column `df`).
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Show the last rows of the raw frame exactly as read from the CSV
# (this is `data`, not the renamed two-column `df`).
data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [12]:
def split_into_two(text):
    """Split a text into two halves on whitespace-separated words.

    The input is converted with ``str`` first, so non-string values are
    accepted. With an odd number of words the second half receives the
    extra word; a single-word text yields an empty first half.

    Args:
        text: The value to split (converted to ``str``).

    Returns:
        A ``(first_half, second_half)`` tuple of space-joined strings.
    """
    tokens = str(text).split()
    cut = len(tokens) // 2
    first_half = " ".join(tokens[:cut])
    second_half = " ".join(tokens[cut:])
    return first_half, second_half
# Build the two "views" of every message: the first and second half of its words.
# The halves are computed in one pass and unpacked into two columns, which avoids
# constructing a separate pd.Series object for every row.
text_halves = [split_into_two(message) for message in df['text']]
df['view1'] = [first_half for first_half, _ in text_halves]
df['view2'] = [second_half for _, second_half in text_halves]


In [13]:
# Fraction of rows, taken from the top of the frame, that start out as labeled.
LABELED_FRACTION = 0.2

# Flag the first LABELED_FRACTION of the rows as labeled; all others are unlabeled.
# The mask is built positionally on purpose: a label slice such as `df.loc[:k]`
# includes row k itself (marking k + 1 rows) and depends on the index values.
n_initially_labeled = int(len(df) * LABELED_FRACTION)
df['is_labeled'] = np.arange(len(df)) < n_initially_labeled

In [15]:
# Learn a single bag-of-words vocabulary from both halves of every message,
# so the two views share one feature space.
vectorizer = CountVectorizer()
all_view_text = pd.concat([df['view1'], df['view2']])
vectorizer.fit(all_view_text)

# Partition the rows into the labeled seed set and the unlabeled pool.
labeled_rows = df[df.is_labeled]
unlabeled_rows = df[~df.is_labeled]

# Labeled seed set: one count matrix per view plus the known labels.
y_labeled = labeled_rows['label'].values
X1_labeled = vectorizer.transform(labeled_rows['view1'])
X2_labeled = vectorizer.transform(labeled_rows['view2'])

# Unlabeled pool: one count matrix per view, rows in the same order as `unlabeled_rows`.
X1_unlabeled = vectorizer.transform(unlabeled_rows['view1'])
X2_unlabeled = vectorizer.transform(unlabeled_rows['view2'])

# One classifier per view (model1 <- view1, model2 <- view2).
model1, model2 = MultinomialNB(), MultinomialNB()


In [22]:
# Co-training settings.
N_ROUNDS = 3                # maximum number of pseudo-labeling rounds
CONFIDENCE_THRESHOLD = 0.9  # top-class probability each view must exceed

for round_num in range(N_ROUNDS):
    print(f"\n Round {round_num + 1}")

    # Train one classifier per view on everything labeled so far
    # (the seed labels plus pseudo-labels accepted in earlier rounds).
    model1.fit(X1_labeled, y_labeled)
    model2.fit(X2_labeled, y_labeled)

    # Nothing left to pseudo-label: stop before predict_proba receives an empty matrix.
    if X1_unlabeled.shape[0] == 0:
        print(" No unlabeled samples left. Breaking.")
        break

    # Row i of the unlabeled matrices is expected to be the i-th df row that is still
    # flagged unlabeled. Running cells out of order can break that, so verify it.
    unlabeled_indices_in_df = df.index[~df['is_labeled'].to_numpy()]
    if len(unlabeled_indices_in_df) != X1_unlabeled.shape[0]:
        raise RuntimeError(
            "df['is_labeled'] is out of sync with the unlabeled feature matrices; "
            "re-run the feature-extraction cell before this one."
        )

    probs1 = model1.predict_proba(X1_unlabeled)
    probs2 = model2.predict_proba(X2_unlabeled)

    # Accept a sample only when both views are confident AND pick the same class.
    confident_mask = (
        (probs1.max(axis=1) > CONFIDENCE_THRESHOLD)
        & (probs2.max(axis=1) > CONFIDENCE_THRESHOLD)
        & (probs1.argmax(axis=1) == probs2.argmax(axis=1))
    )
    confident_indexes = np.flatnonzero(confident_mask)

    if confident_indexes.size == 0:
        print(" No confident samples this round. Breaking.")
        break

    # Both views agree on these rows, so model1's prediction serves as the pseudo-label.
    X1_new = X1_unlabeled[confident_indexes]
    X2_new = X2_unlabeled[confident_indexes]
    y_new = model1.predict(X1_new)

    # Translate positions in the unlabeled matrices back to df index labels.
    newly_labeled_df_indices = unlabeled_indices_in_df[confident_indexes]

    # Move the accepted samples from the unlabeled pool into the labeled set.
    X1_labeled = vstack([X1_labeled, X1_new])
    X2_labeled = vstack([X2_labeled, X2_new])
    y_labeled = np.concatenate([y_labeled, y_new])

    keep_mask = ~confident_mask
    X1_unlabeled = X1_unlabeled[keep_mask]
    X2_unlabeled = X2_unlabeled[keep_mask]

    df.loc[newly_labeled_df_indices, 'is_labeled'] = True
else:
    # Every round ran without an early break, so the pseudo-labels added in the
    # final round have not been trained on yet. Refit so that model1/model2
    # reflect the complete labeled set before they are used for prediction.
    model1.fit(X1_labeled, y_labeled)
    model2.fit(X2_labeled, y_labeled)


 Round 1

 Round 2

 Round 3


In [26]:

# Classify one unseen text with both view-specific models.
new_headline = "Technology drives future"

# model1 was trained on first halves and model2 on second halves, so the new
# text is split the same way and each half is vectorized for its own model.
v1, v2 = split_into_two(new_headline)
x1, x2 = vectorizer.transform([v1]), vectorizer.transform([v2])

# predict returns an array with one entry per input row; take the single label.
p1, p2 = model1.predict(x1)[0], model2.predict(x2)[0]

print("\n Final Prediction:")
print("Model 1 says:", p1)
print("Model 2 says:", p2)


 Final Prediction:
Model 1 says: 0
Model 2 says: 0
