In [None]:
import pandas as pd
import re
import string

# List of stop words
stopwords = set("""
i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it
its itself they them their theirs themselves what which who whom this that these those am is are was were be been
being have has had having do does did doing a an the and but if or because as until while of at by for with about
against between into through during before after above below to from up down in out on off over under again further
then once here there when where why how all any both each few more most other some such no nor not only own same so
than too very s t can will just don should now
""".split())


def custom_tokenizer(text):
    """Turn raw text into a list of lowercase, punctuation-free, non-stopword tokens.

    Processing steps, in order:
      1. Missing input (``None`` or a float NaN) yields an empty list.
      2. The text is converted with ``str()`` and lowercased.
      3. Every run of three or more identical characters is collapsed to a
         single character followed by a standalone ``<REPEAT:n>`` token, where
         ``n`` is the length of the run.
      4. The text is split on whitespace; punctuation is removed from each
         piece except the ``<REPEAT:n>`` markers.
      5. Empty pieces and stopwords are dropped.

    Args:
        text: Any value; non-missing values are converted with ``str()``.

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    # Without this guard str() would turn missing values into the literal
    # tokens "none" / "nan" (``nan != nan`` is the NaN test).
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Normalize repeated characters
    def normalize_repeat(match):
        char = match.group(1)
        repeat_len = len(match.group(0))
        # Spaces on both sides keep the marker a token of its own, so it can
        # never fuse with the characters that follow the run.
        return f"{char} <REPEAT:{repeat_len}> "

    # Whitespace and digits are excluded: runs of spaces are just spacing and
    # runs of digits are ordinary numbers ("1,000"), not elongation.
    text = re.sub(r"([^\s\d])\1{2,}", normalize_repeat, text)

    strip_punctuation = str.maketrans('', '', string.punctuation)
    # The marker is uppercase while the text was lowercased above, so only the
    # markers inserted by normalize_repeat can match this pattern.
    repeat_marker = re.compile(r"<REPEAT:\d+>")

    tokens = []
    for piece in text.split():
        if repeat_marker.fullmatch(piece):
            # Keep the marker intact; stripping punctuation would mangle it
            # into "REPEATn".
            tokens.append(piece)
            continue
        word = piece.translate(strip_punctuation)
        # Pieces made only of punctuation become empty and are skipped.
        if word and word not in stopwords:
            tokens.append(word)

    return tokens

# --- POS Tagger ---
def enhanced_pos_tagger(tokens):
    """Assign a coarse part-of-speech label to each token using fixed rules.

    A token is first looked up in two small closed word lists (prepositions,
    then conjunctions). If it is in neither, its ending is compared against
    suffix groups in a fixed priority order: VERB, ADVERB, ADJECTIVE, NOUN.
    Tokens matching nothing are labelled "OTHER".

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[tuple[str, str]]: ``(token, tag)`` pairs in input order.
    """
    # Closed word classes, checked before any suffix rule.
    word_lists = (
        ("PREPOSITION", {'in', 'on', 'at', 'by', 'to', 'with', 'from', 'about', 'into', 'over', 'under'}),
        ("CONJUNCTION", {'and', 'but', 'or', 'so', 'because', 'although', 'if', 'while'}),
    )
    # Suffix groups in priority order; the first group that matches wins.
    suffix_rules = (
        ("VERB", ("ing", "ed")),
        ("ADVERB", ("ly",)),
        ("ADJECTIVE", ("ful", "ous", "able")),
        ("NOUN", ("tion", "ness", "ment")),
    )

    def classify(token):
        for label, members in word_lists:
            if token in members:
                return label
        for label, endings in suffix_rules:
            # str.endswith accepts a tuple and is true if any ending matches.
            if token.endswith(endings):
                return label
        return "OTHER"

    return [(token, classify(token)) for token in tokens]

# --- Lemmatizer ---
def lemmatizer(pos_tagged_tokens):
    """Reduce POS-tagged tokens to rough lemmas with suffix-stripping rules.

    Tokens tagged "PREPOSITION" or "CONJUNCTION" are dropped. A token found in
    the ``custom_lemmas`` table is replaced by its table entry. Otherwise the
    first matching suffix for the token's tag is removed:

      * VERB:      "ing", "ed"
      * ADJECTIVE: "ful", "ous", "able"
      * ADVERB:    "ly"

    For "ful" and "ly", a stem ending in "i" gets its "y" back
    ("plentiful" -> "plenty", "happily" -> "happy"); other stems are left
    as they are ("wonderful" -> "wonder", "quickly" -> "quick"). A suffix is
    only removed when at least two characters remain, so short words such as
    "red" or "ing" are kept whole instead of collapsing to "r" or "".

    This is a heuristic, not a dictionary lemmatizer: it does not undo
    consonant doubling or restore a dropped final "e".

    Args:
        pos_tagged_tokens: Iterable of ``(word, tag)`` pairs.

    Returns:
        list[str]: One lemma per kept token, in input order.
    """
    lemmas = []
    custom_lemmas = {
        "beautiful": "beauty", "hopeful": "hope", "joyful": "joy", "useful": "use", "careful": "care"
    }
    # Suffixes tried, in order, for each tag; only the first match is used.
    suffixes_by_tag = {
        "VERB": ("ing", "ed"),
        "ADJECTIVE": ("ful", "ous", "able"),
        "ADVERB": ("ly",),
    }
    # Suffixes whose attachment turns a final "y" of the base word into "i".
    y_restoring_suffixes = {"ful", "ly"}
    # Shortest stem accepted after stripping; guards against "" and one-letter stems.
    min_stem_len = 2

    def strip_suffix(word, tag):
        for suffix in suffixes_by_tag.get(tag, ()):
            if not word.endswith(suffix):
                continue
            stem = word[:-len(suffix)]
            if len(stem) < min_stem_len:
                return word
            if suffix in y_restoring_suffixes and stem.endswith("i"):
                stem = stem[:-1] + "y"
            return stem
        return word

    for word, tag in pos_tagged_tokens:
        if tag in ("PREPOSITION", "CONJUNCTION"):
            continue

        if word in custom_lemmas:
            lemmas.append(custom_lemmas[word])
        else:
            lemmas.append(strip_suffix(word, tag))
    return lemmas


In [None]:
# Load the input CSV into a DataFrame (the path is hard-coded; point it at your own file)
df = pd.read_csv("/content/merged_news.csv")  # Replace with your CSV filename

# Per-row pipeline: tokenize -> POS-tag -> lemmatize -> join
def process_text(text):
    """Return the lemmas of ``text`` as a single space-separated string.

    The value is passed through ``custom_tokenizer``, then
    ``enhanced_pos_tagger``, then ``lemmatizer``; the resulting lemmas are
    joined with single spaces.
    """
    return " ".join(lemmatizer(enhanced_pos_tagger(custom_tokenizer(text))))

# Apply the per-row pipeline to the 'text' column
df["lemmas"] = df["text"].apply(process_text)

# Save result; the confirmation message is built from the same path that is
# written, so it cannot name a different file than the one on disk.
output_path = "/content/merged_news_lemma.csv"
df.to_csv(output_path, index=False)
print(f"✅ Done! Saved as '{output_path}'")


✅ Done! Saved as 'processed_output.csv'


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.sparse import lil_matrix, diags

# Step 1: Load CSV and tokenize lemmas
df = pd.read_csv("/content/merged_news_lemma.csv")
# An empty lemma string is read back from CSV as NaN; fill it first so the row
# becomes an empty document instead of the literal token "nan".
df["lemmas"] = df["lemmas"].fillna("").astype(str).apply(lambda x: x.split())

# Step 2: Build vocabulary and mappings
vocab = sorted(set(word for doc in df["lemmas"] for word in doc))
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}

# Step 3: Compute sparse TF matrix (and count document frequencies in the same pass)
num_docs = len(df)
vocab_size = len(vocab)
tf_matrix = lil_matrix((num_docs, vocab_size))
doc_freq = Counter()  # word -> number of documents containing it

for i, doc in enumerate(df["lemmas"]):
    word_counts = Counter(doc)
    total_words = len(doc)
    # Empty documents have no items, so the division below never sees 0.
    for word, count in word_counts.items():
        # Every word is in word_to_index because vocab was built from these documents.
        tf_matrix[i, word_to_index[word]] = count / total_words
    # Each distinct word counts once per document.
    doc_freq.update(word_counts.keys())

# Step 4: Compute IDF vector
# Built from the single-pass counts above instead of rescanning every document
# for every vocabulary word.
df_vector = np.array([doc_freq[word] for word in vocab], dtype=float)

idf_vector = np.log((num_docs + 1) / (df_vector + 1)) + 1  # smoothed IDF
idf_diag = diags(idf_vector)

# Step 5: Compute TF-IDF matrix
tfidf_matrix = tf_matrix @ idf_diag  # Sparse matrix multiplication

# Step 6: Convert to dense DataFrame (only if small enough: this allocates
# num_docs x vocab_size floats)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab)
result_df = pd.concat([df[["title", "label"]].reset_index(drop=True), tfidf_df], axis=1)

# Step 7: Save to CSV; the message reports the path actually written
output_path = r"/content/tf-idf-merged_news.csv"
result_df.to_csv(output_path, index=False)
print(f"TF-IDF features saved in '{output_path}'")


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load CSV and prepare data
df = pd.read_csv("/content/merged_news_lemma.csv")

# Ensure lemmas column is a tokenized list; re-join for the vectorizer.
# An empty lemma string is read back from CSV as NaN; fill it first so the row
# becomes an empty document instead of contributing the spurious term "nan".
df["lemmas"] = df["lemmas"].fillna("").astype(str).apply(lambda x: x.split())
df["joined_lemmas"] = df["lemmas"].apply(lambda x: " ".join(x))

# Step 2: TF-IDF Vectorization with max_features to limit memory usage
vectorizer = TfidfVectorizer(
    tokenizer=str.split,  # Already tokenized: split on whitespace only
    token_pattern=None,   # Unused when a tokenizer is given; None avoids the unused-parameter warning
    lowercase=False,
    norm='l2',
    smooth_idf=True,
    sublinear_tf=False,
    max_features=10000  # Limit to top 10,000 words
)

tfidf_matrix = vectorizer.fit_transform(df["joined_lemmas"])

# Step 3: Convert to DataFrame (dense: one column per kept term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Step 4: Combine with title and label
result_df = pd.concat([df[["title", "label"]].reset_index(drop=True), tfidf_df], axis=1)

# Step 5: Save to CSV
result_df.to_csv("/content/tf-idf-merged_news.csv", index=False)
print(" TF-IDF features saved using Scikit-learn (limited to 10,000 words)")




✅ TF-IDF features saved using Scikit-learn (limited to 10,000 words)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Read the TF-IDF feature table from CSV
df = pd.read_csv("/content/tf-idf-merged_news.csv")

# Step 2: Separate the target column from the feature columns
y = df['label']
X = df.drop(columns=['title', 'label'])

# Step 3: Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Optional: scale features. with_mean=False means the columns are only
# divided by their standard deviation, not centred. The statistics are
# learned from the training split only and then applied to both splits.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Fit a class-balanced logistic regression on the scaled training data
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
lr.fit(X_train_scaled, y_train)

# Step 5: Predict on the held-out split and report the metrics
y_pred = lr.predict(X_test_scaled)

print("🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred, digits=4))
print("🔹 F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("🔹 Accuracy:", accuracy_score(y_test, y_pred))
