In [1]:
# Install required libraries
!pip install scikit-learn
!pip install nltk

# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from google.colab import files

# Download NLTK stopwords
nltk.download('stopwords')

# Upload dataset file
import io

print("Please upload 'train.csv'")
uploaded = files.upload()

# Load the dataset.
# files.upload() returns a {filename: file bytes} mapping. Colab may store a
# re-uploaded file under a different name (e.g. 'train (1).csv') when
# 'train.csv' already exists, so reading the fixed path could silently load a
# stale copy. Parse the bytes that were just uploaded instead, and fall back
# to the file on disk only when nothing was uploaded.
if uploaded:
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    df = pd.read_csv('train.csv')
print(f"Dataset loaded with shape: {df.shape}")
print(df.head())

# Rows without a label cannot be used for supervised training. Filling them
# with '' (as a blanket fillna would) turns the target into a mixed-type
# column, so drop them instead and report how many were removed.
rows_before = len(df)
df = df.dropna(subset=['label'])
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with a missing label")

# Fill missing values in every column except the target
df = df.fillna({column: '' for column in df.columns if column != 'label'})

# Combine title and text
df['content'] = df['title'] + " " + df['text']

def clean_text(text):
    """Normalise raw text for vectorization.

    Lower-cases *text*, replaces every non-word character with a space, then
    collapses each run of whitespace into a single space. Leading and
    trailing spaces are not stripped.
    """
    cleaned = text.lower()
    # Order matters: punctuation becomes spaces first, then runs are squeezed.
    for pattern in (r'\W', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

# Build the English stopword lookup once as a set
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* without the whitespace-separated tokens found in ``stop_words``."""
    kept = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept)


# Apply cleaning, then remove stopwords
df['content'] = df['content'].apply(clean_text).apply(remove_stopwords)

print("\nSample preprocessed data:")
print(df[['content', 'label']].head())

# Split dataset BEFORE vectorizing: fitting TF-IDF on all rows would let the
# held-out rows shape the vocabulary and IDF weights (data leakage) and
# inflate the reported test metrics.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['content'], y, test_size=0.2, random_state=42
)

# Vectorization using TF-IDF: learn vocabulary/IDF from the training rows
# only, then apply that fitted transform to the test rows.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-dataset matrix, kept only so code that still refers to `X` keeps
# working; it is produced by the train-fitted vectorizer.
X = vectorizer.transform(df['content'])

def _fit_and_report(heading, model):
    """Fit *model* on the training split, print its test-split metrics, and
    return the predictions made on ``X_test``."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(heading)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    return predictions


# Train Naive Bayes model
model_nb = MultinomialNB()
y_pred_nb = _fit_and_report("\nNaive Bayes Model Evaluation:", model_nb)

# Train Logistic Regression model
model_lr = LogisticRegression(max_iter=1000)
y_pred_lr = _fit_and_report("\nLogistic Regression Model Evaluation:", model_lr)

print("\nAll done! You can now test your models or modify the pipeline.")


Please upload 'train.csv'


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving train.csv to train.csv
Dataset loaded with shape: (6, 5)
   id                                   title      author  \
0   1        Scientists confirm water on Mars   Dr. Smith   
1   2   Breaking: Aliens have landed on Earth    John Doe   
2   3    New vaccine reduces flu cases by 70%    Jane Roe   
3   4  Government secretly controls your mind   Anonymous   
4   5  Local man wins lottery twice this week  Mike Brown   

                                                text  label  
0  New research confirms the presence of water on...      0  
1  An anonymous source reports that aliens have l...      1  
2  Studies show the new vaccine is highly effective.      0  
3  It's proven that the government uses waves to ...      1  
4  A local man has defied odds and won the lotter...      0  

Sample preprocessed data:
                                             content  label
0  scientists confirm water mars new research con...      0
1  breaking aliens landed earth anonymous source .