In [None]:
import string
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Download the NLTK resources the preprocessing step relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent NLTK
#     releases load 'punkt_tab' instead of 'punkt', so both are fetched to keep
#     the notebook working across NLTK versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the corpus behind WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)



In [None]:
# Individual punctuation characters; a frozenset gives O(1) membership checks.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


@lru_cache(maxsize=None)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once, then reuse them.

    Construction is deferred to the first call so the NLTK corpora only have to
    be available when text is actually processed, not when this cell is defined.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def _is_punctuation(token):
    """Return True when every character of ``token`` is in ``string.punctuation``."""
    return all(char in _PUNCTUATION_CHARS for char in token)


# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens made up
    entirely of punctuation characters and English stop words are removed; the
    remaining tokens are lemmatized with ``WordNetLemmatizer`` and joined with
    single spaces.

    Args:
        text: The raw text to clean. Any non-string value (for example ``None``
            or the float NaN pandas uses for missing cells) is treated as
            empty text.

    Returns:
        str: The processed text, or ``''`` when no tokens remain.
    """
    # Missing cells arrive as NaN/None and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _text_resources()

    # Per-character punctuation check: the tokenizer can emit multi-character
    # punctuation tokens (e.g. "``", "''", "...") that a substring test against
    # string.punctuation would let through.
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if not _is_punctuation(token) and token not in stop_words
    ]

    return ' '.join(kept_tokens)



In [None]:
# Read the training CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='PubMed_200k_RCT_Numbers_replaced_with__train.csv',
)

In [None]:
# Clean every entry of the 'text' column and store the result in a new
# 'processed_text' column alongside the original data
df['processed_text'] = df['text'].map(preprocess_text)

In [None]:
# Columns not needed in the exported file: the raw text (superseded by
# 'processed_text') plus the index/position columns.
columns_to_drop = ['Unnamed: 0', 'text', 'line_number', 'total_lines']

# Rebind to the returned frame instead of mutating in place; a missing column
# still raises KeyError, exactly as before.
df = df.drop(columns=columns_to_drop)

In [None]:
# Persist the cleaned frame to disk, leaving the row index out of the file.
# NOTE(review): rows whose processed text is an empty string are written as
# empty fields, which pandas.read_csv parses back as NaN by default — confirm
# downstream code handles that.
df.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)