In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
from tqdm import tqdm

# Fetch the NLTK data packages needed by the preprocessing below.
# nltk.download reports "already up-to-date" when a package is present.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the raw input table; 'article', 'highlights' and 'id' columns are
# used further down in this cell.
dataset = pd.read_csv('test.csv')

# Module-level normalizers shared by every clean_text call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Translation table that deletes every ASCII punctuation character.
# Built once instead of on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


# Function to clean text
def clean_text(text):
    """Normalize a piece of text into a space-joined string of stemmed tokens.

    Steps, in order: strip ASCII punctuation, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining
    token with the module-level ``lemmatizer``, then stem it with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : str or scalar
        The text to clean. Missing values (``None``, NaN, ``pd.NA``) are
        accepted because pandas yields them for empty CSV cells; any other
        non-string scalar is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    if not isinstance(text, str):
        # An empty CSV cell arrives as NaN (a float), which has no
        # .translate method; treat missing values as empty text.
        if pd.isna(text):
            return ''
        text = str(text)

    # Remove punctuation, then convert to lowercase.
    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # contractions lose their apostrophe first ("don't" -> "dont") and may
    # no longer match the stop-word list - confirm this is intended.
    text = text.translate(_PUNCTUATION_TABLE).lower()

    # Remove stopwords (the set is cached rather than rebuilt per call).
    stop_words = _english_stop_words()
    filtered_words = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatization followed by stemming, applied token by token.
    stemmed_words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in filtered_words]
    return ' '.join(stemmed_words)

# Register progress_apply on pandas objects so each pass shows a progress bar.
tqdm.pandas()

# Run clean_text over each text column and store the result alongside it.
column_targets = {
    'article': 'cleaned_article',
    'highlights': 'cleaned_highlights',
}
for source_column, target_column in column_targets.items():
    dataset[target_column] = dataset[source_column].progress_apply(clean_text)

# Keep only the identifier and the two cleaned text columns.
output_columns = ['id', 'cleaned_article', 'cleaned_highlights']
cleaned_dataset = dataset[output_columns]

# Persist the preprocessed table without the DataFrame index.
cleaned_dataset.to_csv('preprocessed_data.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shett\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shett\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shett\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|█████████████████████████████████████████████████████████████████████████████████████████| 11490/11490 [03:21<00:00, 57.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 11490/11490 [00:22<00:00, 518.42it/s]


In [4]:
from sklearn.model_selection import train_test_split

# Re-read the preprocessed table written by the cleaning cell.
cleaned_dataset = pd.read_csv('preprocessed_data.csv')

# Two-stage split with a fixed seed: first hold out 20% as the test set,
# then take 25% of the remaining 80% (i.e. 20% of the total) for validation,
# leaving 60% for training.
train_split, test_split = train_test_split(
    cleaned_dataset,
    test_size=0.2,
    random_state=42,
)
train_split, validate_split = train_test_split(
    train_split,
    test_size=0.25,
    random_state=42,
)

# Write each split to its own CSV file, omitting the DataFrame index.
for split_frame, csv_name in (
    (train_split, 'train_split.csv'),
    (test_split, 'test_split.csv'),
    (validate_split, 'validate_split.csv'),
):
    split_frame.to_csv(csv_name, index=False)


In [5]:
# Report how many rows the full table and each split contain.
row_count_report = (
    ("Number of rows in original dataset:", cleaned_dataset),
    ("Number of rows in train_split:", train_split),
    ("Number of rows in test_split:", test_split),
    ("Number of rows in validate_split:", validate_split),
)
for report_label, report_frame in row_count_report:
    print(report_label, len(report_frame))


Number of rows in original dataset: 11490
Number of rows in train_split: 6894
Number of rows in test_split: 2298
Number of rows in validate_split: 2298
