In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full split from disk.
data = pd.read_csv('test.csv')

# Stratification key: the length of each article (via len), bucketed into
# ten quantile bins that are labelled with plain integer codes.
data['article_length'] = data['article'].map(len)
data['article_length_bin'] = pd.qcut(data['article_length'], q=10, labels=False)

# Keep a 6000-row sample whose bin proportions follow the full data;
# the remainder returned by train_test_split is not needed.
reduced_data = train_test_split(
    data,
    train_size=6000,
    stratify=data['article_length_bin'],
    random_state=42,
)[0]

# Print per-bin counts for the full data first, then for the sample, so the
# two distributions can be compared by eye.
for frame in (data, reduced_data):
    print(frame['article_length_bin'].value_counts())

# Persist the sample (including the two helper columns) without the index.
reduced_data.to_csv('reduced_test.csv', index=False)


article_length_bin
1    1150
6    1150
8    1150
0    1149
3    1149
4    1149
5    1149
2    1149
9    1148
7    1147
Name: count, dtype: int64
article_length_bin
1    601
6    601
4    600
8    600
3    600
5    600
0    600
2    600
9    599
7    599
Name: count, dtype: int64


In [2]:
import pandas as pd

# Reload the reduced sample from 'reduced_test.csv' as a sanity check on
# what was actually written to disk.
reduced_data = pd.read_csv('reduced_test.csv')

# Number of rows in the reloaded frame.
total_records = len(reduced_data)

# The message names the file that is really read (reduced_test.csv); it
# previously referred to a "train.csv" file that this notebook never touches.
print(f"Total number of records in the reduced_test.csv file: {total_records}")


Total number of records in the reduced train.csv file: 6000


In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm  # For progress bar

# Uncomment the downloads if needed
# nltk.download('punkt')
# nltk.download('stopwords')

def clean_text(text):
    """Return ``text`` with special characters removed, in lowercase.

    Every character that is not an ASCII letter, a digit, or whitespace is
    deleted (not replaced by a space), and the result is lowercased.
    """
    # Strip first, lowercase second: characters outside a-z/A-Z/0-9 are
    # dropped before any case folding happens.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return alnum_only.lower()

def tokenize(text):
    """Split ``text`` into a sequence of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Lazily populated cache of the English stop-word set. Built on the first
# call to remove_stopwords() so that defining the function never touches the
# NLTK corpus, and reused afterwards so the list is not reloaded per article.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in original order.

    Args:
        tokens: Iterable of token strings. Matching is exact, so callers
            should pass lowercased tokens.

    Returns:
        A new list containing only the tokens absent from NLTK's English
        stop-word list.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load the word list once; a frozenset gives O(1) membership tests.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [token for token in tokens if token not in _ENGLISH_STOPWORDS]

def preprocess_text(text):
    """Clean, tokenize and stop-word-filter ``text``.

    The text goes through ``clean_text``, ``tokenize`` and
    ``remove_stopwords`` in that order; the surviving tokens are returned as
    one string joined by single spaces.
    """
    kept_tokens = remove_stopwords(tokenize(clean_text(text)))
    return ' '.join(kept_tokens)

def main():
    """Dump the raw and the preprocessed articles to two text files.

    Reads 'reduced_test.csv', writes every value of its 'article' column
    unchanged to 'input1.txt', then writes the ``preprocess_text`` result for
    every article to 'output2.txt'. In both files each entry is followed by
    a blank line. Progress is reported with tqdm bars.
    """
    # Load the dataset
    data = pd.read_csv('reduced_test.csv')

    # Output file paths
    input_file_path = 'input1.txt'
    output_file_path = 'output2.txt'

    # Save the raw articles, one per entry, separated by a blank line
    with open(input_file_path, 'w', encoding='utf-8') as input_file:
        for article in tqdm(data['article'], desc="Writing input data"):
            input_file.write(article + '\n\n')

    # Preprocess each article and save the cleaned text in the same layout
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for article in tqdm(data['article'], desc="Processing and writing cleaned data"):
            cleaned_text = preprocess_text(article)
            output_file.write(cleaned_text + '\n\n')

    # Report the paths that were actually written; the message used to name
    # 'input.txt' / 'output.txt', which are not the files produced above.
    print(
        f"Processing completed. Input text saved to '{input_file_path}' "
        f"and cleaned text saved to '{output_file_path}'."
    )

# Entry point: run the pipeline only when this code is executed as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Writing input data: 100%|███████████████████████████████████████████████████████| 6000/6000 [00:00<00:00, 39822.49it/s]
Processing and writing cleaned data:  16%|██████▋                                  | 971/6000 [00:04<00:20, 247.03it/s]