In [1]:
import pandas as pd

# Read the NYT dataset from its Parquet file into a DataFrame.
# The pyarrow backend is used here; 'fastparquet' is an alternative engine.
file_path = 'nyt_data.parquet'
df = pd.read_parquet(path=file_path, engine='pyarrow')

# Preview the first five rows to sanity-check the columns that were loaded
print(df.head(n=5))


   year                                              title  \
0  1920  At last the Federal Reserve Board has issued r...   
1  1920                            WILL TEST DOOR SERVICE.   
2  1920                    Sanction for Chinese Contracts.   
3  1920                            LEADS FRAZIER BY 4,496.   
4  1920  CHICAGO, April 30.--With 300 suspicious charac...   

                                             excerpt  
0                                                     
1  Service Board to Further Examine I.R.T. Safety...  
2                                                     
3  Langer's Margin Falls in North Dakota--Gronna ...  
4  Federal Agents and Police Round-- up Suspiciou...  


In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple
print(tuple(df.shape))


(17370913, 3)


In [3]:
# Row and column counts, reported on separate lines
print(f"Number of rows: {df.shape[0]}", f"Number of columns: {df.shape[1]}", sep="\n")

# Storage dtype of every column
print(df.dtypes)

# Count of distinct values per column (missing values are not counted)
print(df.nunique(dropna=True))

Number of rows: 17370913
Number of columns: 3
year        int64
title      object
excerpt    object
dtype: object
year            101
title      10718164
excerpt     5212707
dtype: int64


In [4]:

# Boundaries of the 'year' column and the inclusive span between them.
# Note: num_years is the width of the range, not a count of distinct years.
latest_year = df['year'].max()
earliest_year = df['year'].min()
num_years = 1 + (latest_year - earliest_year)

print(f"Earliest Year: {earliest_year}")
print(f"Latest Year: {latest_year}")
print(f"Number of Years: {num_years}")

Earliest Year: 1920
Latest Year: 2020
Number of Years: 101


In [5]:
import pandas as pd
import re
!pip install nltk



from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus (NLTK reports when it is already up to date)
nltk.download('stopwords')

# Materialise the English stop words as a set so that membership checks
# during text cleaning are constant-time
stop_words = {word for word in stopwords.words('english')}

# `stop_words` is consumed by the text-standardization step that follows




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Punctuation that joins two words (ASCII hyphen, slash, Unicode hyphens/dashes).
# It is replaced by a space so that "Dakota--Gronna" yields two tokens instead of
# being fused into "dakotagronna".
_WORD_JOINERS = re.compile(r'[-/\u2010-\u2015]+')

# Anything that is not a lowercase letter, whitespace or an apostrophe is deleted.
# Apostrophes are kept at this stage so contractions can still be matched against
# the stop-word list ("don't") before the apostrophe itself is removed.
_NON_LETTERS = re.compile(r"[^a-z\s']")


def standardize_text(text, stopword_set=None):
    """Normalize a headline/excerpt into lowercase, stop-word-free tokens.

    Steps, in order:
      1. Lowercase the text and map the typographic apostrophe (U+2019) to "'".
      2. Replace word-joining punctuation (hyphens, dashes, slashes) with a space.
      3. Delete every remaining character that is not a-z, whitespace or "'"
         (so dotted acronyms collapse: "I.R.T." -> "irt").
      4. Drop tokens that are stop words, checking both the apostrophe form
         ("don't") and the apostrophe-free form ("dont" is kept only if it is
         not itself a stop word).

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (e.g. ``None`` or NaN)
        yields an empty string.
    stopword_set : collection of str, optional
        Words to remove. When omitted, the module-level ``stop_words`` set is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, with no leading or
        trailing whitespace; ``''`` when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    # Resolve the stop-word collection lazily so the global is only required
    # when the caller did not supply one.
    words_to_drop = stop_words if stopword_set is None else stopword_set

    cleaned = text.lower().replace('\u2019', "'")
    cleaned = _WORD_JOINERS.sub(' ', cleaned)
    cleaned = _NON_LETTERS.sub('', cleaned)

    kept = []
    for token in cleaned.split():
        # Surrounding apostrophes are quotation marks, not part of the word
        token = token.strip("'")
        if token in words_to_drop:
            continue  # matches contractions such as "don't" in their listed form
        token = token.replace("'", '')
        if token and token not in words_to_drop:
            kept.append(token)

    return ' '.join(kept)

# Standardize the 'title' and 'excerpt' columns independently, then combine them
# into a single text column.
df['standardized_title'] = df['title'].apply(standardize_text)
df['standardized_excerpt'] = df['excerpt'].apply(standardize_text)

# Strip after joining: when either side is an empty string the joining space
# would otherwise be left dangling at the start or end of the combined text.
df['standardized_text'] = (
    df['standardized_title'] + ' ' + df['standardized_excerpt']
).str.strip()


In [7]:
# Preview the first five rows, now including the standardized text columns
print(df.head(n=5))

   year                                              title  \
0  1920  At last the Federal Reserve Board has issued r...   
1  1920                            WILL TEST DOOR SERVICE.   
2  1920                    Sanction for Chinese Contracts.   
3  1920                            LEADS FRAZIER BY 4,496.   
4  1920  CHICAGO, April 30.--With 300 suspicious charac...   

                                             excerpt  \
0                                                      
1  Service Board to Further Examine I.R.T. Safety...   
2                                                      
3  Langer's Margin Falls in North Dakota--Gronna ...   
4  Federal Agents and Police Round-- up Suspiciou...   

                                  standardized_title  \
0  last federal reserve board issued rules organi...   
1                                  test door service   
2                         sanction chinese contracts   
3                                      leads frazier   
4  chicago

In [8]:
# Discard the raw text columns and the two per-column standardized intermediates;
# the remaining columns are whatever else the frame holds (here 'year' and
# 'standardized_text', per the preview below).
df = df.drop(columns=['title', 'excerpt', 'standardized_title', 'standardized_excerpt'])


# Preview the slimmed-down frame
print(df.head(n=5))

   year                                  standardized_text
0  1920  last federal reserve board issued rules organi...
1  1920  test door service service board examine irt sa...
2  1920                        sanction chinese contracts 
3  1920  leads frazier langers margin falls north dakot...
4  1920  chicago april suspicious characters including ...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on a reproducible 10% random sample (fixed seed) to keep vectorization fast
sample_df = df.sample(random_state=42, frac=0.1)
print(sample_df)

# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the sample, then transform it
# into a document-term matrix in a single step
tfidf_matrix = vectorizer.fit_transform(sample_df.loc[:, 'standardized_text'])

          year                                  standardized_text
871974    1927  schooner morrissey receives food supplies sail...
3563372   1938  tenanted repairs new housing sped indianapolis...
11774055  1982  edwin h mosler jr former president chief execu...
11142793  1975  reprs procommunist trade unions neighborhood c...
10942590  1974  bill virdons first season manager pirates team...
...        ...                                                ...
3558555   1938  wash suburban sanitary dist md financing activ...
9829470   1966  british state mind insists upon rejection feel...
11486971  1978  article popularity rare tiffany glass notes me...
6323141   1950                                   equitable lends 
4934695   1944  takeout double used nearly thirty years strong...

[1737091 rows x 2 columns]


In [11]:
# Destination for the processed dataset
output_path = 'processed_dataset2.parquet'

# Persist the full processed frame (`df`, not the 10% sample) as Parquet
df.to_parquet(path=output_path)

print(f"DataFrame saved successfully to {output_path}")

DataFrame saved successfully to processed_dataset2.parquet
