In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

In [2]:
# Fetch the NLTK data packages this notebook asks for.
# nltk.download() is invoked once per package, in the order listed.
print("Downloading NLTK resources...")
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)
print("NLTK resources downloaded!")

Downloading NLTK resources...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Omar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Omar\AppData\Roaming\nltk_data...


NLTK resources downloaded!


[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
# Locations of the source CSV and of the preprocessed CSV written in Step 6.
# Raw strings keep the Windows backslashes literal without doubling them.
input_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran.all.1400.csv"
output_file = r"D:\DownLoad\projects\Search Engine\Olivia_Searchengine\datacollection\output\cran_preprocessed.csv"

In [15]:
# Read the collection from `input_file` into `df` and preview the top rows.
print("Loading the CSV file...")
df = pd.read_csv(input_file)
# One print call with a newline separator emits the heading and the preview
# on separate lines, exactly as two consecutive print calls would.
print("First few rows of the data:", df.head(), sep="\n")

Loading the CSV file...
First few rows of the data:
   Doc_NO                                              Title  \
0       1  experimental investigation of the aerodynamics...   
1       2  simple shear flow past a flat plate in an inco...   
2       3  the boundary layer in simple shear flow past a...   
3       4  approximate solutions of the incompressible la...   
4       5  one-dimensional transient heat conduction into...   

                                                 Bib  \
0                         j. ae. scs. 25, 1958, 324.   
1  department of aeronautical engineering, rensse...   
2  department of mathematics, university of manch...   
3                         j. ae. scs. 22, 1955, 728.   
4                         j. ae. scs. 24, 1957, 924.   

                                                Text  
0  experimental investigation of the aerodynamics...  
1  simple shear flow past a flat plate in an inco...  
2  the boundary layer in simple shear flow past a...  
3  app

In [None]:
# Step 1: Make Text lowercase and handle NaN
print("\nStep 1: Making Text lowercase...")
# Read from the "Text" column itself. Reading from "Title" here would
# overwrite every document body with just its title.
# Missing values become empty strings; everything else is coerced to str
# and lowercased. Re-running this cell gives the same result.
df["Text"] = df["Text"].fillna("").astype(str).str.lower()
print("After lowercasing, first 2 rows of Text:")
print(df["Text"].head(2))


Step 1: Making Text lowercase...
After lowercasing, first 2 rows of Text:
0    experimental investigation of the aerodynamics...
1    simple shear flow past a flat plate in an inco...
Name: Text, dtype: object


In [None]:
# Step 2: Split Text into words (tokenize)
print("\nStep 2: Splitting Text into words...")
# Tokenize the "Text" column that Step 1 writes, not the raw "Title" column,
# so the lowercasing and NaN handling from Step 1 are not bypassed.
# A missing value yields an empty token list instead of making re.findall
# raise a TypeError on a non-string.
tokens_list = [
    [] if pd.isna(text) else re.findall(r'\w+', str(text))  # runs of word characters
    for text in df["Text"]
]
df["Tokens"] = tokens_list
print("Tokens for first 2 rows:")
print(df["Tokens"].head(2))


Step 2: Splitting Text into words...
Tokens for first 2 rows:
0    [experimental, investigation, of, the, aerodyn...
1    [simple, shear, flow, past, a, flat, plate, in...
Name: Tokens, dtype: object


In [8]:
# Step 3: Remove stopwords
print("\nStep 3: Removing stopwords...")
# A set gives constant-time membership checks inside the filter below.
stop_words = set(stopwords.words('english'))
# For every row, keep only the tokens that are not in the stopword set,
# preserving their original order.
no_stopwords_list = [
    [token for token in row_tokens if token not in stop_words]
    for row_tokens in df["Tokens"]
]
df["No_Stopwords"] = no_stopwords_list
print("After removing stopwords, first 2 rows:")
print(df["No_Stopwords"].head(2))


Step 3: Removing stopwords...
After removing stopwords, first 2 rows:
0    [experimental, investigation, aerodynamics, wi...
1    [simple, shear, flow, past, flat, plate, incom...
Name: No_Stopwords, dtype: object


In [9]:
# Step 4: Compare stemmers
print("\nStep 4: Comparing different stemmers...")
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Stem every stopword-free token list once per stemmer. Each result is a
# list of lists, parallel to the rows of df["No_Stopwords"].
porter_stemmed_list = [
    [porter.stem(token) for token in row_tokens]
    for row_tokens in df["No_Stopwords"]
]
lancaster_stemmed_list = [
    [lancaster.stem(token) for token in row_tokens]
    for row_tokens in df["No_Stopwords"]
]
snowball_stemmed_list = [
    [snowball.stem(token) for token in row_tokens]
    for row_tokens in df["No_Stopwords"]
]

# Attach the results only after all three have been computed.
df["Porter_Stem"] = porter_stemmed_list
df["Lancaster_Stem"] = lancaster_stemmed_list
df["Snowball_Stem"] = snowball_stemmed_list


Step 4: Comparing different stemmers...


In [10]:
# Show the row labelled 0 under each stemmer, next to its unstemmed tokens.
print("Stemmer comparison for Doc_NO 1:")
comparison_columns = (
    ("Original (no stopwords):", "No_Stopwords"),
    ("Porter Stemmed:", "Porter_Stem"),
    ("Lancaster Stemmed:", "Lancaster_Stem"),
    ("Snowball Stemmed:", "Snowball_Stem"),
)
for heading, column_name in comparison_columns:
    print(heading, df[column_name][0])

Stemmer comparison for Doc_NO 1:
Original (no stopwords): ['experimental', 'investigation', 'aerodynamics', 'wing', 'slipstream', 'experimental', 'study', 'wing', 'propeller', 'slipstream', 'made', 'order', 'determine', 'spanwise', 'distribution', 'lift', 'increase', 'due', 'slipstream', 'different', 'angles', 'attack', 'wing', 'different', 'free', 'stream', 'slipstream', 'velocity', 'ratios', 'results', 'intended', 'part', 'evaluation', 'basis', 'different', 'theoretical', 'treatments', 'problem', 'comparative', 'span', 'loading', 'curves', 'together', 'supporting', 'evidence', 'showed', 'substantial', 'part', 'lift', 'increment', 'produced', 'slipstream', 'due', 'destalling', 'boundary', 'layer', 'control', 'effect', 'integrated', 'remaining', 'lift', 'increment', 'subtracting', 'destalling', 'lift', 'found', 'agree', 'well', 'potential', 'flow', 'theory', 'empirical', 'evaluation', 'destalling', 'effects', 'made', 'specific', 'configuration', 'experiment']
Porter Stemmed: ['experime

In [11]:
# Step 5: Join Snowball stemmed words into a string
print("\nStep 5: Joining Snowball stemmed words...")
# Each row's stem list becomes one space-separated string.
processed_text_list = [" ".join(row_stems) for row_stems in df["Snowball_Stem"]]
df["Processed_Text"] = processed_text_list
print("Processed Text for first 2 rows:")
print(df["Processed_Text"].head(2))


Step 5: Joining Snowball stemmed words...
Processed Text for first 2 rows:
0    experiment investig aerodynam wing slipstream ...
1    simpl shear flow past flat plate incompress fl...
Name: Processed_Text, dtype: object


In [12]:
# Step 6: Save the processed new CSV
print("\nStep 6: Saving to a new CSV...")
# Only these four columns are exported; the intermediate token and stem
# columns stay in `df` and are not written out.
export_columns = ["Doc_NO", "Title", "Bib", "Processed_Text"]
output_df = df[export_columns]
output_df.to_csv(output_file, index=False)  # index=False: no row-index column
print("Saved to", output_file)
print("Final output (first 2 rows):", output_df.head(2), sep="\n")


Step 6: Saving to a new CSV...
Saved to cran_preprocessed.csv
Final output (first 2 rows):
   Doc_NO                                              Title  \
0       1  experimental investigation of the aerodynamics...   
1       2  simple shear flow past a flat plate in an inco...   

                                                 Bib  \
0                         j. ae. scs. 25, 1958, 324.   
1  department of aeronautical engineering, rensse...   

                                      Processed_Text  
0  experiment investig aerodynam wing slipstream ...  
1  simpl shear flow past flat plate incompress fl...  


In [13]:
# Sanity check: report the number of rows held in `df`.
print("\nFinal check: How many documents processed?")
total_rows = len(df.index)
print("Total rows:", total_rows)


Final check: How many documents processed?
Total rows: 1400
