In [1]:
import pandas as pd

In [2]:
# Load the dataset from 'df_cleaned.csv' (path is relative to the working directory)
df_cleaned = pd.read_csv(filepath_or_buffer='df_cleaned.csv')

In [3]:
# Drop the redundant index column ('Unnamed: 0').
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time is a
# no-op instead of raising KeyError because the column is already gone.
df_cleaned = df_cleaned.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Preview the first rows of the frame (head() defaults to five rows)
df_cleaned.head()

Unnamed: 0,title,categories,abstract
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturba...
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-..."
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is descri...
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\L...


In [5]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import swifter

# Fetch the NLTK resources this notebook relies on, in a fixed order
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

# WordNet lemmatizer instance shared by the preprocessing function
lemmatizer = WordNetLemmatizer()

# Define function for text preprocessing
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is tokenized with ``word_tokenize``. Each token is lowercased and
    stripped of leading/trailing ASCII punctuation; tokens that end up empty
    or that are English stop words are discarded, and the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw string to clean. Non-string values (for example the NaN
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when no
        token survives the filtering.
    """
    # Missing cells arrive as float NaN; word_tokenize would raise on them.
    if not isinstance(text, str):
        return ''

    cleaned_tokens = []
    for token in word_tokenize(text):
        # Normalize first so the stop-word check sees the final form of the
        # token (e.g. "'s" becomes "s" only after punctuation is stripped).
        token = token.lower().strip(string.punctuation)
        # Pure-punctuation tokens collapse to '' here; keeping them would put
        # runs of consecutive spaces into the joined result.
        if not token or token in stop_words:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))

    # Join words back into a single string
    return ' '.join(cleaned_tokens)


def lowercase_categories(categories):
    """Return the given category string converted to lowercase."""
    return categories.lower()


# Apply text preprocessing to the 'abstract' and 'title' columns using swifter
df_cleaned['cleaned_abstract'] = df_cleaned['abstract'].swifter.apply(preprocess_text)
df_cleaned['cleaned_title'] = df_cleaned['title'].swifter.apply(preprocess_text)

# Lowercase the 'categories' column with the vectorized string accessor.
# Unlike a row-wise apply of a Python function, missing values pass through
# as NaN instead of raising AttributeError.
df_cleaned['cleaned_categories'] = df_cleaned['categories'].str.lower()

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Pandas Apply:   0%|          | 0/2431235 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2431235 [00:00<?, ?it/s]

In [6]:
# Display the DataFrame, now including the cleaned_* columns added above
# (pandas truncates the rendering of a large frame to its first and last rows)
df_cleaned

Unnamed: 0,title,categories,abstract,cleaned_abstract,cleaned_title,cleaned_categories
0,Calculation of prompt diphoton production cros...,hep-ph,A fully differential calculation in perturba...,fully differential calculation perturbative qu...,calculation prompt diphoton production cross s...,hep-ph
1,Sparsity-certifying Graph Decompositions,math.CO cs.CG,"We describe a new algorithm, the $(k,\ell)$-...",describe new algorithm k ell pebble game...,sparsity-certifying graph decomposition,math.co cs.cg
2,The evolution of the Earth-Moon system based o...,physics.gen-ph,The evolution of Earth-Moon system is descri...,evolution earth-moon system described dark mat...,evolution earth-moon system based dark matter ...,physics.gen-ph
3,A determinant of Stirling cycle numbers counts...,math.CO,We show that a determinant of Stirling cycle...,show determinant stirling cycle number count u...,determinant stirling cycle number count unlabe...,math.co
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,math.CA math.FA,In this paper we show how to compute the $\L...,paper show compute lambda alpha norm alp...,dyadic lambda alpha lambda alpha,math.ca math.fa
...,...,...,...,...,...,...
2431230,On the origin of the irreversibility line in t...,supr-con cond-mat.supr-con,We report on measurements of the angular dep...,report measurement angular dependence irrevers...,origin irreversibility line thin ybacuo7 film ...,supr-con cond-mat.supr-con
2431231,Nonlinear Response of HTSC Thin Film Microwave...,supr-con cond-mat.supr-con,The non-linear microwave surface impedance o...,non-linear microwave surface impedance pattern...,nonlinear response htsc thin film microwave re...,supr-con cond-mat.supr-con
2431232,Critical State Flux Penetration and Linear Mic...,supr-con cond-mat.supr-con,The vortex contribution to the dc field (H) ...,vortex contribution dc field h dependent mic...,critical state flux penetration linear microwa...,supr-con cond-mat.supr-con
2431233,Density of States and NMR Relaxation Rate in A...,supr-con cond-mat.supr-con,We show that the density of states in an ani...,show density state anisotropic superconductor ...,density state nmr relaxation rate anisotropic ...,supr-con cond-mat.supr-con


In [7]:
# Keep only the preprocessed columns and write them to 'df_cleaned2.csv'.
# NOTE: to_csv is called with its defaults, so the index is written as an
# unnamed first column of the file.
cleaned_columns = ['cleaned_title', 'cleaned_abstract', 'cleaned_categories']
df_cleaned = df_cleaned[cleaned_columns]
df_cleaned.to_csv('df_cleaned2.csv')