In [1]:
import nltk, re
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ritar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ritar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def preprocesser(text: str) -> str:
    """
    Normalise a piece of text for indexing.

    Line breaks and punctuation are replaced by spaces, the text is
    lower-cased and tokenised, tokens found in the module-level
    ``stop_words`` set are dropped and every remaining token is reduced
    to its English Snowball stem.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. The result is an empty
        string when no token survives (e.g. the input is only punctuation).
    """
    snow_stemmer = SnowballStemmer(language='english')

    text = re.sub(r'\n|\r', ' ', text)         # Removes line breaks
    text = re.sub(r'[^\w\s]', ' ', text)       # Removes punctuation
    words = word_tokenize(text.lower())        # Tokenizes the text

    # Filter against stop_words first so that only the kept tokens are stemmed.
    return " ".join(snow_stemmer.stem(w) for w in words if w not in stop_words)

In [None]:
# Load the raw corpus; lines=True reads the file as one JSON record per line.
test = pd.read_json("./data/corpus.jsonl", lines=True)
# Preview the first rows of the loaded frame.
test.head()

Unnamed: 0,id,title,text,keywords
0,1,!!!,!!! is a dance-punk band that formed in Sacram...,"[1996 establishments in California, American i..."
1,2,!!! (album),!!! is the eponymous debut album by !!!. It wa...,"[!!! albums, 2001 debut albums, English-langua..."
2,3,!!Destroy-Oh-Boy!!,!!Destroy-Oh-Boy!! is the debut album by the A...,"[1993 debut albums, Crypt Records albums, Engl..."
3,4,!Action Pact!,"!Action Pact! were a punk rock band, formed in...","[English punk rock groups, Musical groups dise..."
4,5,!Arriba! La Pachanga,!Arriba! La Pachanga is an album by Mongo Sant...,[1961 albums]


In [None]:
def parse_documents(documents):
    """
    Return a preprocessed copy of the corpus DataFrame.

    The ``id`` column is renamed to ``docno``. ``title`` and ``text`` are
    passed through ``preprocesser``; every entry of ``keywords`` is treated
    as an iterable of strings whose elements are preprocessed one by one
    and joined with single spaces.

    The input frame is left untouched: all work happens on the copy
    returned by ``rename``, so calling this function again on the same raw
    frame gives the same result instead of stemming already-stemmed text.

    Parameters
    ----------
    documents : pandas.DataFrame
        Frame with at least the columns ``id``, ``title``, ``text`` and
        ``keywords``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``docno``, ``title``, ``text`` and ``keywords``
        holding the preprocessed strings.
    """
    # rename() without inplace=True hands back a new frame to work on.
    parsed = documents.rename(columns={'id': 'docno'})

    parsed['title'] = parsed['title'].apply(preprocesser)
    parsed['text'] = parsed['text'].apply(preprocesser)
    parsed['keywords'] = parsed['keywords'].apply(
        lambda kws: " ".join([preprocesser(el) for el in kws])
    )

    return parsed

# Run the loaded corpus frame through parse_documents and preview the result.
documents = parse_documents(test)
documents.head()

Unnamed: 0,docno,title,text,keywords
0,1,,danc punk band form sacramento california 1996...,1996 establish california american indi rock g...
1,2,album,eponym debut album releas 2001 gold standard l...,album 2001 debut album english languag album
2,3,destroy oh boy,destroy oh boy debut album american garag punk...,1993 debut album crypt record album english la...
3,4,action pact,action pact punk rock band form 1981 bad samar...,english punk rock group music group disestabli...
4,5,arriba la pachanga,arriba la pachanga album mongo santamaría publ...,1961 album


In [None]:
# Merge the three preprocessed fields into a single string per document.
# A field can be empty after preprocessing (e.g. a title made only of
# punctuation), which would otherwise leave a leading or doubled space, so
# the whitespace is normalised again after concatenation.
documents['new_text'] = (
    documents['title'] + " " + documents['text'] + " " + documents['keywords']
).str.split().str.join(" ")
documents.head()

Unnamed: 0,docno,title,text,keywords,new_text
0,1,,danc punk band form sacramento california 1996...,1996 establish california american indi rock g...,danc punk band form sacramento california 199...
1,2,album,eponym debut album releas 2001 gold standard l...,album 2001 debut album english languag album,album eponym debut album releas 2001 gold stan...
2,3,destroy oh boy,destroy oh boy debut album american garag punk...,1993 debut album crypt record album english la...,destroy oh boy destroy oh boy debut album amer...
3,4,action pact,action pact punk rock band form 1981 bad samar...,english punk rock group music group disestabli...,action pact action pact punk rock band form 19...
4,5,arriba la pachanga,arriba la pachanga album mongo santamaría publ...,1961 album,arriba la pachanga arriba la pachanga album mo...


In [None]:
# Show the complete combined text for the row labelled 3 (head() truncates it).
documents.loc[3, 'new_text']

'action pact action pact punk rock band form 1981 bad samaritan guitarist wild planet bassist dr phibe drummer joe fungus english punk rock group music group disestablish 1986'

In [None]:
# The individual fields are now merged into new_text, so remove them
# from the frame (in place) and preview what is left.
documents.drop(columns=['title', 'text', 'keywords'], inplace=True)
documents.head()

Unnamed: 0,docno,new_text
0,1,danc punk band form sacramento california 199...
1,2,album eponym debut album releas 2001 gold stan...
2,3,destroy oh boy destroy oh boy debut album amer...
3,4,action pact action pact punk rock band form 19...
4,5,arriba la pachanga arriba la pachanga album mo...


In [None]:
# Write the parsed corpus to disk; index=False leaves the row index out of the CSV.
documents.to_csv("./data/parsed_corpus.csv", index=False)

## Preprocessing the queries

In [17]:
# Load the training queries and preview the first rows.
train_queries = pd.read_csv("./data/train_queries.csv")
train_queries.head()

Unnamed: 0,QueryId,Query
0,1,szechwan dish food cuisine
1,3,finland car industry manufacturer saab sisu
2,5,social network group selection
3,7,web ranking scoring algorithm
4,9,europe solar power facility


In [18]:
  # Apply the same preprocessing to every training query
  # (the function is passed directly; no wrapper lambda needed).
  train_queries['Query'] = train_queries['Query'].apply(preprocesser)
  train_queries.head()

Unnamed: 0,QueryId,Query
0,1,szechwan dish food cuisin
1,3,finland car industri manufactur saab sisu
2,5,social network group select
3,7,web rank score algorithm
4,9,europ solar power facil


In [19]:
# Write the preprocessed training queries to disk without the row index.
train_queries.to_csv("./data/parsed_train_queries.csv", index=False)

In [20]:
# Load the test queries and preview the first rows.
test_queries = pd.read_csv("./data/test_queries.csv")
test_queries.head()

Unnamed: 0,QueryId,Query
0,2,roman architecture
1,4,france second world war normandy
2,6,d-day normandy invasion
3,8,eiffel
4,11,indian food


In [21]:
  # Preprocess the test queries. The source column must be test_queries
  # itself: no cell in this notebook defines a `queries` frame, so reading
  # from that name fails on a fresh kernel (or silently reuses stale data).
  test_queries['Query'] = test_queries['Query'].apply(preprocesser)
  test_queries.head()

Unnamed: 0,QueryId,Query
0,2,roman architectur
1,4,franc second world war normandi
2,6,day normandi inva
3,8,eiffel
4,11,indian food


In [22]:
# Write the preprocessed test queries to disk without the row index.
test_queries.to_csv("./data/parsed_test_queries.csv", index=False)