In [None]:
import spacy
import pandas as pd
import re
from tqdm import tqdm

import os.path
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



tqdm.pandas()

In [None]:
# Regex tokenizer: every token is a maximal run of word characters (\w+)
tokenizer = RegexpTokenizer(r'\w+')
# English stop words from NLTK, kept in a set for membership tests
en_stop = set(stopwords.words('english'))
# Single PorterStemmer instance reused by preprocess_stem
p_stemmer = PorterStemmer()

# Dataset Load

Model Param

In [None]:
# Input folder (raw CSVs) and output folder (preprocessed CSVs)
path = "Docs/"
path_cl = "ProcDocs/"

# Raw inputs
docs = f"{path}all_docs.csv"
queries = f"{path}dev_queries.csv"

# Preprocessed outputs
save_docs = f"{path_cl}all_docs_1.csv"
save_queries = f"{path_cl}dev_queries_1.csv"

In [None]:
# Create the output folder; exist_ok=True keeps the cell re-runnable
# (os.mkdir raises FileExistsError when the folder is already there).
os.makedirs(path_cl, exist_ok=True)

In [None]:
# Load the documents, force doc_text to str, and keep the first row per doc_number
docs_df = (
    pd.read_csv(docs)
    .assign(doc_text=lambda frame: frame.doc_text.apply(str))
    .drop_duplicates('doc_number')
    .reset_index(drop=True)
)

In [None]:
# Load the development queries from the CSV at `queries`
queries_df = pd.read_csv(queries)

In [None]:
# Lucene retrievals and ground-truth data for the dev set.
# Built from `path` so these inputs follow the same folder setting as `docs` and `queries`.
lucene = pd.read_csv(path + 'raw_dev_Lucene_retrievals.csv')
g_thruth = pd.read_csv(path + 'dev_data.csv')

## Select only relevant docs

In [None]:
# doc_number values listed in the ground-truth frame (duplicates are kept)
list_docs = list(g_thruth['doc_number'])

In [None]:
# Keep only the documents whose doc_number appears in list_docs, then renumber the rows
docs_df = docs_df[docs_df.doc_number.isin(list_docs)].reset_index(drop=True)

# General

In [None]:
# Dictionary of English contractions: maps a contraction to its expanded form.
# Keys are lower-case; std_preprocess lowercases the text before expanding.
# The keys are also the alternatives of the regex that finds contractions, and the
# values are the replacement strings looked up by expand_contractions.
# Note: "'s" maps to " is", so a possessive "'s" is expanded the same way.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}

In [None]:
# Regular expression for finding contractions.
# Regex alternation takes the FIRST alternative that matches at a position, so the
# keys are ordered longest-first; otherwise "can't" would win over "can't've" and
# leave a dangling "'ve" in the text. Keys are escaped so they are matched literally.
contractions_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)
    )
)

In [None]:
def expand_contractions(text,contractions_dict=contractions_dict):
    """Return ``text`` with every match of ``contractions_re`` replaced by the
    value stored for the matched string in ``contractions_dict``."""
    return contractions_re.sub(lambda found: contractions_dict[found.group(0)], text)

In [None]:
# Removes digit-bearing words, links, non-letters and repeated spaces
def clean_text(text):
    """Strip noise from an already lower-cased string.

    Steps, in order:
      1. delete every word that contains at least one digit,
      2. turn newlines into spaces,
      3. delete anything starting with "http" up to the next whitespace,
      4. replace every character outside a-z with a space,
      5. collapse runs of spaces into one space.

    The result may still start or end with a single space.
    Patterns are raw strings so that ``\\w``, ``\\d`` and ``\\S`` are not treated as
    (invalid) string escape sequences.
    """
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'[^a-z]', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text

In [None]:
def std_preprocess(text):
    """Standard preprocessing: stringify and lower-case ``text``, expand its
    contractions, then run it through ``clean_text``."""
    lowered = str(text).lower()
    return clean_text(expand_contractions(lowered))

In [None]:
# Calculates term frequencies
word_freq = dict()
def calculate_freq_dict(texts):
    """Count how many times each token occurs across ``texts``.

    Args:
        texts: iterable of token sequences (one sequence per document).

    Returns:
        dict mapping each token to its total number of occurrences.
    """
    frequency = dict()
    for text in tqdm(texts):
        for token in text:
            # get() yields 0 for an unseen token, so its first occurrence counts as 1
            frequency[token] = frequency.get(token, 0) + 1
    return frequency


In [None]:
# Remove terms with a low frequency
def remove_insignificant(text, word_freq = None, min_c = 2):
    """Drop from ``text`` every whitespace-separated term seen fewer than ``min_c`` times.

    Args:
        text: string of space-separated terms.
        word_freq: mapping term -> count. When omitted, the module-level
            ``word_freq`` is looked up at call time.
        min_c: minimum count a term needs in order to be kept.

    Returns:
        The kept terms joined by single spaces. Terms missing from
        ``word_freq`` are treated as having count 0.
    """
    if word_freq is None:
        # Resolve the table when the function is called: a default of the form
        # `word_freq = word_freq` is evaluated once at definition time and would
        # keep pointing at the dict that existed then, not at a later reassignment.
        word_freq = globals()["word_freq"]
    return " ".join(term for term in text.split() if word_freq.get(term, 0) >= min_c)

# Preprocess - Stem

In [None]:
# Stemming, with optional stop-word removal
def preprocess_stem(text, stop = True):
    """Tokenize ``text``, optionally drop tokens found in ``en_stop`` (when
    ``stop`` is true), stem what is left and join the stems with spaces."""
    words = tokenizer.tokenize(text)
    if stop:
        words = [word for word in words if word not in en_stop]
    return " ".join(p_stemmer.stem(word) for word in words)

In [None]:
# Documents: overwrite doc_text with its standard-preprocessed version
docs_df.doc_text = docs_df.doc_text.progress_apply(std_preprocess) #apply standard preprocess
# Queries: keep the raw Query column and store the cleaned text in a new column cl_q
queries_df['cl_q'] = queries_df.Query.progress_apply(std_preprocess)

In [None]:
# Remove words with freq < 2
all_text = list(docs_df.doc_text.progress_apply(lambda x: x.split()))
word_freq = calculate_freq_dict(all_text)
# Pass the freshly computed table explicitly (as the test-data cell further down does)
# so the filter never depends on the default argument of remove_insignificant.
docs_df.doc_text = docs_df.doc_text.progress_apply(lambda x: remove_insignificant(x, word_freq))

In [None]:
# Documents: stem with the default stop=True, i.e. stop words are removed
docs_df['stem_text'] = docs_df.doc_text.progress_apply(preprocess_stem) #apply stemming
# Queries: stem with stop=False, i.e. stop words are kept
queries_df['stem_q'] = queries_df.cl_q.progress_apply(lambda x: preprocess_stem(x, False))

# Preprocess - Lem

In [None]:
# Load the 'en_core_web_sm' spaCy model with the 'ner' and 'parser' components disabled
nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
# Set nlp.max_length to 5,000,000 - presumably so long documents are not rejected; confirm against the spaCy docs
nlp.max_length=5000000
# Last expression of the cell: displays the remaining pipeline components
nlp.pipeline

In [None]:
# Lemmatization, with optional stop-word removal
def preprocess_lem(text, stop = True):
    """Run ``str(text)`` through the ``nlp`` pipeline and return the token lemmas
    joined by spaces; tokens flagged ``is_stop`` are skipped when ``stop`` is true."""
    parsed = nlp(str(text))
    if stop:
        lemmas = [tok.lemma_ for tok in parsed if not tok.is_stop]
    else:
        lemmas = [tok.lemma_ for tok in parsed]
    return ' '.join(lemmas)

In [None]:
# Documents: lemmatize with the default stop=True, i.e. stop words are removed
docs_df['lem_text'] = docs_df.doc_text.progress_apply(preprocess_lem)
# Queries: lemmatize with stop=False, i.e. stop words are kept
queries_df['lem_q'] = queries_df.cl_q.progress_apply(lambda x: preprocess_lem(x, False))

# Save Preprocessed Files

In [None]:
# Write both preprocessed frames to the output folder, without the row index
docs_df.to_csv(save_docs, index = False)
queries_df.to_csv(save_queries, index = False)

# Apply preprocess to test_data.csv

In [None]:
# Raw test set; the cells below read its Query_number, Query, doc_number and doc_text columns
test_data = pd.read_csv('Docs/test_data.csv')

In [None]:
# Training documents; this file is not written by this notebook, so it must already exist.
# Its doc_number column is used below to exclude those documents from the test documents.
train_docs = pd.read_csv('ProcDocs/Split_0.2/docs_train.csv')

In [None]:
# (Query_number, doc_number) pairs of the test set, copied as-is (no de-duplication)
test_data_lucene = test_data[['Query_number', 'doc_number']].copy().reset_index(drop=True)

In [None]:
# One row per test query: Query cast to str, Query_number cast to int,
# first occurrence of each Query_number kept
test_data_query = (
    test_data[['Query_number', 'Query']]
    .copy()
    .reset_index(drop=True)
    .assign(Query=lambda frame: frame.Query.apply(str))
    .assign(Query_number=lambda frame: frame.Query_number.apply(int))
    .drop_duplicates('Query_number')
    .reset_index(drop=True)
)

In [None]:
# One row per test document: doc_text cast to str, doc_number cast to int,
# first occurrence of each doc_number kept
test_data_doc = (
    test_data[['doc_number', 'doc_text']]
    .copy()
    .reset_index(drop=True)
    .assign(doc_text=lambda frame: frame.doc_text.apply(str))
    .assign(doc_number=lambda frame: frame.doc_number.apply(int))
    .drop_duplicates('doc_number')
    .reset_index(drop=True)
)

In [None]:
# Drop the test documents whose doc_number also appears in train_docs, then renumber the rows
test_data_doc = test_data_doc[~(test_data_doc.doc_number.isin(list(train_docs.doc_number.unique())))].reset_index(drop=True)

In [None]:
# Apply the standard preprocessing in place to the test queries and test documents
test_data_query.Query = test_data_query.Query.progress_apply(std_preprocess)
test_data_doc.doc_text = test_data_doc.doc_text.progress_apply(std_preprocess)

In [None]:
# Rebuild the frequency table from the test documents only (this overwrites the
# module-level word_freq) and drop their rare words using the default min_c of 2
all_text = list(test_data_doc.doc_text.progress_apply(lambda x: x.split()))
word_freq = calculate_freq_dict(all_text)
test_data_doc.doc_text = test_data_doc.doc_text.progress_apply(lambda x: remove_insignificant(x, word_freq))

In [None]:
# Test documents: lemmatize with the default stop=True, i.e. stop words are removed
test_data_doc['lem_text'] = test_data_doc.doc_text.progress_apply(preprocess_lem)

In [None]:
# Test queries: lemmatize with stop=False, i.e. stop words are kept
test_data_query['lem_q'] = test_data_query.Query.progress_apply(lambda x: preprocess_lem(x, False))

In [None]:
# Make sure the target folder exists before writing: to_csv does not create
# missing directories and this notebook creates no "ProcDocs/test_data" elsewhere.
os.makedirs("ProcDocs/test_data", exist_ok=True)
test_data_doc.to_csv("ProcDocs/test_data/test_data_doc.csv", index = False)
test_data_query.to_csv("ProcDocs/test_data/test_data_query.csv", index = False)
test_data_lucene.to_csv("ProcDocs/test_data/test_data_lucene.csv", index = False)