In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from IPython.display import clear_output

tqdm.pandas()
pd.options.display.max_colwidth = -1
pd.options.display.max_columns = 25

In [3]:
from dsutils.eda import *
from dsutils.stats import *
from dsutils.ml import *

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
STOP_WORDS = stopwords.words("english")
ps = PorterStemmer()

In [5]:
def preprocess(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    
    x = re.sub(r'\W', ' ', str(x))
    
    x = re.sub(r"\s+", ' ', str(x))
    
    x = ' '.join(ps.stem(word) for word in x.split())
    
    soup = BeautifulSoup(str(x))
    x = soup.get_text()

    x = ' '.join([word for word in x.split() if word not in STOP_WORDS])
    
    return x

In [6]:
q1train = pd.read_csv('fuzzy_features_clean.csv', usecols=['q1_clean'])

In [7]:
q2train = pd.read_csv('fuzzy_features_clean.csv', usecols=['q2_clean'])

In [8]:
q1test = pd.read_csv('test.csv', usecols=['question1'])
q1test['question1'] = q1test['question1'].progress_apply(preprocess)

HBox(children=(IntProgress(value=0, max=2345796), HTML(value='')))




In [9]:
q2test = pd.read_csv('test.csv', usecols=['question2'])
q2test['question2'] = q2test['question2'].progress_apply(preprocess)

HBox(children=(IntProgress(value=0, max=2345796), HTML(value='')))




In [10]:
q1train.columns = ['question']
q2train.columns = ['question']
q1test.columns = ['question']
q2test.columns = ['question']

In [11]:
questions = pd.concat([q1train, q2train, q1test, q2test]).reset_index(drop=True)

In [12]:
questions['len'] = questions['question'].progress_apply(lambda x : len(str(x).split()))
print(feature_distribution(questions, 'len', None))

HBox(children=(IntProgress(value=0, max=5499984), HTML(value='')))


         min  1st  25th  50th  75th  99th  max  mean   std
overall  0    1.0  4.0   5.0   7.0   17.0  108  5.99  3.38


In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [14]:
import pickle

In [15]:
print(len(questions))
questions.drop(questions[questions['len'] == 0].index, inplace=True)
questions.dropna(inplace=True)
print(len(questions))

5499984
5479881


In [16]:
cv = CountVectorizer(binary=True)
cv.fit(questions['question'])
pickle.dump(cv, open('BinaryCountVectorizer.pkl', 'wb'))

In [17]:
tv = TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True)
tv.fit(questions['question'])
pickle.dump(tv, open('TfidfVectorizer.pkl', 'wb'))