# Application of TF-IDF - on top of Stemming Text Preprocessing 

## Import Required Libraries

In [2]:
import pandas as pd
import numpy as np

import re
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('stopwords')
# Downloading wordnet before applying Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# For serialization and de-serialization
import numpy as np
from pickle import load
from pickle import dump
# VIZ Style 
sns.set_style('whitegrid')
plt.style.use('bmh')

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# for HD visualizations
%config InlineBackend.figure_format='retina'

[nltk_data] Downloading package stopwords to C:\Users\GUDLA
[nltk_data]     RAGUWING\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package wordnet to C:\Users\GUDLA
[nltk_data]     RAGUWING\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

[nltk_data] Downloading package omw-1.4 to C:\Users\GUDLA
[nltk_data]     RAGUWING\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Note:
* Here in this code file, first we are going to load the cleaned csv file that we obtained from after performing EDA on the original csv file. We will be applying `Stemming` and on top of Stemming we will apply `TF-IDF - Term frequency and inverse document frequency`.

In [101]:
df_cleaned = pd.read_csv(r"C:\Users\GUDLA RAGUWING\Data Science Course\Internship_Project\cleaned_df.csv") # Load the data

In [102]:
y = df_cleaned['is_duplicate']
X = df_cleaned[['question1','question2']] # Rearrange the columns

In [103]:
from sklearn.model_selection import train_test_split
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(283003, 2)

(121287, 2)

(283003,)

(121287,)

## Applying Stemming Text Preprocessing technique on Train Data

In [9]:
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()

In [10]:
def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [11]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
'''tqdm.pandas() is a method provided by tqdm library  that allows you to create/apply progress bars to pandas operations.
Works for pandas series as well as DataFrame, 
you can visualize the progress of your operations and get an estimate amount of time to complete the pandas task'''

'tqdm.pandas() is a method provided by tqdm library  that allows you to create/apply progress bars to pandas operations.\nWorks for pandas series as well as DataFrame, \nyou can visualize the progress of your operations and get an estimate amount of time to complete the pandas task'

In [12]:
temp_df = X_train['question1'].progress_apply(lambda x: preprocess(x, 'stem'))

temp_df.head()

100%|█████████████████████████████████████████████████████████████████████████| 283003/283003 [16:32<00:00, 285.05it/s]


Unnamed: 0,0,1
20128,work environ sbi life mumbai,5
296237,us citizen work canada,4
107095,benefit wash hand soap,4
27940,holi scriptur hinduism compar contrast taoism,6
251434,humanoid shape inevit speci similar intellig a...,10


In [14]:
temp_df1 = X_train['question2'].progress_apply(lambda x: preprocess(x, 'stem'))

temp_df1.head()

100%|█████████████████████████████████████████████████████████████████████████| 283003/283003 [14:00<00:00, 336.82it/s]


Unnamed: 0,0,1
20128,stress work sbi clerk,4
296237,us graduat degre help non us citizen get job c...,14
107095,import wash hand soap,4
27940,holi scriptur hinduism compar contrast italo r...,8
251434,rna evolv inevit eventu intellig life form pla...,14


In [15]:
train_stem = pd.concat([temp_df,temp_df1], axis = 1)

In [16]:
train_stem.columns = ['Text_stem_Q1','Text_len_Q2','Text_stem_Q2','Text_len_Q2']

In [17]:
train_stem.head()

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
20128,work environ sbi life mumbai,5,stress work sbi clerk,4
296237,us citizen work canada,4,us graduat degre help non us citizen get job c...,14
107095,benefit wash hand soap,4,import wash hand soap,4
27940,holi scriptur hinduism compar contrast taoism,6,holi scriptur hinduism compar contrast italo r...,8
251434,humanoid shape inevit speci similar intellig a...,10,rna evolv inevit eventu intellig life form pla...,14


In [18]:
train_stem.isna().sum()

Text_stem_Q1    0
Text_len_Q2     0
Text_stem_Q2    0
Text_len_Q2     0
dtype: int64

In [24]:
train_stem[train_stem.Text_stem_Q1 == '']

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
215415,,0,possibl broader chest shoulder skeleton frame ...,13
360644,,0,ever ask exist,3
296089,,0,good compani studi digit social media market,7
144890,,0,pro con olymp lift powerlift athlet train simu...,8
185405,,0,get hurt footbal new england patriot game,7
...,...,...,...,...
216877,,0,answer zero divid zero infin,5
329933,,0,better moral exampl muhammad jesu,5
297572,,0,,0
392095,,0,equal,1


In [20]:
train_stem.shape

(283003, 4)

## Observations:
* After applying the lemmatization feature extraction some of the questions which are of no important i.e. stop words will be removed and we will get a value of zero. So when we load the data using pandas we will get null values.

In [21]:
train_stem.to_csv('stem_pre_df.csv',index=False)

## Applying Lemmatization feature extraction technique on Test Data

In [22]:
temp_df2 = X_test['question1'].progress_apply(lambda x: preprocess(x, 'stem'))

temp_df2.head()

100%|█████████████████████████████████████████████████████████████████████████| 121287/121287 [04:55<00:00, 411.08it/s]


Unnamed: 0,0,1
8067,play pok mon go korea,5
368101,best side dish crab cake,5
70497,advis better materi crash test automobil ducti...,8
226567,improv logic program skill,4
73186,close see rd world war,5


In [23]:
temp_df3 = X_test['question2'].progress_apply(lambda x: preprocess(x, 'stem'))

temp_df3.head()

100%|█████████████████████████████████████████████████████████████████████████| 121287/121287 [05:13<00:00, 387.27it/s]


Unnamed: 0,0,1
8067,play pok mon go china,5
368101,good side dish buffalo chicken,5
70497,best server setup buddypress,4
226567,improv logic skill program,4
73186,close world war iii,4


In [27]:
test_stem = pd.concat([temp_df2,temp_df3], axis = 1)

In [28]:
test_stem.columns = ['Text_stem_Q1','Text_len_Q2','Text_stem_Q2','Text_len_Q2']

In [29]:
test_stem

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
8067,play pok mon go korea,5,play pok mon go china,5
368101,best side dish crab cake,5,good side dish buffalo chicken,5
70497,advis better materi crash test automobil ducti...,8,best server setup buddypress,4
226567,improv logic program skill,4,improv logic skill program,4
73186,close see rd world war,5,close world war iii,4
...,...,...,...,...
35923,interest app program mobil phone comput,6,interest app program mobil phone comput,6
307141,amphiboli exampl,2,amphiboli exampl,2
295384,improv english speak,3,continu improv english,3
320598,metal extract electrolysi,3,metal extract electrolysi,3


In [31]:
test_stem.isna().sum()

Text_stem_Q1    0
Text_len_Q2     0
Text_stem_Q2    0
Text_len_Q2     0
dtype: int64

In [34]:
test_stem[test_stem.Text_stem_Q1 == '']

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
198913,,0,,0
230050,,0,,0
302123,,0,,0
246126,,0,,0
301583,,0,guy,1
230743,,0,good safe place near st loui mo easi public tr...,11
190570,,0,time money one explain work,5
7368,,0,,0
52880,,0,,0
106766,,0,next number,2


In [36]:
test_stem.shape

(121287, 4)

## Observations:
* After applying the lemmatization feature extraction some of the questions which are of no important i.e. stop words will be removed and we will get a value of zero. So when we load the data using pandas we will get null values.

In [37]:
test_stem.to_csv('stem_pre_df1.csv',index=False)

## Handling Null Values on Train

In [104]:
tfidf_train = pd.read_csv(r"C:\Users\GUDLA RAGUWING\Data Science Course\Internship_Project\stem_pre_df.csv")

In [105]:
tfidf_train.isna().sum()

Text_stem_Q1     122
Text_len_Q2        0
Text_stem_Q2     122
Text_len_Q2.1      0
dtype: int64

In [106]:
tfidf_train[(tfidf_train.Text_stem_Q1.isna()) | (tfidf_train.Text_stem_Q2.isna())] .head()

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
294,valu,1,,0
665,g mean,2,,0
1207,,0,possibl broader chest shoulder skeleton frame ...,13
2210,,0,ever ask exist,3
4122,,0,good compani studi digit social media market,7


In [107]:
drop = tfidf_train[(tfidf_train.Text_stem_Q1.isna()) | (tfidf_train.Text_stem_Q2.isna())].index.to_list()
tfidf_train.drop(drop,axis = 0,inplace = True)
tfidf_train.reset_index(inplace = True,drop = 'first' )

In [108]:
tfidf_train.shape

(282811, 4)

In [109]:
y_train = pd.DataFrame(data = y_train) # Convert y_train to dataframe
y_train.reset_index(inplace = True, drop = 'first')

In [110]:
y_train.drop(drop,axis = 0,inplace = True) # Drop the labels of the null value indexes of tfidf_train
y_train.reset_index(inplace = True,drop = 'first' )

In [111]:
y_train.shape

(282811, 1)

### DataFrame.squeeze() 

> You can then use DataFrame.squeeze() to convert the DataFrame with a single column into a Series:

In [112]:
y_train = y_train.squeeze()
y_train.shape

(282811,)

## Handling Null values on test

In [113]:
tfidf_test = pd.read_csv(r"C:\Users\GUDLA RAGUWING\Data Science Course\Internship_Project\stem_pre_df1.csv")

In [114]:
tfidf_test.head()

Unnamed: 0,Text_stem_Q1,Text_len_Q2,Text_stem_Q2,Text_len_Q2.1
0,play pok mon go korea,5,play pok mon go china,5
1,best side dish crab cake,5,good side dish buffalo chicken,5
2,advis better materi crash test automobil ducti...,8,best server setup buddypress,4
3,improv logic program skill,4,improv logic skill program,4
4,close see rd world war,5,close world war iii,4


In [115]:
tfidf_test.isna().sum()

Text_stem_Q1     55
Text_len_Q2       0
Text_stem_Q2     53
Text_len_Q2.1     0
dtype: int64

In [116]:
drop1 = tfidf_test[(tfidf_test.Text_stem_Q1.isna())].index.to_list()
tfidf_test.drop(drop1,axis = 0,inplace = True)
tfidf_test.reset_index(inplace = True,drop = 'first' )

In [117]:
tfidf_test.isna().sum()

Text_stem_Q1      0
Text_len_Q2       0
Text_stem_Q2     31
Text_len_Q2.1     0
dtype: int64

In [118]:
y_test = pd.DataFrame(data = y_test)
y_test.reset_index(inplace = True, drop = 'first')

In [119]:
y_test.drop(drop1,axis = 0,inplace = True) # Drop the labels of the null value indexes of tfidf_test
y_test.reset_index(inplace = True,drop = 'first' )

In [120]:
y_test.shape

(121232, 1)

In [121]:
drop2=tfidf_test[(tfidf_test.Text_stem_Q2.isna())].index.to_list()
tfidf_test.drop(drop2,axis = 0,inplace = True)
tfidf_test.reset_index(inplace = True,drop = 'first' )

In [122]:
tfidf_test.isna().sum()

Text_stem_Q1     0
Text_len_Q2      0
Text_stem_Q2     0
Text_len_Q2.1    0
dtype: int64

In [123]:
tfidf_test.shape

(121201, 4)

In [124]:
y_test.drop(drop2,axis = 0,inplace = True) # Drop the labels of the null value indexes of tfidf_test
y_test.reset_index(inplace = True,drop = 'first' )

In [125]:
y_test.shape

(121201, 1)

In [126]:
y_test = y_test.squeeze()

In [127]:
y_test.shape

(121201,)

### Importing TfidfVectorizer 

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

vocab = TfidfVectorizer()


### Applying TFIDF on Train data

In [129]:
X_train_tfidf = vocab.fit_transform(tfidf_train['Text_stem_Q1']+tfidf_train['Text_stem_Q2'])

In [130]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_train_tfidf))

print("Shape of input data:", X_train_tfidf.shape)

Total unique words: 207633
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>
Shape of input data: (282811, 207633)


In [131]:
X_train_tfidf

<282811x207633 sparse matrix of type '<class 'numpy.float64'>'
	with 2212474 stored elements in Compressed Sparse Row format>

### Applying TFIDF on Test data

In [132]:
X_test_tfidf = vocab.transform(tfidf_test['Text_stem_Q1']+tfidf_test['Text_stem_Q2'])

In [133]:
print("Total unique words:", len(vocab.vocabulary_))

print("Type of train features:", type(X_test_tfidf))

print("Shape of input data:", X_test_tfidf.shape)

Total unique words: 207633
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>
Shape of input data: (121201, 207633)


In [134]:
X_test_tfidf

<121201x207633 sparse matrix of type '<class 'numpy.float64'>'
	with 883710 stored elements in Compressed Sparse Row format>

### Serializing the sparse matrix's of bow representation

In [136]:
#dump(X_test_tfidf, open("D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\X_train_tfidf.pkl", 'wb'))
#dump(X_train_tfidf, open("D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\X_test_tfidf.pkl", 'wb'))

In [137]:
#dump(y_train, open(r"D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\y_train.pkl", 'wb'))
#dump(y_test, open(r"D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\y_test.pkl", 'wb'))

### De - Serializing the sparse matrix's of bow representation

In [138]:
# Loading pretrained objects from pickle file

X_train_tfidf = load(open("D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\X_train_tfidf.pkl", 'rb'))
X_test_tfidf = load(open("D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\X_test_tfidf.pkl", 'rb'))
y_train = load(open(r"D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\y_train.pkl", 'rb'))
y_test = load(open(r"D:\Deep_learning\internship project\TFIDF's\TFIDF-objects\y_test.pkl", 'rb'))

In [139]:
X_train_tfidf
X_test_tfidf
y_train
y_test

<121201x207633 sparse matrix of type '<class 'numpy.float64'>'
	with 883710 stored elements in Compressed Sparse Row format>

<282811x207633 sparse matrix of type '<class 'numpy.float64'>'
	with 2212474 stored elements in Compressed Sparse Row format>

0         0
1         0
2         1
3         0
4         0
         ..
282806    0
282807    1
282808    1
282809    0
282810    1
Name: is_duplicate, Length: 282811, dtype: int64

0         0
1         0
2         0
3         1
4         1
         ..
121196    0
121197    1
121198    1
121199    1
121200    0
Name: is_duplicate, Length: 121201, dtype: int64