<a href="https://colab.research.google.com/github/Vikas-KM/quora-question-pair/blob/main/solving_quora_question_pair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read train.csv straight out of the zip archive. The context managers close
# both the archive and the member stream once pandas has parsed the data,
# instead of leaving the file handles open for the rest of the session.
with zipfile.ZipFile('/kaggle/input/quora-question-pairs/train.csv.zip') as zf:
    with zf.open('train.csv') as train_file:
        df_train = pd.read_csv(train_file)
df_train.head()

### Examples of non-duplicate and duplicate question pairs

In [None]:
# Change the positional index passed to iloc to look at other question pairs
# and get an idea of what the data looks like.
df1 = df_train[df_train['is_duplicate'] == 0]  # rows labelled 0
df2 = df_train[df_train['is_duplicate'] == 1]  # rows labelled 1

pair_cols = ['question1', 'question2', 'is_duplicate']
qstn1, qstn2, is_dup1 = df1.iloc[0][pair_cols]
qstn3, qstn4, is_dup2 = df2.iloc[0][pair_cols]

# Print one example of each label: both questions followed by the label.
for first, second, label in ((qstn1, qstn2, is_dup1), (qstn3, qstn4, is_dup2)):
    print(first)
    print(second)
    print('are they duplicates? ', label)

## basic analysis on the train data

In [None]:
# Number of data points: shape of the training frame as (rows, columns)
df_train.shape

In [None]:
# Names of the columns available in the training frame
df_train.columns

In [None]:
# Class balance: number of rows for each value of the is_duplicate label
df_train['is_duplicate'].value_counts()

In [None]:
# Number of missing (NaN) values in each column
df_train.isna().sum()

In [None]:
# Summary of the frame: column dtypes and non-null counts
df_train.info()

In [None]:
# The mean of the 0/1 label is the fraction of duplicate pairs; show it as a
# percentage rounded to two decimals.
duplicate_pct = round(df_train['is_duplicate'].mean() * 100, 2)
print(f'{duplicate_pct}% of duplicate pairs of question')

## Exploratory Data Analysis

In [None]:
# Bar chart of the number of rows (counted through the id column) per label.
rows_per_label = df_train.groupby('is_duplicate')['id'].count()
rows_per_label.plot(kind='bar')


### references
- https://datatofish.com/convert-pandas-dataframe-to-list/
- https://queirozf.com/entries/pandas-dataframe-examples-duplicated-data

In [None]:
# How often each qid1 value occurs (most frequent first)
df_train['qid1'].value_counts()

In [None]:
# How often each qid2 value occurs (most frequent first)
df_train['qid2'].value_counts()

### Observations
- Some qids appear more than once, which means that some questions occur in several pairs

In [None]:
# Stack both qid columns so that every occurrence of a question is one entry.
qids = pd.concat([df_train['qid1'], df_train['qid2']], ignore_index=True)
appearances = qids.value_counts()

total_qstns = qids.shape[0]                # every occurrence, repeats included
unique_qstns = len(qids.unique())          # distinct question ids
repeated_qstns = (appearances > 1).sum()   # ids that occur more than once

print('Total number of questions ',total_qstns)
print('Total number of uniques questions ',unique_qstns)
print('Total number of repeated questions',repeated_qstns)

In [None]:
# Compare the number of distinct questions with the number of questions that
# occur more than once.
x = ['unique questions', 'repeated questions']
y = [unique_qstns, repeated_qstns]
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while older releases accept both spellings.
sns.barplot(x=x, y=y)

In [None]:
# How many times are the most repeated questions asked? (ten highest occurrence counts)
qids.value_counts().iloc[:10]

In [None]:
# The per-question counts span several orders of magnitude, so the y axis uses
# a log scale. nonpositive='clip' clips empty bins (count 0, whose log is
# undefined) to a very small positive value so they can still be drawn.
# This keyword used to be spelled `nonposy`; matplotlib renamed it to
# `nonpositive` in 3.3 and later removed the old spelling.

plt.figure(figsize=(12, 8))
plt.hist(qids.value_counts(), bins=120)
plt.yscale('log', nonpositive='clip')
plt.title('Log-Histogram of question appearance counts')
plt.xlabel('Number of occurences of question')
plt.ylabel('Number of questions')

### Observations
- As the plot above shows, a few questions are repeated very often: the most frequent ones appear 157, 120 and 111 times
  (see the `value_counts()` output above the plot for the exact numbers)

In [None]:
# Number of missing (NaN) values in each column
df_train.isna().sum()

In [None]:
# Find the rows that have a NaN in at least one column.
# axis is passed by keyword because pandas 2.x no longer accepts it positionally.
df_train[df_train.isna().any(axis=1)]

### Observations
- There are 3 rows that contain a NaN value. We can either:
    - delete those rows, or
    - fill the missing values with an empty string

Since only 3 rows are affected, we will drop them

### references
- https://stackoverflow.com/questions/13851535/how-to-delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

In [None]:
# Drop every row that has a NaN in any column and rebind the cleaned frame
# to the same name.
df_train = df_train.dropna(axis=0, how='any')

In [None]:
# Re-run the NaN filter after the drop; the result should now be empty.
# axis is passed by keyword because pandas 2.x no longer accepts it positionally.
df_train[df_train.isna().any(axis=1)]

In [None]:
# Per-column NaN counts after the drop
df_train.isna().sum()

In [None]:
# Shape of the training frame after dropping the rows with NaN values
df_train.shape

In [None]:
## Checking whether any (qid1, qid2) pair occurs more than once
# Grouping by the pair leaves one row per distinct pair, so the difference to
# the total row count is the number of repeated pairs.
pair_counts = df_train[['qid1', 'qid2', 'is_duplicate']].groupby(['qid1', 'qid2']).count()
dup = pair_counts.reset_index()
len(df_train) - len(dup)

### Naive Submission

In [None]:
from sklearn.metrics import log_loss

# Baseline: predict the training-set duplicate rate for every single pair.
p = df_train['is_duplicate'].mean() # Our predicted probability
constant_pred = np.full(len(df_train), p)
print('Predicted score:', log_loss(df_train['is_duplicate'], constant_pred))

# zf = zipfile.ZipFile('/kaggle/input/quora-question-pairs/test.csv.zip')
# df_test = pd.read_csv(zf.open('test.csv'))

# Write the same constant probability for every test id to the submission file.
df_test = pd.read_csv('/kaggle/input/quora-question-pairs/test.csv')
sub = df_test[['test_id']].assign(is_duplicate=p)
sub.to_csv('naive_submission.csv', index=False)
sub.head()


In [None]:
# Read the submission file back from disk and check its dimensions.
# NOTE(review): `df` holds the submission here; the same name is reassigned to
# a copy of df_train in the feature-engineering section below.
df = pd.read_csv('./naive_submission.csv')
df.shape

## Feature Engineering

### Lets create a few new Features
- **freq_qid1, freq_qid2** -> Number of times each qid1 / qid2 value occurs in its column
- **qlen1, qlen2** -> Length of each question in characters

- **q1_words, q2_words** -> Number of space-separated words in each question

In [None]:
# Work on a copy so that the new feature columns are added to df
# and df_train itself stays unchanged.
df = df_train.copy()
df.head()

### References
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transform.html
- https://pbpython.com/pandas_transform.html

In [None]:
# Frequency features: for every row, how many rows share the same qid1
# (respectively qid2). Each id is looked up in the column's value counts.
df['freq_qid1'] = df['qid1'].map(df['qid1'].value_counts())
df['freq_qid2'] = df['qid2'].map(df['qid2'].value_counts())
df.head()

In [None]:
# Length of each question in characters, stored as a new feature.
df['qlen1'] = df['question1'].map(len)
df['qlen2'] = df['question2'].map(len)
df.head()

In [None]:
# Number of words per question, where a word is anything between single spaces.
def _space_separated_count(text):
    return len(text.split(' '))

df['q1_words'] = df['question1'].map(_space_separated_count)
df['q2_words'] = df['question2'].map(_space_separated_count)
df.head()

### References
- https://stackoverflow.com/questions/11938964/how-to-find-common-words-and-print-them-using-python-command/12136296

    - common = set(document_1_words).intersection( set(document_2_words) )
    - unique = set(document_1_words).symmetric_difference( set(document_2_words) )
    
    
- **common_words** -> number of distinct words that occur in both question1 and question2
- **total_words** -> number of distinct words in question1 plus number of distinct words in question2 (a shared word is counted once per question)
- **share_words** -> common_words divided by total_words

In [None]:
# common words to both qstn1 and qstn2 
def word_common(row):
    """Count the distinct words that appear in both questions of a row.

    Each question is split on single spaces; every piece is lower-cased and
    stripped of surrounding whitespace before the two sets are intersected.

    row: mapping-like object with 'question1' and 'question2' string entries.
    Returns the size of the intersection as an int.
    """
    q1_tokens = {token.lower().strip() for token in row['question1'].split(" ")}
    q2_tokens = {token.lower().strip() for token in row['question2'].split(" ")}
    return len(q1_tokens & q2_tokens)
# Apply word_common row by row (axis=1) and store the result as a new column.
df['common_words'] = df.apply(word_common, axis=1)

df.head()

In [None]:
# Total words of both qstn1 and qstn2 
def word_total(row):
    """Add up the number of distinct words of question1 and of question2.

    Each question is split on single spaces; every piece is lower-cased and
    stripped of surrounding whitespace. The two set sizes are summed, so a
    word present in both questions contributes twice.

    row: mapping-like object with 'question1' and 'question2' string entries.
    Returns the summed set sizes as an int.
    """
    distinct_per_question = [
        {token.lower().strip() for token in row[column].split(" ")}
        for column in ('question1', 'question2')
    ]
    return sum(len(tokens) for tokens in distinct_per_question)
# Apply word_total row by row (axis=1) and store the result as a new column.
df['total_words'] = df.apply(word_total, axis=1)

df.head()

In [None]:
# Share of words that both questions have in common: common_words divided by
# total_words. It gives an idea of how similar the two questions may be; a
# higher value means the wordings of the two questions are more alike.

df['share_words'] = df['common_words'].div(df['total_words'])
df.head()

### Analysis from the extracted features

In [None]:
# Shortest question (in characters) in each of the two question columns.
shortest_q1 = df['qlen1'].min()
shortest_q2 = df['qlen2'].min()

print('minimum length of the qstn1 is ', shortest_q1)
print('minimum length of the qstn2 is ', shortest_q2)

In [None]:
# question1 entries that are exactly one character long: how many, and which.
one_char_q1 = df[df['qlen1'] == 1]
print(len(one_char_q1))
one_char_q1['question1']

In [None]:
# question2 entries that are exactly one character long: how many, and which.
one_char_q2 = df[df['qlen2'] == 1]
print(len(one_char_q2))
one_char_q2['question2']

In [None]:
# Total number of rows, for comparison with the counts above
df.shape[0]

### Observation
- There are 19 entries in question1 that consist of only 1 character
- There are 2 entries in question2 that consist of only 1 character

These 21 rows are a minuscule part (about 0.005%) of the training data, so we can drop them

In [None]:
# Smallest word count found in question1
min(df['q1_words'])

In [None]:
# question1 entries made of a single word: how many there are and how often
# each distinct one occurs. The filter is evaluated once and reused; the bare
# expression that used to sit between the two prints was never displayed
# (only the last expression of a cell is rendered), so it has been removed.
one_word_q1 = df.loc[df['q1_words'] == 1, 'question1']
print(one_word_q1.shape[0])
print(one_word_q1.value_counts())

In [None]:
# Smallest word count found in question2
min(df['q2_words'])

In [None]:
# question2 entries made of a single word: how many there are and how often
# each distinct one occurs. The filter is evaluated once and reused; the bare
# expression that used to sit between the two prints was never displayed
# (only the last expression of a cell is rendered), so it has been removed.
one_word_q2 = df.loc[df['q2_words'] == 1, 'question2']
print(one_word_q2.shape[0])
print(one_word_q2.value_counts())

In [None]:
# Rows whose question2 is exactly the string 'Spam'
df[df['question2']=='Spam']

In [None]:
# Rows whose question2 is exactly the string 'deleted'
df[df['question2']=='deleted']

### Observations
- There are 66 one-word entries in question1 and 22 one-word entries in question2
- Many of them are keywords such as deleted/delete, spam and lol, or just dots, which convey nothing at all

In [None]:
# First few values of the share_words feature
df['share_words'].head()

##### Can share_words help to separate duplicate from non-duplicate question pairs?

In [None]:
# Overlay the share_words distribution of both classes on one figure:
# duplicates in green, non-duplicates in blue.
plt.figure(figsize=(12,8))
for flag, legend_label, colour in ((1.0, "is_duplicate", 'green'), (0.0, "not_duplicate", 'blue')):
    sns.distplot(df[df['is_duplicate'] == flag]['share_words'], label=legend_label, color=colour)
plt.legend()
plt.show()

### Observation
- green shows the duplicate pairs: a higher share_words value suggests the pair may be a duplicate
- blue shows the non-duplicate pairs: a lower share_words value suggests the pair may not be a duplicate

Since the two distributions overlap a lot, share_words does not separate the classes cleanly, as the graph above shows

In [None]:
# Violin plot of share_words for each value of the is_duplicate label.
plt.figure(figsize=(12,8))
x = df['is_duplicate']
y = df['share_words']
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while older releases accept both spellings.
sns.violinplot(x=x, y=y, hue=df['is_duplicate'])
plt.show()

##### Can common_words help to separate duplicate from non-duplicate question pairs?

In [None]:
# Overlay the common_words distribution of both classes on one figure:
# duplicates in green, non-duplicates in blue.
plt.figure(figsize=(12,8))
for flag, legend_label, colour in ((1.0, "is_duplicate", 'green'), (0.0, "not_duplicate", 'blue')):
    sns.distplot(df[df['is_duplicate'] == flag]['common_words'], label=legend_label, color=colour)
plt.legend()
plt.show()

#### Observation:
- The two distributions overlap too much for common_words alone to separate the classes

## Text Preprocessing

- Removing HTML Tags
- Removing Punctuations
- Removing Numbers
- Performing Stemming
- Removing Stop words etc

In [None]:
import re

#### Code to remove URL links from text

In [None]:
# https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/40823105#40823105
def remove_URL(text):
    """Return ``text`` with every URL-like token deleted.

    A token is treated as a URL when it starts with "http"; it is removed up
    to the next whitespace character and replaced by nothing.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

In [None]:
# Strip URLs from every question1 entry.
df['question1'] = df['question1'].apply(remove_URL)
df.head()

In [None]:
# Strip URLs from every question2 entry.
df['question2'] = df['question2'].apply(remove_URL)
df.head()

In [None]:
from bs4 import BeautifulSoup

#### Code to remove tags using beautifulSoup

In [None]:
# https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
def getText(x):
    """Parse ``x`` as markup with BeautifulSoup's lxml parser and return only its text content."""
    return BeautifulSoup(x, 'lxml').get_text()

In [None]:
# Remove markup tags from every question1 entry.
df['question1'] = df['question1'].apply(getText)
df.head()

In [None]:
# Remove markup tags from every question2 entry.
df['question2'] = df['question2'].apply(getText)
df.head()

#### Expanding English language contractions in Python

In [None]:
# Install the third-party `contractions` package that the next cell imports.
# NOTE(review): no version is pinned; consider `%pip install` with a pinned
# version so the install targets the running kernel's environment.
! pip install contractions

In [None]:
import contractions
# Try the library on a few sample contractions to see what it expands them to.
for sample in ("you've", "he's", "'ll"):
    print(contractions.fix(sample))

In [None]:
#https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` using regex rules.

    The irregular forms "won't" and "can't" are handled first, then the
    generic suffix rules are applied in order. Note that "'s" is always
    expanded to " is", so possessives are rewritten as well.

    phrase: input string.
    Returns the string with the contractions expanded.
    """
    # specific: irregular forms. The first letter is captured and re-emitted so
    # that capitalised "Won't" / "Can't" are expanded here too, instead of
    # falling through to the generic n't rule and becoming "Wo not" / "Ca not".
    phrase = re.sub(r"([Ww])on\'t", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)

    # general: n't must run before the bare 't rule further down
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [None]:
# removing special characters
def remove_spl(x):
    """Remove special characters from ``x`` while keeping words separated.

    Every run of characters other than ASCII letters and digits (spaces
    included) is replaced by a single space, and leading/trailing spaces are
    trimmed. Replacing the runs with an empty string instead would also delete
    the spaces and glue all words of a question into one token.

    x: input string.
    Returns the cleaned string.
    """
    x = re.sub('[^A-Za-z0-9]+', ' ', x)
    return x.strip()

### References:
- https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb/notebook