In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

from scipy.sparse import coo_matrix

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import nltk
from nltk.corpus import stopwords

##### Additional packages / files to be downloaded 

In [3]:
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergigomezpalleja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
q = pd.read_csv('../input/questions.csv')
ans = pd.read_csv('../input/answers.csv')

In [26]:
qa = ans.merge(q, left_on = 'answers_question_id', right_on = 'questions_id',
                        how = 'outer')

### Processing text

#### Cleaning text

In [4]:
def process_text(df, col):
    df[col] = df[col].str.replace('[^\w\s]','') # replacing punctuations
    df[col] = df[col].str.replace('-',' ') # replacing dashes
    df[col] = df[col].str.replace('\d+','') # replacing digits
    df[col] = df[col].str.lower().str.split() # convert all str to lowercase    
    df[col] = df[col].apply(lambda x: [item for item in x if item not in stop]) # remove stopwords    
    df[col] = df[col].apply(' '.join) # convert list to str
    return df

In [7]:
q['questions_title_orig'] = q['questions_title'] 
q['questions_body_orig'] = q['questions_body'] 
q = process_text(q, "questions_title") 
q = process_text(q, "questions_body") 

#### Create indexing

In [8]:
q_id_to_idx = {q_id: i for i, q_id in zip(q.index, q.questions_id)}

#### Text to numerical features

In [9]:
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1,2),
                     min_df=3,
                     max_df=0.9,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(q['questions_body'])
print("Shape of TFiDF matrix: ", tfidf_matrix.shape)

Shape of TFiDF matrix:  (23931, 27875)


### Similarity between questions

In [10]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
q.questions_title_orig.values[0:5]

array(['Teacher   career   question',
       'I want to become an army officer. What can I do to become an army officer?',
       'Will going abroad for your first job increase your chances for jobs back home?',
       'To become a specialist in business  management, will I have to network myself?',
       'Are there any scholarships out there for students that are first generation and live in GA?'],
      dtype=object)

In [12]:
# Question title to find its similar questions
title_original ='I want to become an army officer. What can I do to become an army officer?'

In [13]:
# From title to question ID
q_id = q[q.questions_title_orig == title_original].questions_id.values
# From question ID to Index
idx = q_id_to_idx[q_id[0]]

In [14]:
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:31]

In [15]:
q_indices = [i[0] for i in sim_scores]

In [16]:
print(q.iloc[q_indices].questions_title_orig.values[0:5])

['how do i get in the U.S Marines?' 'What do you do in the army?'
 'what to do after tenth' 'How do I become an army officer?'
 'can  i go in the army']
