## Dataset

In [None]:
# Mount Google Drive at /content/gdrive so the dataset CSVs stored there can be read.
from google.colab import drive 
drive.mount('/content/gdrive')
# Paths of the two CSV files; later cells open them through these names.
questions = '/content/gdrive/MyDrive/StackOverflow_data/Questions.csv'
answers = '/content/gdrive/MyDrive/StackOverflow_data/Answers.csv'

""" 
Link to download dataset:
questions = "https://www.kaggle.com/datasets/stackoverflow/stacksample?select=Questions.csv"
answers = "https://www.kaggle.com/datasets/stackoverflow/stacksample?select=Answers.csv"
"""

## Importing libraries

In [None]:
import csv	# DONOT use pandas here as it loads all of data into RAM at once.
import re
import time
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

## Data type

In [None]:
# Read the header row of Questions.csv, then show the Python type of each column name.
quesHeader = []
with open(questions, encoding="latin1") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    quesHeader = next(readCSV)  # first row of the file holds the column names
print(quesHeader)

for column_name in quesHeader:
    print(type(column_name))

['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title', 'Body']
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
# Read the header row of Answers.csv, then show the Python type of each column name.
ansHeader = []
with open(answers, encoding="latin1") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    ansHeader = next(readCSV)  # first row of the file holds the column names
print(ansHeader)

for column_name in ansHeader:
    print(type(column_name))

['Id', 'OwnerUserId', 'CreationDate', 'ParentId', 'Score', 'Body']
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


## Data dimension

In [None]:
# Count the data rows and columns in Questions.csv (streamed, never fully loaded).
cnt = 0
# Pre-bind `row` so the final print cannot raise NameError (or silently reuse a
# stale `row` left over from another cell) when the file has no data rows.
row = []
with open(questions, encoding="latin1") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)  # skip the header row
    for row in readCSV:
        cnt += 1  # number of data rows seen so far
# The column count is taken from the last row that was read.
print(cnt, len(row))

1264216 7


In [None]:
# Count the data rows and columns in Answers.csv (streamed, never fully loaded).
cnt = 0
# Pre-bind `row` so the final print cannot raise NameError (or silently reuse a
# stale `row` left over from another cell) when the file has no data rows.
row = []
with open(answers, encoding="latin1") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)  # skip the header row
    for row in readCSV:
        cnt += 1  # number of data rows seen so far
# The column count is taken from the last row that was read.
print(cnt, len(row))

2014516 6


## Data Cleaning

In [None]:
def remove_stopwords(stop_words, tokens):
    """Return the tokens that are not contained in stop_words.

    Args:
        stop_words: container supporting ``in`` (list, set, ...).
        tokens: iterable of tokens to filter.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    return [word for word in tokens if word not in stop_words]

# Ordered (pattern, replacement) rules applied by process_text.
# Every pattern is a raw string: the non-raw '\W' and '\s+' used previously are
# invalid string escapes, which newer Python versions report as a SyntaxWarning.
# Order matters: specific contractions ("won't", "can't") must be rewritten
# before the generic "n't" rule, and punctuation is stripped only afterwards.
_TEXT_RULES = (
    (r'http\S+', ' '),               # URLs
    (r'#+', ' '),                    # runs of '#'
    (r'@[A-Za-z0-9]+', ' '),         # @mentions
    (r"([A-Za-z]+)'s", r"\1 is"),    # "it's" -> "it is"
    (r"'ve", " have "),
    (r"won't", "will not "),
    (r"isn't", "is not "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
    (r'\W', ' '),                    # any remaining non-word character
    (r'\d+', ' '),                   # digit runs
    (r'\s+', ' '),                   # collapse whitespace
)


def process_text(text):
    """Normalize free text for embedding or matching.

    Drops non-ASCII characters, lower-cases the text, removes URLs, '#' runs
    and @mentions, expands common English contractions, replaces punctuation
    and digits with spaces, and collapses whitespace.

    Args:
        text: input string.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    for pattern, replacement in _TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text.strip()

"""
# If we use title & body combine to embed then lemmatize will be useful
def lemmatize(text):
    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    lemma_set = ()
    for word in tokens:
      lemma_set.append(lemmatizer.lemmatize(word, pos = 'a'))
    return lemma_set
"""

# Lazily-built cache of the English stop words. Loading the NLTK list on every
# call is wasteful, and a frozenset gives constant-time membership checks.
_ENGLISH_STOPWORDS = None


def process_all(text):
    """Clean text with process_text, then drop English stop words.

    Args:
        text: input string.

    Returns:
        The cleaned text with stop words removed, tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    text = process_text(text)
    return ' '.join(remove_stopwords(_ENGLISH_STOPWORDS, text.split()))

## Vectorizing text

In [None]:
# Embed question titles with the Universal Sentence Encoder (USE) for semantic search.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
Data = {}  # question Id (str, as read from the CSV) -> {"title", "title_vector", "body"}
count = 0
limit = 100000  # read only the first 100K rows
loading = 10  # countdown printed after every 10K processed rows
print("Processing......" + str(loading))

with open(questions, encoding="latin1") as csvfile:
  readCSV = csv.reader(csvfile, delimiter=',')
  next(readCSV, None)  # skip the header row
  for count, row in enumerate(readCSV, start=1):
    doc_id = row[0]  # Col 0: Id of the question
    title = row[5]  # Col 5: Title of the question
    # embed() is called with a one-title batch; entry 0 is that title's vector.
    # .numpy() reads the values directly instead of round-tripping through a
    # TensorProto (tf.make_tensor_proto + tf.make_ndarray).
    vec = embed([title]).numpy()[0].tolist()
    Data[doc_id] = {
        "title": title,
        "title_vector": vec,
        "body": row[6],  # Col 6: Body of the question
    }
    if count % 10000 == 0:
      loading -= 1
      print("Processing......" + str(loading))

    if count == limit:
      break

Processing......10
Processing......9
Processing......8
Processing......7
Processing......6
Processing......5
Processing......4
Processing......3
Processing......2
Processing......1
Processing......0


## Sample output

In [None]:
# Note: the keys of Data are the question Ids read from the CSV, so they are not 0-indexed.
# Show one stored record, then its title vector, each preceded by its Python type.
for shown in (Data['650'], Data['650']['title_vector']):
  print(type(shown))
  print(shown)

""" Print all QuestionId's
for key in Data:
  print(key)
"""

<class 'dict'>
{'title': 'Automatically update version number', 'title_vector': [-0.03705509752035141, 0.003920395392924547, -0.07614196836948395, 0.009932375513017178, -0.03474748879671097, 0.07547452300786972, 0.0456756055355072, -0.0018333467887714505, 0.048031385987997055, -0.021722355857491493, -0.05907668173313141, -0.01282587181776762, 0.04410664364695549, 0.014755098149180412, 0.06392039358615875, 0.08911824971437454, -0.048548370599746704, -0.015545995905995369, 0.002292304765433073, 0.007501461077481508, -0.062275249511003494, -0.055971305817365646, 0.016821833327412605, 0.020593838766217232, 0.0422888845205307, 0.03527801111340523, 0.05853908509016037, 0.049767326563596725, 0.020873934030532837, 0.04388683661818504, 0.024513913318514824, -0.02000460959970951, -0.023687299340963364, -0.056667983531951904, -0.0501432791352272, 0.052422355860471725, -0.028463320806622505, -0.057596009224653244, 0.010912954807281494, -0.002899251179769635, -0.003816876094788313, -0.0185493696480

" Print all QuestionId's\nfor key in Data:\n  print(key)\n"

## Model fitting

In [None]:
# Ask for vector X (to find simliar vectors to it) & return this no. of top_result
from heapq import heappop, heappush, heapify
from numpy.linalg import norm

def cosine_sim(X, top_result, data=None, exclude_key=None):
  """Return the entries whose title vector is most cosine-similar to X.

  Args:
    X: query vector (sequence of floats).
    top_result: maximum number of matches to return.
    data: mapping of key -> dict holding a 'title_vector' entry. Defaults to
      the notebook-level ``Data`` dictionary.
    exclude_key: optional key to leave out of the ranking, e.g. when X is the
      vector of an entry that is itself stored in ``data``.

  Returns:
    A list of at most ``top_result`` tuples ``(cosine, key)`` sorted from most
    to least similar; ties are ordered by ascending key.
  """
  # Local import: the surrounding cell only imports heappop/heappush/heapify.
  from heapq import nsmallest

  if data is None:
    data = Data

  query = np.asarray(X, dtype=float)
  query_norm = norm(query)  # loop-invariant, computed once

  def scored():
    for key, entry in data.items():
      if key == exclude_key:
        continue
      vec = np.asarray(entry['title_vector'], dtype=float)
      denom = query_norm * norm(vec)
      # A zero vector has no direction; score it 0 instead of dividing by zero.
      cosine = float(np.dot(query, vec) / denom) if denom else 0.0
      # Negated so that the smallest tuple is the most similar entry.
      yield (-cosine, key)

  # A heap list is only partially ordered, so reading heap[0..k-1] does not
  # give the k best items; nsmallest returns them fully sorted and tolerates
  # top_result being larger than the corpus. The best match is no longer
  # discarded unconditionally: pass exclude_key to drop a known self-match.
  return [(-neg_cosine, key) for neg_cosine, key in nsmallest(top_result, scored())]

## Output

In [None]:
# Embed the user's query and list the titles of the 10 most similar questions.
query = input("Enter query: ")
# embed() is called with a one-item batch; entry 0 is the query's vector.
# .numpy() reads the values directly instead of round-tripping through a TensorProto.
vector = embed([query]).numpy()[0].tolist()
result = cosine_sim(vector, 10)

print("Relevancy : \n")
# Ids of the matched questions, best match first; a later cell uses them to look up answers.
ansId = [question_id for _score, question_id in result]
for question_id in ansId:
  print("Q. " + Data[question_id]['title'])

Enter query: read file in python
Relevancy : 

Q. Read and overwrite a file in Python
Q. Read file object as string in python
Q. Python: Read a file (from an external server)
Q. Reading bytestreams in Python
Q. reading .bash_history file through python script
Q. Writing Strings to files in python
Q. edit text file using Python
Q. Python, find a file in the same directory
Q. Reading file using python and and see if a particular string is there inthe file
Q. Declare function at end of file in Python


In [None]:
def remove_html_tags(text):
    """Return text with every shortest '<...>' span deleted.

    The pattern is non-greedy and '.' does not match newlines, so a tag that
    spans several lines is left in place.
    """
    return re.sub('<.*?>', '', text)

In [None]:
# Answers.csv columns: ['Id', 'OwnerUserId', 'CreationDate', 'ParentId', 'Score', 'Body']
top = 10  # maximum number of answers to collect
sol = []
# Map each matched question Id (as int) back to the string kept in ansId. This
# replaces a linear scan with repeated int() conversion for every CSV row, and
# avoids a loop variable named `id`, which shadowed the builtin for later cells.
wanted = {int(question_id): question_id for question_id in ansId}
with open(answers, encoding="latin1") as csvfile:
  readCSV = csv.reader(csvfile, delimiter=',')
  next(readCSV, None)  # skip the header row
  for row in readCSV:
    if top == 0:
      break
    question_id = wanted.get(int(row[3]))  # Col 3: ParentId (Id of the question answered)
    if question_id is None:
      continue  # answer belongs to a question that was not matched
    ans = remove_html_tags(row[5])  # Col 5: Body of the answer
    print(question_id, ": ", ans)
    sol.append(ans)
    top -= 1

In [None]:
# Persist the query together with the collected answers as the text form of a dict.
dictionary = {'question': query, 'solution': sol}

try:
    # `with` closes the file even if write() fails. An explicit UTF-8 encoding
    # keeps the write independent of the platform's default encoding.
    with open('ansFile.txt', 'wt', encoding='utf-8') as ansFile:
        ansFile.write(str(dictionary))
except OSError as err:
    # Catch only I/O failures (a bare except would also hide programming
    # errors and KeyboardInterrupt) and report the reason.
    print("Unable to write to file")
    print(err)