Import Libraries
--

In [1]:
import json, csv
import numpy as np
import pandas as pd
import re
import warnings

warnings.filterwarnings('ignore')

print("Libraries Imported")

Libraries Imported


In [0]:
def getText(doc):
  """Pull the Open Graph title and description out of a raw HTML string.

  The input is stringified, lower-cased, stripped and whitespace-collapsed
  before matching, so the returned text is always lower-case.

  doc: any object; it is converted with ``str()`` first.

  Returns "<og:title content> <og:description content>" when both meta tags
  are found, otherwise the sentinel string "No match".
  """
  html = str(doc).lower().strip()
  html = re.sub('\n', ' ', html)
  html = re.sub(r'\s+', ' ', html)

  title = re.search(r'<meta property=\"og:title\" content=\"(.*?)\"/>', html)
  description = re.search(r'<meta property=\"og:description\" content=\"(.*?)\"/>', html)

  # Both tags are required; a lone title or description counts as a miss.
  if title is None or description is None:
    return "No match"

  return str(title.group(1)) + ' ' + str(description.group(1))

Preparing Datasets
--

In [0]:
# Load the raw evaluation records.
data = pd.read_json('eval_cleaned.json')

for idx, record in data.iterrows():
  raw_html = record['original_text']
  # Rows that do NOT start with the stray '"/><meta property="' prefix are
  # run through getText; a successful extraction overwrites `text`.
  if re.match(r"^\"/><meta property=\"", raw_html) is None:
    extracted = getText(raw_html)
    if extracted != "No match":
      data.at[idx, 'text'] = extracted
  # `nequ` records how many "\r\n"-separated parts the `equations` field has.
  data.at[idx, 'nequ'] = len(record['equations'].split("\r\n"))

In [0]:
# Keep only the records whose `equations` field has exactly two
# "\r\n"-separated parts. `.copy()` makes the result an independent frame so
# the column assignments below do not write into a slice of the original.
# Note: this cell reads `nequ` and drops it at the end, so it cannot be
# re-run on its own output.
data = data.loc[data['nequ'] == 2].copy()

# Split once on the first "\r\n": the first part goes to `unknowns`, the rest
# to `equations`. `n` is passed by keyword because pandas 2.x only accepts
# `pat` positionally.
data[["unknowns","equations"]] = data["equations"].str.split("\r\n", n=1, expand=True)

# Drop the first 6 / 5 characters of each part.
# NOTE(review): presumably these are fixed label prefixes on the raw
# lines — confirm against eval_cleaned.json.
data["unknowns"] = data["unknowns"].str[6:]

data["equations"] = data["equations"].str[5:]

data = data[["text","ans","equations","unknowns"]]

In [5]:
# Discard rows whose `equations` value is missing, then print the frame summary.
data = data.dropna(subset=["equations"])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6625 entries, 19 to 14722
Data columns (total 4 columns):
text         6625 non-null object
ans          6625 non-null object
equations    6625 non-null object
unknowns     6625 non-null object
dtypes: object(4)
memory usage: 258.8+ KB


Data Cleaning
--

In [6]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

import spacy

# import string

import re

# NOTE(review): "en" is a shortcut model name; spaCy v3+ no longer accepts
# shortcuts and expects a full package name such as "en_core_web_sm" —
# confirm against the installed spaCy version.
nlp = spacy.load("en")

nltk_stopwords = set(stopwords.words('english'))

spacy_stopwords = nlp.Defaults.stop_words

# Union of the NLTK and spaCy English stop-word lists (a new set, so the
# library defaults are not mutated by the update below).
stopset = nltk_stopwords.union(spacy_stopwords)

# Take these words back out of the stop set so cleanData keeps them: the
# article "a", comparison words, and number / ordinal words.
stopset.difference_update([
  "a","more","less","than",
  "one","two","three","four","five","six","seven","eight","nine","ten",
  "eleven","twelve","fifteen","twenty","forty","sixty","fifty","hundred",
  "once","first","second","third",
])

# Characters deleted from the text by cleanData. The backslash is written as
# "\\" because "\_" is not a valid escape sequence; the resulting string is
# unchanged (it still contains a backslash followed by an underscore).
punctuation = "!\"#$&',;?@\\_`{|}~"

def getText(doc):
  """Extract "<og:title> <og:description>" from an HTML document.

  NOTE: this cell re-declares ``getText``; it behaves the same as the
  definition in the earlier cell of this notebook.

  doc: any object; converted with ``str()``, lower-cased, stripped and
       whitespace-collapsed before the meta tags are searched.

  Returns the two ``content`` values joined by a single space, or the
  sentinel "No match" unless both tags are present.
  """
  normalised = re.sub(r'\s+', ' ', re.sub('\n', ' ', str(doc).lower().strip()))
  patterns = (
    r'<meta property=\"og:title\" content=\"(.*?)\"/>',
    r'<meta property=\"og:description\" content=\"(.*?)\"/>',
  )
  hits = [re.search(pattern, normalised) for pattern in patterns]

  if all(hit is not None for hit in hits):
    text = ' '.join(str(hit.group(1)) for hit in hits)
  else:
    text = "No match"

  return text


def cleanData(doc):
  """Normalise a raw question string before vectorisation.

  Steps, in order:
    1. stringify, lower-case, strip and collapse whitespace;
    2. cut everything from the first '"/><meta ' marker onwards;
    3. remove the phrase 'yahoo answers';
    4. split on whitespace and drop tokens that are in ``stopset``;
    5. drop tokens containing a decimal number and delete '.' from the rest;
    6. delete every character listed in ``punctuation``;
    7. attempt a latin-1 -> utf-8 -> ascii round trip and pass the result
       through ``remove_whitespace``.

  doc: any object; converted with ``str()``.

  Returns the cleaned text with surrounding whitespace stripped. When the
  round trip in step 7 succeeds the value comes from
  ``.encode('ascii', 'ignore')`` and is therefore ``bytes`` (the calling
  cell decodes it); when the text cannot be encoded as latin-1 the value
  is the ``str`` from step 6.
  """
  doc = str(doc).lower().strip()
  doc = re.sub('\n', ' ', doc)
  doc = re.sub(r'\s+', ' ', doc)

  # Truncate at the first stray meta-tag marker, if there is one.
  marker = doc.find('"/><meta ')
  if marker != -1:
    doc = doc[:marker]
  doc = doc.replace('yahoo answers', '')

  tokens = [token for token in doc.split() if token not in stopset]
  # NOTE(review): tokens that contain a decimal number (e.g. "3.5") are
  # removed entirely here, not just left un-stripped — confirm this is
  # intended for numeric word problems.
  tokens = [token.replace('.', '') for token in tokens if not re.search(r"[0-9]+\.[0-9]+", token)]
  joined = " ".join(tokens)
  final = ''.join(char for char in joined if char not in punctuation)

  try:
    final = remove_whitespace(final.encode('latin1').decode('utf-8', 'replace').encode('ascii', 'ignore'))
  except UnicodeError:
    # Text with characters outside latin-1 cannot take the round trip;
    # fall back to the un-encoded string.
    final = remove_whitespace(final)
  return final.strip()

def remove_whitespace(x):
  """
  Collapse every run of whitespace in a string to a single space and trim
  both ends.

  x: a string. A value that cannot be split and re-joined as text (for
     example ``bytes`` or ``None``) is returned unchanged.
  """
  try:
    return " ".join(x.split())
  except (AttributeError, TypeError):
    # Best effort for non-string input: hand it back as-is. Only these two
    # errors are expected, so anything else is allowed to propagate.
    return x
  
print("Functions Defined!")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Functions Defined!


In [7]:
# Run every `text` value through cleanData, row by row.
data['cleaned_text'] = data.apply(lambda x: cleanData(str(x['text'])), axis=1)

# cleanData hands back bytes when its latin-1 round trip succeeds, hence the
# decode here.
# NOTE(review): values that are already str presumably become NaN under
# .str.decode and are then removed by the dropna below (the recorded row
# count falls from 6625 to 6487) — confirm this loss is intended.
data['cleaned_text'] = data['cleaned_text'].str.decode("utf-8")

data = data.dropna(subset=["cleaned_text"])

data.info()

# Optional exports of the cleaned frame (disabled).
# data.to_csv("trainData_univariable.csv", index = False)
# data.to_csv("trainData_univariable.txt", index = False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6487 entries, 19 to 14722
Data columns (total 5 columns):
text            6487 non-null object
ans             6487 non-null object
equations       6487 non-null object
unknowns        6487 non-null object
cleaned_text    6487 non-null object
dtypes: object(5)
memory usage: 304.1+ KB


Data Modelling (Archive)
--

In [0]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition, and every match derived from it,
# is the same on each run.
SEED = 42

data = pd.read_csv('new_cleaned_data.csv')

trainData, testData = train_test_split(data, test_size = 0.2, random_state = SEED)

# Normalise the column name and renumber the rows 0..n-1. The rename returns
# a new frame rather than mutating the split result in place, and the fresh
# index lets later cells use row labels as positions into the feature arrays.
column_fix = {'cleaned text': 'cleaned_text'}
trainData = trainData.rename(columns=column_fix).reset_index(drop=True)
testData = testData.rename(columns=column_fix).reset_index(drop=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
trainData.info()

testData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1896 entries, 1302 to 771
Data columns (total 5 columns):
ans             1896 non-null object
cleaned_text    1896 non-null object
equations       1896 non-null object
text            1896 non-null object
unknowns        1895 non-null object
dtypes: object(5)
memory usage: 88.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 1928 to 1444
Data columns (total 5 columns):
ans             474 non-null object
cleaned_text    474 non-null object
equations       474 non-null object
text            474 non-null object
unknowns        471 non-null object
dtypes: object(5)
memory usage: 22.2+ KB
None


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with sublinear term frequency and L2-normalised rows.
tfidf = TfidfVectorizer(
  sublinear_tf=True,
  min_df=1,
  norm='l2',
  encoding='latin-1',
  ngram_range=(1, 2),
  stop_words=None,
)

train_corpus = trainData['cleaned_text']
test_corpus = testData['cleaned_text']

# The vocabulary is fitted on the training questions only; the test
# questions are projected onto it afterwards. Both results are dense arrays.
features = tfidf.fit(train_corpus).transform(train_corpus).toarray()
test_features = tfidf.transform(test_corpus).toarray()

# To vectorise a single ad-hoc question:
#   test = "Three times the first of three consecutive odd integers is 3 more than twice the third . What is the third integer ?"
#   test_feature = tfidf.transform([cleanData(test)]).toarray()

In [0]:
# Empty placeholders that the matching loops fill with the nearest training
# question and its equation.
testData['matchedQuestion'] = ''
testData['matchedEq'] = ''

# DataFrame.info() prints its own report and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
trainData.info()

testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1896 entries, 0 to 1895
Data columns (total 5 columns):
ans             1896 non-null object
cleaned_text    1896 non-null object
equations       1896 non-null object
text            1896 non-null object
unknowns        1895 non-null object
dtypes: object(5)
memory usage: 74.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 7 columns):
ans                474 non-null object
cleaned_text       474 non-null object
equations          474 non-null object
text               474 non-null object
unknowns           471 non-null object
matchedQuestion    474 non-null object
matchedEq          474 non-null object
dtypes: object(7)
memory usage: 26.0+ KB
None


In [0]:
from scipy import spatial

# The feature arrays are indexed by position, so they must line up with the
# frames row for row.
assert len(features) == len(trainData)
assert len(test_features) == len(testData)

# Pull the training columns out once instead of materialising a row Series
# for every (test, train) pair.
train_questions = trainData['cleaned_text'].tolist()
train_equations = trainData['equations'].tolist()

# For each test question, keep the training question with the highest cosine
# similarity. Positions index the arrays; labels index the frame.
for test_pos, test_label in enumerate(testData.index):
  best_score = 0
  best_pos = None
  for train_pos, train_vector in enumerate(features):
    sim = 1 - spatial.distance.cosine(test_features[test_pos], train_vector)
    # Strict '>' keeps the first of several equally good matches; a NaN
    # similarity never compares greater and is therefore skipped.
    if sim > best_score:
      best_score = sim
      best_pos = train_pos
  # With no similarity above 0 the placeholders are left untouched.
  if best_pos is not None:
    testData.at[test_label, 'matchedQuestion'] = train_questions[best_pos]
    testData.at[test_label, 'matchedEq'] = train_equations[best_pos]

In [0]:
# Summarise the test frame, then write it (without the index) to CSV.
testData.info()

testData.to_csv("cosineSimilarity.csv", index = False)

In [0]:
from scipy import spatial

score = 0
index = 0

def similarity(sen1, sen2):
  """Cosine similarity between two 1-D numeric vectors.

  sen1, sen2: array-likes of equal length.

  Returns dot(sen1, sen2) / (norm(sen1) * norm(sen2)). When either vector
  has zero norm the quotient is undefined, so 0.0 is returned instead of
  NaN.
  """
  denominator = np.linalg.norm(sen1) * np.linalg.norm(sen2)
  if denominator == 0:
    return 0.0
  return np.dot(sen1, sen2) / denominator

# For each test question, keep the training question with the highest
# similarity. Positions index the feature arrays; labels index the frames.
for test_pos, test_label in enumerate(testData.index):
  best_score = 0
  for train_pos, train_label in enumerate(trainData.index):
    # `similarity` already returns a similarity (not a distance), so it is
    # used directly; subtracting it from 1 would rank the least similar
    # question first. The result is stored in `sim` so that the function
    # name `similarity` is not rebound to a float inside the loop.
    sim = similarity(test_features[test_pos], features[train_pos])
    if sim > best_score:
      best_score = sim
      testData.at[test_label, 'matchedQuestion'] = trainData.at[train_label, 'cleaned_text']
      testData.at[test_label, 'matchedEq'] = trainData.at[train_label, 'equations']

In [0]:
# Summarise the test frame, then write it (without the index) to CSV.
testData.info()

testData.to_csv("generalSimilarity.csv", index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 7 columns):
ans                474 non-null object
cleaned_text       474 non-null object
equations          474 non-null object
text               474 non-null object
unknowns           471 non-null object
matchedQuestion    474 non-null object
matchedEq          474 non-null object
dtypes: object(7)
memory usage: 26.0+ KB


In [0]:
from math import *

score = 0
index = 0

def jaccard_similarity(sen1, sen2):
  """Jaccard index of the distinct elements of two iterables.

  sen1, sen2: iterables of hashable items.

  Returns len(A & B) / len(A | B) as a float, where A and B are the sets of
  distinct elements; 0.0 when both inputs are empty.

  NOTE(review): the loop in this notebook passes TF-IDF weight vectors, so
  the sets hold distinct weight values rather than tokens — confirm that
  is the intended comparison.
  """
  first, second = set(sen1), set(sen2)
  union = first | second
  if not union:
    # Two empty inputs: avoid dividing by zero.
    return 0.0
  return len(first & second) / float(len(union))

# Build the set of distinct values for every training vector once, rather
# than once per (test, train) pair.
train_sets = [set(vector) for vector in features]

# For each test question, keep the training question with the highest
# Jaccard score. Positions index the feature arrays; labels index the frames.
for test_pos, test_label in enumerate(testData.index):
  test_set = set(test_features[test_pos])
  best_score = 0
  for train_pos, train_label in enumerate(trainData.index):
    sim = jaccard_similarity(test_set, train_sets[train_pos])
    # Strict '>' keeps the first of several equally good matches.
    if sim > best_score:
      best_score = sim
      testData.at[test_label, 'matchedQuestion'] = trainData.at[train_label, 'cleaned_text']
      testData.at[test_label, 'matchedEq'] = trainData.at[train_label, 'equations']

In [0]:
# Summarise the test frame, then write it (without the index) to CSV.
testData.info()

testData.to_csv("jaccardSimilarity.csv", index = False)