In [2]:
# Mount Google Drive into the Colab filesystem so that the data paths under
# /content/drive used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
import os
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from bs4 import BeautifulSoup
from textblob import TextBlob
import re
import nltk
from nltk.tokenize import RegexpTokenizer
import inflect
from nltk.corpus import stopwords
import heapq

In [25]:
# Fetch the NLTK resources used by remove_stopwords(): the stopword corpus and
# the 'punkt' tokenizer data (needed by nltk.word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Editable install of bq_helper from GitHub. NOTE(review): the import cell above
# already does `import bq_helper`, so on a fresh runtime this cell has to run first.
!pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper 

Obtaining bq_helper from git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
  Cloning https://github.com/SohierDane/BigQuery_Helper to ./src/bq-helper
  Running command git clone -q https://github.com/SohierDane/BigQuery_Helper /content/src/bq-helper
Installing collected packages: bq-helper
  Running setup.py develop for bq-helper
Successfully installed bq-helper-0.4.1


In [None]:
## Get data from google bigquery

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/semantic-search-engine-c012790c0504.json"
# bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")
# query = "SELECT q.id, q.title, q.body, q.tags, a.body as answers, a.score FROM `bigquery-public-data.stackoverflow.posts_questions` AS q INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a ON q.id = a.parent_id LIMIT 1000000"
# df = bq_assistant.query_to_pandas(query)

In [None]:
## Save data as csv file

# df.to_csv('/content/drive/MyDrive/Data/NLP/Original_data.csv')

In [None]:
## Check data top

# df.head()

## Load data from drive and perform operations for now

In [5]:
# Raw question/answer dump previously exported from BigQuery; the first CSV
# column holds the saved row index.
ORIGINAL_CSV_PATH = '/content/drive/MyDrive/Data/NLP/Original_data.csv'
original_data = pd.read_csv(ORIGINAL_CSV_PATH, index_col=0)

In [6]:
# Number of missing values in each column
original_data.isnull().sum()

id         0
title      0
body       0
tags       2
answers    0
score      0
dtype: int64

In [7]:
# Show only the rows whose `tags` value is NaN
bool_series = original_data['tags'].isna()
original_data.loc[bool_series]

Unnamed: 0,id,title,body,tags,answers,score
159208,62595774,Scraping With CSS Returning NULL in R & Python,<p>Question removed as it is no longer valid.</p>,,<p>The table's contents are loaded dynamically...,1
902834,7873714,a bit of difference between null and space,<p>I was just reading this quote</p>\n\n<block...,,"<p>null is binary 00000000, space is 00100000....",14


In [8]:
# Title and body of the rows that have no tags
original_data.loc[bool_series, ['title', 'body']]

Unnamed: 0,title,body
159208,Scraping With CSS Returning NULL in R & Python,<p>Question removed as it is no longer valid.</p>
902834,a bit of difference between null and space,<p>I was just reading this quote</p>\n\n<block...


In [9]:
# Full question body of the row at position 902834 (one of the two untagged rows)
original_data['body'].iloc[902834]

"<p>I was just reading this quote</p>\n\n<blockquote>\n  <p>And don't tell me there isn't one bit of difference between null and\n  space, because that's exactly how much difference there is.  :-)</p>\n</blockquote>\n\n<p>by Larry wall\nCan anyone tell me what's the <code>bit</code> of difference he meant here between space and <code>null</code>?</p>"

In [10]:
# Full question body of the row at position 159208 (the other untagged row)
original_data['body'].iloc[159208]

'<p>Question removed as it is no longer valid.</p>'

In [11]:
# The four cells above show that the two rows with missing tags are not useful
# (no tags, and one question was removed), so drop every row containing a NaN.
original_data = original_data.dropna(axis=0, how='any')

In [12]:
# The data should now have no missing values in any column
original_data.isnull().sum()

id         0
title      0
body       0
tags       0
answers    0
score      0
dtype: int64

In [13]:
# (rows, columns) after dropping the rows with missing values
original_data.shape

(999998, 6)

In [14]:
# Count the rows that are exact duplicates of an earlier row (all columns equal)
original_data.duplicated().sum()

4

In [15]:
# Only 4 duplicate rows exist, so removing them is safe; the first occurrence of
# each duplicated row is kept.
original_data = original_data.drop_duplicates(keep='first')

In [16]:
# Sanity check: evaluates to False when no duplicate rows remain
original_data.duplicated().any()

False

In [17]:
# (rows, columns) after removing the duplicate rows
original_data.shape

(999994, 6)

### Preprocessing (As data is prepared)
- Tokenization of the text
- Converting the tokens to lowercase
- Removing the punctuation from the tokens list
- Removing the stopwords from the tokens list
- Tokenization of the code string

In [23]:
# Functions

# Matches any HTML/XML-style tag: '<', one or more characters other than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return `text` with every substring matched by TAG_RE deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped

def clean_text(text):
    """Normalize raw post text into lowercase words separated by single spaces.

    Steps, in order:
      1. delete opening and closing tags whose name starts with ``p``,
      2. replace bracketed numeric references such as ``[12]`` with a space,
      3. delete every character that is neither a word character nor whitespace,
      4. lowercase,
      5. replace every digit with a space,
      6. collapse whitespace runs to one space and strip both ends.

    Whitespace is collapsed last. Collapsing it before steps 3 and 5 (as this
    function used to) left runs of spaces wherever punctuation or digits had
    been removed, e.g. ``'a 12 b'`` became ``'a    b'``.

    Parameters
    ----------
    text : str
        Raw text, optionally containing ``<p>`` markup.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    text = re.sub("<p[^>]*>", "", text)
    # Also removes the closing </p...> tags left behind by the line above.
    text = re.sub("</?p[^>]*>", "", text)
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = re.sub(r'\d', ' ', text)
    # Normalize whitespace only after all replacements that can create it.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Lazily-built cache of the English stopword list, so it is fetched from NLTK
# once instead of on every call.
_ENGLISH_STOPWORDS = None

def remove_stopwords(text, stopwords = None):
    """Tokenize `text` and drop every token that is a stopword.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``nltk.word_tokenize``.
    stopwords : collection of str, optional
        Tokens to remove. When omitted (or empty) NLTK's English stopword list
        is used. Note that this parameter shadows the module-level
        ``stopwords`` import inside the function, which is why the default is
        reached through ``nltk.corpus.stopwords``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if not stopwords:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
        stopwords = _ENGLISH_STOPWORDS
    elif isinstance(stopwords, (list, tuple)):
        # Set membership is O(1) per token instead of a linear scan of the list.
        stopwords = frozenset(stopwords)

    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stopwords)

def tokenize_text(text):
    """Split `text` into a list of word tokens (maximal runs of ``\\w`` characters).

    The tokenizer method is ``tokenize``; ``RegexpTokenizer`` has no
    ``tokenize_text`` method, so the previous call raised AttributeError.
    """
    return RegexpTokenizer(r'\w+').tokenize(text)

def remove_code(text):
    '''Remove every code section and all remaining tags from an HTML fragment.

    `text` is parsed with BeautifulSoup (lxml parser), each <code> element is
    deleted together with its contents, and the tags left in the serialized
    document are stripped with remove_tags().

    `soup.code` is only the FIRST <code> element, so the loop keeps removing
    until none is left. A single `if` left the code of every later <code>
    section in the returned text.
    '''
    soup = BeautifulSoup(text, 'lxml')
    while soup.code:
        soup.code.decompose()  # drop the element and everything inside it
    return remove_tags(str(soup))

def preprocess(text):
    """Run clean_text() on `text`, then remove_stopwords() on the result."""
    cleaned = clean_text(text)
    return remove_stopwords(cleaned)

def get_sentiment(text):
    """Return the `sentiment` attribute of a TextBlob built from `text`.

    Callers in this notebook read element 0 as polarity and element 1 as
    subjectivity.
    """
    blob = TextBlob(text)
    return blob.sentiment

def normalize_col(col):
    """Mean-normalize a numeric column: ``(col - mean) / (max - min)``.

    The extremes come from the vectorized ``col.max()`` / ``col.min()``.
    The Python builtins ``max(col)`` / ``min(col)`` iterate element by element
    and return NaN when the first value is NaN (every comparison with NaN is
    False), which turned the whole result into NaN even though ``col.mean()``
    ignores missing values.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to normalize.

    Returns
    -------
    pandas.Series
        Values centred on 0 and scaled by the column range. A column whose
        values are all equal has a zero range, so the division is 0/0.
    """
    value_range = col.max() - col.min()
    return (col - col.mean()) / value_range

In [19]:
# Column names of the derived dataset, in the order they are stored.
new_cols = [
    'original_title', 'post_corpus', 'question_content', 'question_url',
    'tags', 'answer_content', 'processed_title',
    'sentiment_polarity', 'sentiment_subjectivity',
    'overall_scores', 'overall_scores_norm',
]

### New columns (to be created for better dataset)
- drop id
- title - original title
- title + body + answer - post_corpus
- title + question - question content
- https://stackoverflow.com/questions/ + id - question url
- tags
- answer - answer_content
- processed_title
- sentiment_polarity
- sentiment_subjectivity
- score - overall_scores
- overall_scores_norm

In [20]:
# Empty frame that only carries the target column names; the columns are
# filled later from the per-column lists.
content_token_df = pd.DataFrame(columns = new_cols)

In [21]:
# Column names of the source data
original_data.columns

Index(['id', 'title', 'body', 'tags', 'answers', 'score'], dtype='object')

In [22]:
# Column names of the (still empty) derived frame
content_token_df.columns

Index(['original_title', 'post_corpus', 'question_content', 'question_url',
       'tags', 'answer_content', 'processed_title', 'sentiment_polarity',
       'sentiment_subjectivity', 'overall_scores', 'overall_scores_norm'],
      dtype='object')

In [None]:
# Rough Section

# content_token_df['original_title'] = original_data['title']
# content_token_df['post_corpus'] = (original_data['title']+" "+original_data['body']+" "+original_data["answers"]).apply(lambda x: remove_code(x)).apply(lambda x:preprocess(x))
# content_token_df['question_content'] = (original_data['title']+" "+original_data['body']).apply(lambda x: remove_code(x)).apply(lambda x:preprocess(x))
# content_token_df['question_url'] = original_data['id'].apply(lambda x: 'https://stackoverflow.com/questions/'+str(x))
# content_token_df['tags'] = original_data['tags']
# content_token_df['answer_content'] = original_data['answers'].apply(lambda x: remove_code(x)).apply(lambda x:preprocess(x))
# content_token_df['processed_title'] = original_data['title'].apply(lambda x:preprocess(x))
# content_token_df['overall_scores'] = original_data['score']
# content_token_df['sentiment_polarity'] = content_token_df['answer_content'].apply(lambda x: get_sentiment(x)[0])
# content_token_df['sentiment_subjectivity'] = content_token_df['answer_content'].apply(lambda x: get_sentiment(x)[1])
# content_token_df['overall_scores_norm'] = normalize_col(content_token_df['overall_scores'])

In [29]:
# One independent accumulator list per derived column.
original_title, post_corpus, question_content, question_url = [], [], [], []
tags, answer_content, processed_title = [], [], []
sentiment_polarity, sentiment_subjectivity = [], []
overall_scores, overall_scores_norm = [], []

In [None]:
# Fill the per-column lists in a single pass over the cleaned posts.
# `total` gives tqdm the row count so it can show a percentage and an ETA.
for i, row in tqdm(original_data.iterrows(), total=len(original_data)):
    # Clean the answer and score its sentiment once per row. Previously the same
    # remove_code + preprocess pipeline ran three times and TextBlob twice on
    # identical input.
    cleaned_answer = preprocess(remove_code(row.answers))
    answer_sentiment = get_sentiment(cleaned_answer)

    original_title.append(row.title)
    post_corpus.append(preprocess(remove_code(row.title+" "+row.body+" "+row.answers)))
    question_content.append(preprocess(remove_code(row.title+" "+row.body)))
    question_url.append('https://stackoverflow.com/questions/'+str(row.id))
    tags.append(row.tags)
    answer_content.append(cleaned_answer)
    processed_title.append(preprocess(row.title))
    sentiment_polarity.append(answer_sentiment[0])       # first element: polarity
    sentiment_subjectivity.append(answer_sentiment[1])   # second element: subjectivity
    overall_scores.append(row.score)

959043it [3:16:07, 93.66it/s]

In [31]:
# Attach each accumulated list to its column of the derived frame.
column_values = {
    'original_title': original_title,
    'post_corpus': post_corpus,
    'question_content': question_content,
    'question_url': question_url,
    'tags': tags,
    'answer_content': answer_content,
    'processed_title': processed_title,
    'overall_scores': overall_scores,
    'sentiment_polarity': sentiment_polarity,
    'sentiment_subjectivity': sentiment_subjectivity,
}
for column_name, values in column_values.items():
    content_token_df[column_name] = values

In [32]:
# Normalized copy of the raw answer scores, computed with normalize_col().
raw_scores = content_token_df['overall_scores']
content_token_df['overall_scores_norm'] = normalize_col(raw_scores)

In [33]:
# (rows, columns) of the derived frame
content_token_df.shape

(999994, 11)

In [34]:
# saved preprocessed data as csv for backup

# content_token_df.to_csv(path_or_buf='/content/drive/MyDrive/Data/NLP/preprocessed_data.csv', sep=',', )

In [3]:
# Reload the preprocessed backup from Drive; the first CSV column holds the saved row index.
PREPROCESSED_CSV_PATH = '/content/drive/MyDrive/Data/NLP/preprocessed_data.csv'
preprocessed_data = pd.read_csv(PREPROCESSED_CSV_PATH, index_col=0)

In [4]:
# Drop every row that has a missing value in any column.
preprocessed_data = preprocessed_data.dropna(axis=0, how='any')

In [5]:
# Per-column count of missing values after dropna()
preprocessed_data.isna().sum()

original_title            0
post_corpus               0
question_content          0
question_url              0
tags                      0
answer_content            0
processed_title           0
sentiment_polarity        0
sentiment_subjectivity    0
overall_scores            0
overall_scores_norm       0
dtype: int64

In [7]:
# Number of rows that exactly duplicate an earlier row
preprocessed_data.duplicated().sum()

17

In [8]:
# Remove exact duplicate rows, keeping the first occurrence of each.
preprocessed_data = preprocessed_data.drop_duplicates(keep='first')

In [9]:
# Confirm that the preprocessed data has neither null values nor duplicate rows.
null_counts = preprocessed_data.isna().sum()
duplicate_count = preprocessed_data.duplicated().sum()
print(null_counts)
print(duplicate_count)

original_title            0
post_corpus               0
question_content          0
question_url              0
tags                      0
answer_content            0
processed_title           0
sentiment_polarity        0
sentiment_subjectivity    0
overall_scores            0
overall_scores_norm       0
dtype: int64
0


In [10]:
# (rows, columns) after dropping null and duplicate rows
preprocessed_data.shape

(985352, 11)

Each post has a variable number of tags. To narrow down the large tag space for a more accurate model, select the 100 most common tags and keep only the posts that contain at least one of `most_common_tags`. (Note: the filter cell below currently keeps a post only when it shares more than one tag with `most_common_tags`.)



In [11]:
# Convert the raw '|'-separated tag string of every post into a list of tags.
# NOTE: this overwrites the column, so re-running the cell without reloading the
# data fails (lists have no .split).
preprocessed_data.tags = preprocessed_data.tags.apply(lambda x: x.split('|'))

# Count how many times each tag occurs across all posts.
tag_freq_dict = {}
for tags in preprocessed_data.tags:
    for tag in tags:
        # .get(tag, 0) + 1 counts the first occurrence too. Storing 0 on first
        # sight made every count one too low (the ranking itself is unaffected).
        tag_freq_dict[tag] = tag_freq_dict.get(tag, 0) + 1

In [12]:
# One dictionary key per distinct tag seen in the data
len(tag_freq_dict) # total number of unique tags

39522

In [13]:
# The 100 tags with the highest frequency, most frequent first.
tags_by_frequency = sorted(tag_freq_dict, key=tag_freq_dict.get, reverse=True)
most_common_tags = tags_by_frequency[:100]

In [31]:
# Build the lookup set once; it used to be rebuilt for every row.
common_tag_set = set(most_common_tags)

# Positions of the posts that share MORE THAN ONE tag with the common tags.
# NOTE(review): the markdown above says "at least one" common tag, and the
# original comment justified `> 1` with "'python' is a common tag for all",
# although the sample output contains posts without a 'python' tag. The
# threshold is kept as-is; confirm which one is intended.
final_indices = []
for i, tags in enumerate(preprocessed_data.tags.values.tolist()):
    if len(common_tag_set.intersection(tags)) > 1:
        final_indices.append(i)

final_data = preprocessed_data.iloc[final_indices]

In [32]:
# (rows, columns) of the subset that passed the common-tag filter
final_data.shape

(408611, 11)

In [33]:
# Turn each tag list back into a '|'-separated string. assign() returns a new
# frame, so no column is set on an object derived from preprocessed_data via
# iloc (which can trigger pandas' SettingWithCopyWarning).
final_data = final_data.assign(tags=final_data.tags.apply(lambda x: '|'.join(x)))

Index(['original_title', 'post_corpus', 'question_content', 'question_url',
       'tags', 'answer_content', 'processed_title', 'sentiment_polarity',
       'sentiment_subjectivity', 'overall_scores', 'overall_scores_norm'],
      dtype='object')

In [34]:
# Preview of the frame without `answer_content`. The result is not assigned, so
# final_data itself keeps all of its columns.
final_data.drop(columns=['answer_content'])

Unnamed: 0,original_title,post_corpus,question_content,question_url,tags,processed_title,sentiment_polarity,sentiment_subjectivity,overall_scores,overall_scores_norm
0,Print to hardcopy using javascript - landscape...,print hardcopy using javascript landscape mode...,print hardcopy using javascript landscape mode...,https://stackoverflow.com/questions/7722453,"[javascript, html, css]",print hardcopy using javascript landscape mode,0.258333,0.825000,10,0.000329
3,Primitive or wrapper for hibernate primary keys,primitive wrapper hibernate primary keys ive l...,primitive wrapper hibernate primary keys ive l...,https://stackoverflow.com/questions/3535791,"[java, hibernate, orm]",primitive wrapper hibernate primary keys,0.084722,0.606944,11,0.000371
4,Primitive or wrapper for hibernate primary keys,primitive wrapper hibernate primary keys ive l...,primitive wrapper hibernate primary keys ive l...,https://stackoverflow.com/questions/3535791,"[java, hibernate, orm]",primitive wrapper hibernate primary keys,0.024318,0.428561,35,0.001376
8,Timeouts with long running ASP.NET MVC Core Co...,timeouts long running aspnet mvc core controll...,timeouts long running aspnet mvc core controll...,https://stackoverflow.com/questions/37474309,"[c#, asp.net-core, asp.net-core-mvc, timeout]",timeouts long running aspnet mvc core controll...,0.166667,0.322222,47,0.001878
11,Grab first word in string after '\id',grab first word string id would grab first wor...,grab first word string id would grab first wor...,https://stackoverflow.com/questions/11472442,"[python, regex]",grab first word string id,0.000000,0.000000,11,0.000371
...,...,...,...,...,...,...,...,...,...,...
999988,How to create a list by conditioning a diction...,create list conditioning dictionary dictionary...,create list conditioning dictionary dictionary...,https://stackoverflow.com/questions/58025078,"[python, python-3.x, list, dictionary]",create list conditioning dictionary,0.266667,0.802469,-1,-0.000131
999989,How to suspend the typescript `unused` error c...,suspend typescript unused error consoles angul...,suspend typescript unused error consoles angul...,https://stackoverflow.com/questions/58128073,"[angular, typescript, angular8]",suspend typescript unused error consoles,0.000000,0.000000,-1,-0.000131
999990,I want to detect the volume of my speakers whi...,want detect volume speakers playing video vari...,want detect volume speakers playing video vari...,https://stackoverflow.com/questions/59588113,"[python, python-3.x, audio]",want detect volume speakers playing video vari...,0.395833,0.570833,-1,-0.000131
999991,How to finding the number of days between two ...,finding number days two days week find number ...,finding number days two days week find number ...,https://stackoverflow.com/questions/47160521,"[php, date, dayofweek, date-difference]",finding number days two days week,0.136364,0.454545,-1,-0.000131


In [35]:
# (rows, columns) of the full preprocessed data
preprocessed_data.shape

(985352, 11)

In [36]:
# (rows, columns) of the filtered subset
final_data.shape

(408611, 11)

In [37]:
# Save both frames to Drive as pickles: the full preprocessed data and the
# smaller common-tag subset.
for frame, pickle_path in (
    (preprocessed_data, '/content/drive/MyDrive/Data/NLP/Preprocessed_data.pkl'),
    (final_data, '/content/drive/MyDrive/Data/NLP/Preprocessed_data_mini.pkl'),
):
    frame.to_pickle(pickle_path)

---

# Rough

In [None]:
# Scratch call of tokenize_text() on a plain sentence
tokenize_text('a good man always tries to spread happiness')

In [30]:
# Scratch demo of clean_text, remove_stopwords and get_sentiment on a tiny frame.
def sd(l):
    """Return `l` multiplied by 2."""
    return l * 2

bb = pd.DataFrame({
    'a': ['<p>nvdvvbdb@ ram eats mango<p>', '<p>a the AsdfERTT14/@</p>', 'angry man'],
    'b': ['man', 'good', 'bad'],
})
# Clean column `a`, then strip stopwords from the cleaned text.
bb['a'] = bb['a'].apply(clean_text)
bb['a'] = bb['a'].apply(remove_stopwords)
# Column `c`: first element of the sentiment of the single word in `b`.
bb['c'] = bb['b'].apply(lambda word: get_sentiment(word)[0])
bb['d'] = [5, 6, 7]
bb['d'] = sd(bb['d'])
# bb.a = bb.a.apply(lambda z: tokenize_text(z))

bb  # not rendered: only the last expression of a cell is displayed
# normalize_col(bb.d)

# Rows at positions 0 and 2 only
cc = bb.iloc[[0, 2]]
cc

Unnamed: 0,a,b,c,d
0,nvdvvbdb ram eats mango,man,0.0,10
2,angry man,bad,-0.7,14


In [None]:
# Scratch demo of remove_code() + preprocess() on a frame whose first row
# contains a <code> section.
# TAG_RE and remove_tags are no longer redefined here: identical definitions
# already exist in the "Functions" cell, which this cell needs anyway for
# remove_code and preprocess.
bb = pd.DataFrame({'a':["<pre><p>nvdvvbdb@ </p>xclusive<code>print('h')</code>ram eats mango<pre>",'<p>a the AsdfERTT14/@</p>'], 'b':['man', 'go']})

bb['c'] = (bb['a'] + ' ' + bb['b']).apply(lambda x: remove_code(x)).apply(lambda x:preprocess(x))
bb

In [None]:
# Scratch call of remove_code() on a fragment that contains one <code> section
remove_code("<pre><p>nvdvvbdb@ </p>xclusive<code>print('h')</code>ram eats mango<pre>")

In [None]:
# Scratch call of get_sentiment() on a short phrase
get_sentiment('good boy')

In [None]:
# Scratch check of iterrows() under tqdm: print each index label with its `b` value.
row_iterator = bb.iterrows()
for i, row in tqdm(row_iterator):
    print(i, row['b'])

---

In [None]:
# Cleaning post_corpus in content dataframe

content['post_corpus'] = content.post_corpus.apply(lambda x: clean_text(x))
content['post_corpus'] = content.post_corpus.apply(lambda x: remove_stopwords(x))
content.iloc[:,7:].head()

Each post has a variable number of tags. To narrow down the large tag space for a more accurate model, select the 100 most common tags.
Keep only the posts that contain at least one of `most_common_tags`.

In [None]:
# Convert the raw '|'-separated tag string of every post into a list of tags.
content_token_df.tags = content_token_df.tags.apply(lambda x: x.split('|'))

# Count how many times each tag occurs across all posts.
tag_freq_dict = {}
for tags in content_token_df.tags:
    for tag in tags:
        # .get(tag, 0) + 1 counts the first occurrence too. Storing 0 on first
        # sight made every count one too low.
        tag_freq_dict[tag] = tag_freq_dict.get(tag, 0) + 1