In [None]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Text processing
from bs4 import BeautifulSoup as bs
import os
import xml.etree.ElementTree as ET
from collections import defaultdict

# Colab integration libs
from google.colab import drive, files

# GPT 3 integration
!pip install openai
import openai

%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.25.0.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 1.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.2.0.62-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 6.3 MB/s 
Building wheels for collected packages: openai
  Building wheel for openai (PEP 517) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.25.0-py3-none-any.whl size=55880 sha256=9b197fde10ea5d1fc98533c4322eccb8591b590f346e80af273dd2f7281fe3d4
  Stored in directory: /root/.cache/pip/wheels/19/de/db/e82770b480ec30fd4a6d67108744b9c52be167c04fcf4af7b5
Successfully built openai
Installing collected packages: pandas-stubs, openai
Successfully in

In [None]:
# !pip install transformers
# !pip install sentencepiece
import torch

# Pick the torch device once; `device` is the module-level name this cell defines.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: select it and report how many GPUs exist and the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 35.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 29.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K  

In [None]:
# Load the tokenizer of the 'google/flan-t5-xxl' checkpoint.
# NOTE(review): `do_lower_case` is passed through as an extra keyword; confirm
# that T5Tokenizer actually honours it.
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-xxl', do_lower_case = True)

# Register '{code-blk}' (the placeholder written by replaceCodeWithMask) as an
# additional special token. `num_added_toks` holds the value returned by
# add_special_tokens.
special_tokens_dict = {'additional_special_tokens': ['{code-blk}']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [None]:
# Global Variables
# NOTE(review): neither name is referenced by the cells visible in this
# notebook; code_text_extractor uses a literal 'code' tag and its own newline
# threshold. Confirm whether these are still needed.
code_tag = 'code'
min_lines_of_code = 20

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the project folder so that the relative
# paths used by later cells (e.g. 'meta_file/Posts.xml', 'data/...') resolve
# inside it.
os.chdir("/content/drive/MyDrive/nlp_project")

Mounted at /content/drive


## Step 1: Data Preparation



### Data Import and creation of Data-Frame

In [None]:
# Parse the two Posts.xml dumps and keep each document's root element.
# Paths are relative to the working directory set by os.chdir above.
tree_1 = ET.parse('meta_file/Posts.xml')
root_1 = tree_1.getroot()

tree_2 = ET.parse('stackexchange_file/Posts.xml')
root_2 = tree_2.getroot()

In [None]:
def create_df(root):
  """Build a DataFrame from the child elements of a parsed XML root.

  Every child of ``root`` becomes one row and every attribute name seen on any
  child becomes one column. A row that lacks an attribute gets a missing value
  in that column. The 'Id' attribute is used as the index.

  Args:
    root: an iterable of elements exposing an ``attrib`` mapping
      (e.g. the result of ``ElementTree.getroot()``).

  Returns:
    pandas.DataFrame indexed by 'Id', with columns in first-seen attribute
    order.

  Raises:
    KeyError: if no child carries an 'Id' attribute (including an empty root).
  """
  rows = list(root)

  # dict.fromkeys de-duplicates while keeping first-seen order. A set was used
  # before, whose iteration order for strings can differ between interpreter
  # runs, so the resulting column order was not reproducible.
  keys = list(dict.fromkeys(key for row in rows for key in row.attrib))

  # Column-oriented view of the data: one list of values per attribute name.
  json_data = {key: [row.attrib.get(key, None) for row in rows] for key in keys}

  data_frame = pd.DataFrame(data=json_data)
  return data_frame.set_index('Id')

# One posts frame per parsed dump: df_1 from 'meta_file/Posts.xml' (root_1),
# df_2 from 'stackexchange_file/Posts.xml' (root_2).
df_1 = create_df(root_1)
df_2 = create_df(root_2)

In [None]:
# Show the column labels of both frames side by side for comparison.
print(df_1.columns, df_2.columns)

Index(['ContentLicense', 'LastEditDate', 'Score', 'CommunityOwnedDate', 'Body',
       'Tags', 'AnswerCount', 'LastActivityDate', 'AcceptedAnswerId', 'Title',
       'PostTypeId', 'CommentCount', 'LastEditorDisplayName', 'OwnerUserId',
       'CreationDate', 'ParentId', 'OwnerDisplayName', 'FavoriteCount',
       'LastEditorUserId', 'ClosedDate', 'ViewCount'],
      dtype='object') Index(['ContentLicense', 'LastEditDate', 'Score', 'CommunityOwnedDate', 'Body',
       'Tags', 'AnswerCount', 'AcceptedAnswerId', 'LastActivityDate', 'Title',
       'PostTypeId', 'CommentCount', 'LastEditorDisplayName', 'OwnerUserId',
       'CreationDate', 'ParentId', 'OwnerDisplayName', 'FavoriteCount',
       'LastEditorUserId', 'ClosedDate', 'ViewCount'],
      dtype='object')


In [None]:
# Render df_1 as the cell output for a visual check of the parsed posts.
df_1

Unnamed: 0_level_0,ContentLicense,LastEditDate,Score,CommunityOwnedDate,Body,Tags,AnswerCount,LastActivityDate,AcceptedAnswerId,Title,...,CommentCount,LastEditorDisplayName,OwnerUserId,CreationDate,ParentId,OwnerDisplayName,FavoriteCount,LastEditorUserId,ClosedDate,ViewCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CC BY-SA 2.5,2010-09-03T00:42:07.733,32,,<p>I'm looking at the questions proposed durin...,<discussion><answers>,7,2014-04-23T09:14:37.103,,Why would anyone accept an answer?,...,4,,4,2010-09-01T19:32:45.710,,,5,99,,1846
2,CC BY-SA 2.5,2015-03-18T19:19:24.887,7,2010-09-02T03:42:26.083,<p>One of the big 7 questions.</p>\n,<discussion><site-attributes><faq-contents><to...,4,2015-03-18T19:19:24.887,,What should our FAQ contain?,...,0,,9,2010-09-01T19:34:51.797,,,,25936,,331
3,CC BY-SA 2.5,2020-06-16T10:01:31.710,15,2010-09-02T03:40:00.467,<blockquote>\n<p><strong>Possible Duplicate:</...,<discussion><top-7><site-attributes>,32,2014-04-23T09:14:37.103,,What should our domain name be?,...,8,,9,2010-09-01T19:36:08.390,,,8,-1,2010-10-08T21:02:50.313,2125
4,CC BY-SA 2.5,2017-03-16T15:43:44.307,12,,<p>One of the big 7 questions.</p>\n\n<ul>\n<l...,<discussion><site-attributes><top-7>,4,2010-10-23T17:13:45.967,,What should our logo and site design look like?,...,6,,9,2010-09-01T19:37:08.953,,,5,-1,,715
5,CC BY-SA 2.5,2017-03-16T17:21:05.530,6,2010-09-02T03:32:56.437,<blockquote>\n <p><strong>Possible Duplicate:...,<discussion><top-7><moderators>,9,2010-10-03T17:50:47.680,,Who should the moderators be?,...,14,,9,2010-09-01T19:38:15.610,,,6,-1,2012-02-05T04:24:03.887,1143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9375,CC BY-SA 4.0,2022-05-29T13:02:47.683,0,,,,,2022-05-29T13:02:47.683,,,...,0,,-1,2022-05-29T13:02:47.683,,,,-1,,
9376,CC BY-SA 4.0,2022-05-29T13:02:47.683,0,,,,,2022-05-29T13:02:47.683,,,...,0,,-1,2022-05-29T13:02:47.683,,,,-1,,
9377,CC BY-SA 4.0,2022-05-29T13:03:20.793,0,,,,,2022-05-29T13:03:20.793,,,...,0,,-1,2022-05-29T13:03:20.793,,,,-1,,
9378,CC BY-SA 4.0,2022-05-29T13:03:20.793,0,,,,,2022-05-29T13:03:20.793,,,...,0,,-1,2022-05-29T13:03:20.793,,,,-1,,


In [None]:
# Per-column non-null counts and dtypes of df_1.
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7488 entries, 1 to 9380
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ContentLicense         7488 non-null   object
 1   LastEditDate           4625 non-null   object
 2   Score                  7488 non-null   object
 3   CommunityOwnedDate     333 non-null    object
 4   Body                   7488 non-null   object
 5   Tags                   2607 non-null   object
 6   AnswerCount            2607 non-null   object
 7   LastActivityDate       7488 non-null   object
 8   AcceptedAnswerId       1146 non-null   object
 9   Title                  2608 non-null   object
 10  PostTypeId             7488 non-null   object
 11  CommentCount           7488 non-null   object
 12  LastEditorDisplayName  490 non-null    object
 13  OwnerUserId            6389 non-null   object
 14  CreationDate           7488 non-null   object
 15  ParentId               481

### Extraction of Code and Summary from Body Tag

In [None]:
# Inventory of the distinct HTML tag names that occur in the Body column,
# printed once per frame (df_1 first, then df_2). `tags` ends up holding the
# set computed for df_2.
for posts in (df_1, df_2):
  tags = {node.name for html in posts['Body'] for node in bs(html, 'html.parser').find_all()}
  print(f"Tags in the Body are: {tags}")


Tags in the Body are: {'sup', 'code', 'blockquote', 'tr', 'em', 'h1', 'sub', 'strike', 'p', 'ol', 'i', 'hr', 's', 'b', 'th', 'h3', 'pre', 'table', 'a', 'tbody', 'li', 'div', 'ul', 'br', 'strong', 'h2', 'kbd', 'td', 'img', 'thead', 'del'}
Tags in the Body are: {'sup', 'code', 'blockquote', 'tr', 'em', 'h1', 'sub', 'p', 'strike', 'i', 'ol', 'h5', 'hr', 'dt', 's', 'b', 'th', 'dd', 'h3', 'dl', 'pre', 'table', 'a', 'tbody', 'li', 'div', 'ul', 'br', 'strong', 'h2', 'kbd', 'td', 'h4', 'img', 'thead', 'del'}


In [None]:
# Load the security keyword list; the path is relative to the working
# directory set earlier with os.chdir.
df_sec_terms = pd.read_csv('data/security_keywords.csv')

In [None]:
# Plain Python list of the 'WORD' column; count_sec_terms iterates over it.
security_terms = df_sec_terms['WORD'].to_list()

In [None]:
example = """<body>
 <p class="title"><b>Body's title</b></p>
 <p class="story">line begins
       <a class="element" href="http://example.com/element1" id="link1">1</a>
       <div>
       <code>my sexy code
          chirag's code
          chirag's sexy code 
          chirag's sexy code snippets
          </code>
       <code>my sexy code</code>
      </div>       
 <a class="element" href="http://example.com/element2" id="link2">2<code> my code</code></a>
 <a class="element" href="http://example.com/element3" id="link3">3</a>
 <p> line ends</p>
 <p><img link="httplink"></p>
 </p></body>
 <b>Body's title</b>"""
 
# Tags that only carry links or images; code_text_extractor removes them
# because their content is not wanted in the summary text.
non_ess_tags = ['a', 'img']
import re
def replaceCodeWithMask(text):
  """Replace every <code>...</code> element in `text` with '{code-blk}'.

  The match is non-greedy, so each code element is masked on its own and the
  prose between two code elements is preserved. (A greedy match would span
  from the first '<code>' to the last '</code>' and swallow everything in
  between.) Opening tags with attributes, e.g. '<code class="x">', are matched
  as well.

  Args:
    text: HTML source as a string.

  Returns:
    The string with each code element replaced by the literal '{code-blk}'.
  """
  # Raw string: no escaping is needed for '<', '>' or '/'.
  # DOTALL lets '.' cross newlines so multi-line code elements are matched.
  return re.sub(r"<code\b[^>]*>.*?</code>", "{code-blk}", text, flags=re.DOTALL)

def code_text_extractor(text, non_ess_tags, min_newlines=2):
  """Split an HTML post body into its code snippets and its cleaned prose.

  Args:
    text: HTML source of the post body.
    non_ess_tags: tag names whose elements are removed before the prose is
      extracted.
    min_newlines: a <code> element is kept only when its text contains more
      than this many newline characters. The default of 2 is the threshold
      that was previously hard-coded.

  Returns:
    A tuple (code_output, lines):
      code_output: text of every <code> element that passes the newline
        threshold, in document order.
      lines: the non-empty, whitespace-stripped lines of the remaining text,
        taken after code has been masked by replaceCodeWithMask and the
        non-essential tags have been removed.
  """
  code_soup = bs(text, "html.parser")
  code_output = [
      block.text
      for block in code_soup.find_all(['code'])
      if block.text.count('\n') > min_newlines
  ]

  # The prose is extracted from a second parse in which code has been replaced
  # by the mask placeholder.
  text_soup = bs(replaceCodeWithMask(text), "html.parser")
  for tag in text_soup.find_all(non_ess_tags):
    tag.decompose()

  # Note: further redundant tags can be dropped later by extending non_ess_tags.

  stripped_lines = (line.strip() for line in text_soup.get_text().split('\n'))
  return code_output, [line for line in stripped_lines if line]

# code_text_extractor(example, non_ess_tags)


def count_sec_terms(text, terms=None):
  """Count how many security terms occur in `text`.

  Matching is a plain, case-sensitive substring test, and each term
  contributes at most 1 no matter how often it appears.

  Args:
    text: the string to search.
    terms: iterable of terms to look for. When omitted, the module-level
      `security_terms` list is used, which is the previous behaviour.

  Returns:
    The number of terms that are a substring of `text`.
  """
  if terms is None:
    # Fall back to the notebook-wide keyword list loaded from the CSV.
    terms = security_terms
  return sum(1 for term in terms if term in text)

# Substrings treated as a hint that a snippet is real source code. Kept as a
# dict (every value True) so the existing name and type stay available.
list_of_code_construct = dict.fromkeys(
    ['{','}','[',']','import','#include','return','(',')','def','function','class','break','continue','if','else','define',';','++','--','==','<=','>='],
    True,
)
def has_code_construct(text):
  """Return True when `text` contains any key of list_of_code_construct.

  The check is a plain substring test, so a keyword embedded in a longer word
  (for example 'if' inside 'difference') also counts.
  """
  return any(marker in text for marker in list_of_code_construct)




# has_code_construct(text)
# Example Output
# (['my sexy code\n          CHirag\n          CHirag \n          chirag\n          '],
#  ["Body's title", 'line begins', '1', '2', '3', 'line ends', "Body's title"])

False

In [None]:
# Case 1: Code from Post and Summary from Post
def post_se_code_and_summary(data_frame):
  """Build (code, text) pairs where both parts come from the same original post.

  Rows with PostTypeId '1' are selected. Each Body is split by
  code_text_extractor, and posts that yield no code are skipped.

  Args:
    data_frame: posts frame with at least the 'PostTypeId' and 'Body' columns.

  Returns:
    DataFrame with the columns CODE, TEXT, code_token_length,
    text_token_length, security_terms_count and has_code_construct, one row
    per kept post.
  """
  # PostTypeId '1' selects the original posts.
  data = data_frame[data_frame['PostTypeId'] == '1']
  CODE, TEXT, code_token_lengths, text_token_lengths, sec_count, code_const = [], [], [], [], [], []

  # Only the Body is needed. Title and Score used to be zipped in from the
  # unfiltered frame (hence misaligned) and were never read.
  for body in data['Body']:
    codes, texts = code_text_extractor(body, non_ess_tags)

    # Skip posts without any qualifying code block.
    if not codes: continue
    code, text = "\n".join(codes), " ".join(texts)
    CODE.append(code)
    TEXT.append(text)
    text_token_lengths.append(len(tokenizer.tokenize(text)))
    code_token_lengths.append(len(tokenizer.tokenize(code)))
    code_const.append(has_code_construct(code))
    sec_count.append(count_sec_terms(text))

  return pd.DataFrame.from_dict({'CODE': CODE, 'TEXT': TEXT, 'code_token_length': code_token_lengths,'text_token_length':text_token_lengths, 'security_terms_count':sec_count, 'has_code_construct': code_const})

In [None]:
# Case 2: Code from Post and Summary from Answer
def post_se_code_and_answer_se_summmary(data_frame):
  """Pair the code of an original post with the text of its accepted answer.

  Args:
    data_frame: posts frame indexed by post Id, with the 'PostTypeId',
      'AcceptedAnswerId', 'Body', 'Title' and 'Score' columns.

  Returns:
    DataFrame with the columns SCORE, TITLE, CODE and TEXT. SCORE and TITLE
    belong to the post that supplied the code; TEXT comes from its accepted
    answer. Posts that yield no code are skipped.
  """
  data_code = data_frame[(data_frame['PostTypeId'] == '1') & data_frame.AcceptedAnswerId.notna()]
  # Keep only posts whose accepted answer exists in this frame: .loc raises
  # KeyError when a requested label is missing from the index.
  data_code = data_code[data_code.AcceptedAnswerId.isin(data_frame.index)]
  data_summary = data_frame.loc[data_code.AcceptedAnswerId]
  CODE, TEXT, TITLE, SCORE = [],[],[],[]

  # Title and Score are taken from data_code so that they stay aligned with the
  # bodies. Reading them from the unfiltered frame paired each post with the
  # title/score of an unrelated row.
  rows = zip(data_code['Body'], data_summary['Body'], data_code['Title'], data_code['Score'])
  for body_1, body_2, title, score in rows:
    codes, _ = code_text_extractor(body_1, non_ess_tags)
    _, texts = code_text_extractor(body_2, non_ess_tags)
    # Skip if codes is empty
    if not codes: continue
    CODE.append("\n".join(codes))
    TEXT.append(" ".join(texts))
    TITLE.append(title)
    SCORE.append(score)

  return pd.DataFrame.from_dict({'SCORE':SCORE, 'TITLE':TITLE,'CODE': CODE, 'TEXT': TEXT})

In [None]:
# Case 3: Code from Answer and Summary from Answer
def answer_se_code_and_summary(data_frame):
  """Build (code, text) pairs where both parts come from the same answer post.

  Rows with PostTypeId '2' are selected. Each Body is split by
  code_text_extractor, and answers that yield no code are skipped.

  Args:
    data_frame: posts frame with at least the 'PostTypeId' and 'Body' columns.

  Returns:
    DataFrame with the columns CODE, TEXT, code_token_length,
    text_token_length, security_terms_count and has_code_construct, one row
    per kept answer.
  """
  # PostTypeId '2' selects the answers.
  data = data_frame[data_frame['PostTypeId'] == '2']
  CODE, TEXT, code_token_lengths, text_token_lengths, sec_count, code_const = [], [], [], [], [], []

  # Only the Body is needed. Title and Score used to be zipped in from the
  # unfiltered frame (hence misaligned) and were never read.
  for body in data['Body']:
    codes, texts = code_text_extractor(body, non_ess_tags)

    # Skip answers without any qualifying code block.
    if not codes: continue
    code, text = "\n".join(codes), " ".join(texts)
    CODE.append(code)
    TEXT.append(text)
    text_token_lengths.append(len(tokenizer.tokenize(text)))
    code_token_lengths.append(len(tokenizer.tokenize(code)))
    code_const.append(has_code_construct(code))
    sec_count.append(count_sec_terms(text))

  return pd.DataFrame.from_dict({'CODE': CODE, 'TEXT': TEXT, 'code_token_length': code_token_lengths,'text_token_length':text_token_lengths, 'security_terms_count':sec_count, 'has_code_construct': code_const})

In [None]:
# Case 4: Code from Answer and Summary from Post
def answer_se_code_and_post_se_summary(data_frame):
  """Pair the code of an answer with the text of the post it answers.

  Args:
    data_frame: posts frame indexed by post Id, with the 'PostTypeId',
      'ParentId', 'Body', 'Title' and 'Score' columns.

  Returns:
    DataFrame with the columns SCORE, TITLE, CODE and TEXT. CODE and SCORE
    come from the answer, TEXT and TITLE from its parent post. Answers that
    yield no code are skipped.
  """
  data_code = data_frame[(data_frame['PostTypeId'] == '2') & data_frame.ParentId.notna()]
  # Keep only answers whose parent exists in this frame: .loc raises KeyError
  # when a requested label is missing from the index.
  data_code = data_code[data_code.ParentId.isin(data_frame.index)]
  data_summary = data_frame.loc[data_code.ParentId]
  CODE, TEXT, TITLE, SCORE = [],[],[],[]

  # Title and Score used to be read from the unfiltered frame, which paired
  # every answer with the title/score of an unrelated row.
  # NOTE(review): TITLE is taken from the parent post and SCORE from the
  # answer that supplies the code -- confirm this is the intended pairing.
  rows = zip(data_code['Body'], data_summary['Body'], data_summary['Title'], data_code['Score'])
  for body_1, body_2, title, score in rows:
    codes, _ = code_text_extractor(body_1, non_ess_tags)
    _, texts = code_text_extractor(body_2, non_ess_tags)
    # Skip if codes is empty
    if not codes: continue
    CODE.append("\n".join(codes))
    TEXT.append(" ".join(texts))
    TITLE.append(title)
    SCORE.append(score)

  return pd.DataFrame.from_dict({'SCORE':SCORE, 'TITLE':TITLE,'CODE': CODE, 'TEXT': TEXT})

In [None]:
# Build one processed frame per source dump by stacking the rows of Case 1
# (code and text from the same original post) and Case 3 (code and text from
# the same answer). Cases 2 and 4 are not used here.
processed_df_1 = pd.concat([ post_se_code_and_summary(df_1),answer_se_code_and_summary(df_1)])
processed_df_2 = pd.concat([ post_se_code_and_summary(df_2),answer_se_code_and_summary(df_2)])

In [None]:
# (rows, columns) of the processed frame built from df_1.
processed_df_1.shape

(12, 4)

In [None]:
# Display the processed frame built from df_2.
processed_df_2

(7797, 4)

In [None]:
# os.makedirs('data_check', exist_ok=True)
# Stack both processed frames and renumber the rows from 0, discarding the
# per-frame indices.
final_df = pd.concat([processed_df_1, processed_df_2]).reset_index(drop = True)

In [None]:
# Summary of the combined frame, followed by the list of columns that have no
# missing values.
# NOTE(review): the message says "entire dataset", but the list is computed on
# df_2 only -- confirm whether df_1 should be included as well.
final_df.info()
print(f"\n\nIn the entire dataset,{[ column for column in df_2.columns if df_2[column].isna().sum() == 0 ]} are the attributes which are non-null throughout the dataset.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7809 entries, 0 to 7808
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SCORE   7809 non-null   object
 1   TITLE   1777 non-null   object
 2   CODE    7809 non-null   object
 3   TEXT    7809 non-null   object
dtypes: object(4)
memory usage: 244.2+ KB


In the entire dataset,['ContentLicense', 'Score', 'Body', 'LastActivityDate', 'PostTypeId', 'CommentCount', 'CreationDate'] are the attributes which are non-null throughout the dataset.


In [None]:
# Persist the combined frame; the path is relative to the working directory.
# The index is written too, because to_csv is called with its default
# index setting.
final_df.to_csv('data/final_df.csv')