In [1]:
import pandas as pd
import cufflinks as cf

# Put cufflinks into offline mode up front, before any `.iplot()` call later in the notebook.
cf.go_offline()

In [2]:
# Load the requirement-relevancy CSV with the pyarrow parser, then preview its first rows.
relevancy_csv_path = (
    "../../Datasets/irrelevant_requirements_dataset/irrelevant_requirements_dataset.csv"
)
requirement_relevancy_dataset = pd.read_csv(relevancy_csv_path, engine="pyarrow")

requirement_relevancy_dataset.head()

Unnamed: 0,reqs_statement,action_part,actor_part,label
0,user submit job associate cost execution time ...,submit job associate cost execution time deadline,user,relevant
1,user establish cost unit time and submit job,establish cost unit time and submit job,user,relevant
2,user monitor job submit status,monitor job submit status,user,relevant
3,user cancel job submit,cancel job submit,user,relevant
4,user check credit balance,check credit balance,user,relevant


In [13]:
# Count the missing values in each column of the relevancy dataset.
requirement_relevancy_dataset.isna().sum()

reqs_statement    0
action_part       0
actor_part        0
label             0
dtype: int64

In [3]:
# Per-column summary of the relevancy dataset (count / unique / top / freq for its text columns).
requirement_relevancy_dataset.describe()

Unnamed: 0,reqs_statement,action_part,actor_part,label
count,621,621,621,621
unique,621,621,326,2
top,user submit job associate cost execution time ...,submit job associate cost execution time deadline,system,relevant
freq,1,1,97,557


### Label distribution: relevant vs. irrelevant requirements

In [4]:
# Count the number of labels that are relevant and irrelevant
# Tally how many requirements carry each label by summing a boolean mask per label value.
# The spelling of `irrelavent_count` is kept as-is: the plotting cell below refers to that name.
relevancy_labels = requirement_relevancy_dataset["label"]

irrelavent_count = (relevancy_labels == "irrelevant").sum()
relevant_count = (relevancy_labels == "relevant").sum()

In [5]:
# Assemble the two label counts into a small frame, then draw them as an interactive bar chart.
label_count_frame = pd.DataFrame(
    {
        "label names": ["relevant", "irrelevant"],
        "count": [relevant_count, irrelavent_count],
    }
)
label_count_frame.iplot(
    kind="bar",
    title="Relevant vs Irrelevant Requirements",
    categories="label names",
)

In [6]:
# Print cufflinks' parameter reference for bar charts (the options usable with `.iplot(kind="bar")`).
# NOTE(review): this is an exploratory lookup; consider dropping the cell from the final notebook.
cf.help("bar")

BAR
Bar Chart
Supports categories and horizontal bar charts


Parameters:
    bargap : float
        Sets the gap between bars
        	[0,1)
    bargroupgap : float
        Sets the gap between groups
        	[0,1)
    barmode : string
        Bar mode
        	group
        	stack
        	overlay
    categories : string
        Name of the column that contains the categories
    orientation : string
        Sets the orientation of the bars.
        	h
        	v
    sortbars : bool
        Sort bars in descending order


    colors : dict, list or string
        Trace color
        	string : applies to all traces
        	list : applies to each trace in the order specified
        	dict : {column:value} for each column in the dataframe
        values
        	colorname : see cufflinks.colors.cnames
        	hex : '#ffffff'
        	rgb : 'rgb(23,50,23)'
        	rgba : 'rgba(23,50,23,.5)
    colorscale : string
        Color scale name
        If the color is preceded by a minus (-

# PURE Dataset

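In this section, we explore the PURE dataset: we load its train, test, and validation splits, derive a binary `label` column from the `Req/Not Req` column (1 = `Req`, 0 = `Not_Req`), inspect the class balance of each split, and write the labeled splits back to disk.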


In [23]:
# Read the three PURE splits (train / test / valid) with the pyarrow CSV engine.
pure_split_paths = (
    "../../Datasets/pure/PURE_train.csv",
    "../../Datasets/pure/PURE_test.csv",
    "../../Datasets/pure/PURE_valid.csv",
)
train_dataset, test_dataset, valid_dataset = (
    pd.read_csv(csv_path, engine="pyarrow") for csv_path in pure_split_paths
)

In [24]:
# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,Unnamed: 1,Requirement,Name of Doc,Req/Not Req
0,0,The solution should provide detailed context-s...,cctns.pdf,Req
1,1,The help should be accessible to the users bot...,cctns.pdf,Req
2,2,The solution should provide an interface for t...,cctns.pdf,Req
3,3,"The solution should send alerts (e.g., email, ...",cctns.pdf,Req
4,4,The solution should enable the user to track t...,cctns.pdf,Req


In [25]:
# Encode the textual "Req/Not Req" column of the training split as a binary `label`
# column: 1 for "Req", 0 for "Not_Req".
label_codes = {"Req": 1, "Not_Req": 0}
train_dataset["label"] = train_dataset["Req/Not Req"].map(label_codes)

# `Series.map` silently turns any value missing from the mapping into NaN, so fail
# loudly here instead of carrying unlabeled rows into the later cells.
unmapped = train_dataset.loc[train_dataset["label"].isna(), "Req/Not Req"].unique()
assert len(unmapped) == 0, f"Unexpected 'Req/Not Req' values in train split: {list(unmapped)}"

train_dataset.head()

Unnamed: 0,Unnamed: 1,Requirement,Name of Doc,Req/Not Req,label
0,0,The solution should provide detailed context-s...,cctns.pdf,Req,1
1,1,The help should be accessible to the users bot...,cctns.pdf,Req,1
2,2,The solution should provide an interface for t...,cctns.pdf,Req,1
3,3,"The solution should send alerts (e.g., email, ...",cctns.pdf,Req,1
4,4,The solution should enable the user to track t...,cctns.pdf,Req,1


In [26]:
# Class balance of the training split (1 = "Req", 0 = "Not_Req").
train_dataset["label"].value_counts()

label
1    2832
0    2474
Name: count, dtype: int64

In [27]:
# Encode the textual "Req/Not Req" column of the test split as a binary `label`
# column: 1 for "Req", 0 for "Not_Req".
label_codes = {"Req": 1, "Not_Req": 0}
test_dataset["label"] = test_dataset["Req/Not Req"].map(label_codes)

# `Series.map` silently turns any value missing from the mapping into NaN, so fail
# loudly here instead of carrying unlabeled rows into the later cells.
unmapped = test_dataset.loc[test_dataset["label"].isna(), "Req/Not Req"].unique()
assert len(unmapped) == 0, f"Unexpected 'Req/Not Req' values in test split: {list(unmapped)}"

test_dataset.head()

Unnamed: 0,Unnamed: 1,Requirement,Name of Doc,Req/Not Req,label
0,347,System Initialization performs those functions...,nasa x38.doc,Req,1
1,348,"Whenever a power-on reset occurs, System Initi...",nasa x38.doc,Req,1
2,349,"As part of System Initialization , the Boot RO...",nasa x38.doc,Req,1
3,350,System Initialization shall [SRS014] initiate ...,nasa x38.doc,Req,1
4,351,System Initialization shall [SRS292] enable an...,nasa x38.doc,Req,1


In [28]:
# Class balance of the test split (1 = "Req", 0 = "Not_Req").
test_dataset["label"].value_counts()

label
1    1058
0     476
Name: count, dtype: int64

In [29]:
# Encode the textual "Req/Not Req" column of the validation split as a binary `label`
# column: 1 for "Req", 0 for "Not_Req".
label_codes = {"Req": 1, "Not_Req": 0}
valid_dataset["label"] = valid_dataset["Req/Not Req"].map(label_codes)

# `Series.map` silently turns any value missing from the mapping into NaN, so fail
# loudly here instead of carrying unlabeled rows into the later cells.
unmapped = valid_dataset.loc[valid_dataset["label"].isna(), "Req/Not Req"].unique()
assert len(unmapped) == 0, f"Unexpected 'Req/Not Req' values in valid split: {list(unmapped)}"

valid_dataset.head()

Unnamed: 0,Unnamed: 1,Requirement,Name of Doc,Req/Not Req,label
0,1269,Any operation requiring the user to supply a f...,hats.pdf,Req,1
1,1270,For any operation where the user is prompted t...,hats.pdf,Req,1
2,1271,When collecting generated output files from HA...,hats.pdf,Req,1
3,1272,"For example, given a transformation language p...",hats.pdf,Req,1
4,1273,If X.tlp.parsed existed prior to executing the...,hats.pdf,Req,1


In [30]:
# Class balance of the validation split (1 = "Req", 0 = "Not_Req").
valid_dataset["label"].value_counts()

label
0    650
1    255
Name: count, dtype: int64

In [31]:
# Write each split, now carrying the `label` column, back to disk without the DataFrame index.
# NOTE(review): these are the same paths the splits were read from earlier in this section,
# so the input CSV files are overwritten in place.
for split_frame, split_path in (
    (train_dataset, "../../Datasets/pure/PURE_train.csv"),
    (test_dataset, "../../Datasets/pure/PURE_test.csv"),
    (valid_dataset, "../../Datasets/pure/PURE_valid.csv"),
):
    split_frame.to_csv(split_path, index=False)