In [1]:
import pandas as pd

In [2]:
# Path of the pickled raw dataset, relative to the current working directory.
file_path = "./temp/data.pkl"
# Load the frame that the rest of the notebook refers to as `data`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
data = pd.read_pickle(filepath_or_buffer=file_path)

## EDA

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
data.head(n=5)

Unnamed: 0,softwareType,industryDomain,numUsers,targetMarket,adminDashboard,contentManagement,extraFeatures,thirdPartyService,authentication,dataMigration,uiUxDesign,performance,security,availability,timeline_months,Price
0,Mobile,Restaurant_Management,500-1000,Global,Basic,Workflow,Reporting_and_Analysis,"Analytics, Payment_Gateway, Mail",Multi_Factor,Null,Custom,Medium,High,Normal,9,1300.0
1,Desktop,Fintech,30-50,Local,Null,Workflow,Reporting_and_Analysis,"Payment_Gateway, Mail",Social,Null,Advanced,High,Null,Always,3,1050.0
2,Web,Fintech,01/10,Local,Basic,Pages_and_Media,Search_and_Filter,Mail,Multi_Factor,Null,Basic,Basic,Null,Normal,30,310.0
3,Mobile,Hotel_Management,100-500,Both,Advanced,Workflow,Search_and_Filter,"Mail, Payment_Gateway",Social,Null,Custom,Basic,Null,Normal,9,750.0
4,Desktop,Ecommerce,01/10,Both,Basic,Null,File_Handling,"AI_integration, Payment_Gateway",Null,No,Custom,Basic,Null,Normal,4,920.0


### Number of Users
As we can see from the head of the data, some values in the `numUsers` column have a small formatting
mismatch: they use `/` instead of `-` as the range separator (e.g. `01/10`). We need to replace it.

In [4]:
# Normalise the range separator: values written as "01/10" become "01-10".
# `regex=False` makes "/" a plain substring match rather than a pattern.
data["numUsers"] = data["numUsers"].str.replace(
    pat="/",
    repl="-",
    regex=False,
)

In [5]:
# Re-display the first five rows to confirm the `numUsers` separator fix.
data.head(n=5)

Unnamed: 0,softwareType,industryDomain,numUsers,targetMarket,adminDashboard,contentManagement,extraFeatures,thirdPartyService,authentication,dataMigration,uiUxDesign,performance,security,availability,timeline_months,Price
0,Mobile,Restaurant_Management,500-1000,Global,Basic,Workflow,Reporting_and_Analysis,"Analytics, Payment_Gateway, Mail",Multi_Factor,Null,Custom,Medium,High,Normal,9,1300.0
1,Desktop,Fintech,30-50,Local,Null,Workflow,Reporting_and_Analysis,"Payment_Gateway, Mail",Social,Null,Advanced,High,Null,Always,3,1050.0
2,Web,Fintech,01-10,Local,Basic,Pages_and_Media,Search_and_Filter,Mail,Multi_Factor,Null,Basic,Basic,Null,Normal,30,310.0
3,Mobile,Hotel_Management,100-500,Both,Advanced,Workflow,Search_and_Filter,"Mail, Payment_Gateway",Social,Null,Custom,Basic,Null,Normal,9,750.0
4,Desktop,Ecommerce,01-10,Both,Basic,Null,File_Handling,"AI_integration, Payment_Gateway",Null,No,Custom,Basic,Null,Normal,4,920.0


### String Processing

There is another issue here: some fields contain multiple *features* joined with `_and_`. We want each value to be a list of features, so we split the strings on `_and_` and store the resulting lists.

In [6]:
# Cast every value to text so the `.str` accessor can be used on the column.
# NOTE(review): how this cast treats missing values depends on the pandas
# version (older releases turn them into the text "nan") - confirm the column has none.
data["contentManagement"] = data["contentManagement"].astype(dtype=str)

In [7]:
# Break each value apart at "_and_", turning every entry into a list of
# features (e.g. "Pages_and_Media" -> ["Pages", "Media"]).
data["contentManagement"] = data["contentManagement"].str.split(pat="_and_")

In [8]:
# Inspect the first 20 rows to check the list conversion.
data.head(n=20)

Unnamed: 0,softwareType,industryDomain,numUsers,targetMarket,adminDashboard,contentManagement,extraFeatures,thirdPartyService,authentication,dataMigration,uiUxDesign,performance,security,availability,timeline_months,Price
0,Mobile,Restaurant_Management,500-1000,Global,Basic,[Workflow],Reporting_and_Analysis,"Analytics, Payment_Gateway, Mail",Multi_Factor,Null,Custom,Medium,High,Normal,9,1300.0
1,Desktop,Fintech,30-50,Local,Null,[Workflow],Reporting_and_Analysis,"Payment_Gateway, Mail",Social,Null,Advanced,High,Null,Always,3,1050.0
2,Web,Fintech,01-10,Local,Basic,"[Pages, Media]",Search_and_Filter,Mail,Multi_Factor,Null,Basic,Basic,Null,Normal,30,310.0
3,Mobile,Hotel_Management,100-500,Both,Advanced,[Workflow],Search_and_Filter,"Mail, Payment_Gateway",Social,Null,Custom,Basic,Null,Normal,9,750.0
4,Desktop,Ecommerce,01-10,Both,Basic,[Null],File_Handling,"AI_integration, Payment_Gateway",Null,No,Custom,Basic,Null,Normal,4,920.0
5,Hybrid,Edtech,100-500,Global,Professional,[Null],Null,"Mail, Analytics",Social,Null,Custom,Basic,Null,Normal,36,1630.0
6,Hybrid,Content_Management,30-50,Both,Professional,[Blog],AI_ML_Module,Map,Multi_Factor,Yes,Advanced,Medium,Null,Normal,9,1960.0
7,Mobile,Travel,30-50,Both,Null,[Blog],Offile_Mode,Map,Social,Yes,Basic,High,Standard,Always,12,1285.0
8,Hybrid,Content_Management,10-30,Local,Null,[Blog],Null,"Payment_Gateway, Analytics",Null,No,Custom,Basic,High,Normal,24,1100.0
9,Desktop,Fintech,500-1000,Global,Null,[Workflow],Reporting_and_Analysis,Null,Null,No,Advanced,High,Standard,Normal,6,980.0


That works well for the `contentManagement` column. Now let's do the same for `extraFeatures`.

In [9]:
# Cast to text, then break each value apart at "_and_" so that every
# entry becomes a list of features.
data["extraFeatures"] = (
    data["extraFeatures"]
    .astype(str)
    .str.split("_and_")
)

# Show the first 15 rows to verify the result.
data.head(n=15)

That looks good. Now we have to process the `thirdPartyService` column the same way, but this time we split on `,`.

In [10]:
# Turn the comma-separated service names into a list per row.
# The values are separated by "," (not "_and_" as in the previous columns),
# and one cast to text is enough before using the `.str` accessor.
data["thirdPartyService"] = (
    data["thirdPartyService"]
    .astype(str)  # make sure every value is text
    .str.split(",")  # one list element per comma-separated service
    .apply(lambda services: [service.strip() for service in services])  # drop the space left after each comma
)

In [11]:
# Check the first ten rows now that the three feature columns hold lists.
data.head(n=10)

Unnamed: 0,softwareType,industryDomain,numUsers,targetMarket,adminDashboard,contentManagement,extraFeatures,thirdPartyService,authentication,dataMigration,uiUxDesign,performance,security,availability,timeline_months,Price
0,Mobile,Restaurant_Management,500-1000,Global,Basic,[Workflow],"[Reporting, Analysis]","[Analytics, Payment_Gateway, Mail]",Multi_Factor,Null,Custom,Medium,High,Normal,9,1300.0
1,Desktop,Fintech,30-50,Local,Null,[Workflow],"[Reporting, Analysis]","[Payment_Gateway, Mail]",Social,Null,Advanced,High,Null,Always,3,1050.0
2,Web,Fintech,01-10,Local,Basic,"[Pages, Media]","[Search, Filter]",[Mail],Multi_Factor,Null,Basic,Basic,Null,Normal,30,310.0
3,Mobile,Hotel_Management,100-500,Both,Advanced,[Workflow],"[Search, Filter]","[Mail, Payment_Gateway]",Social,Null,Custom,Basic,Null,Normal,9,750.0
4,Desktop,Ecommerce,01-10,Both,Basic,[Null],[File_Handling],"[AI_integration, Payment_Gateway]",Null,No,Custom,Basic,Null,Normal,4,920.0
5,Hybrid,Edtech,100-500,Global,Professional,[Null],[Null],"[Mail, Analytics]",Social,Null,Custom,Basic,Null,Normal,36,1630.0
6,Hybrid,Content_Management,30-50,Both,Professional,[Blog],[AI_ML_Module],[Map],Multi_Factor,Yes,Advanced,Medium,Null,Normal,9,1960.0
7,Mobile,Travel,30-50,Both,Null,[Blog],[Offile_Mode],[Map],Social,Yes,Basic,High,Standard,Always,12,1285.0
8,Hybrid,Content_Management,10-30,Local,Null,[Blog],[Null],"[Payment_Gateway, Analytics]",Null,No,Custom,Basic,High,Normal,24,1100.0
9,Desktop,Fintech,500-1000,Global,Null,[Workflow],"[Reporting, Analysis]",[Null],Null,No,Advanced,High,Standard,Normal,6,980.0


## Saving the Data

The data now looks clean. We save it in pickle format so that it can be used in the
data preprocessing step that follows.

In [12]:
# Destination of the cleaned dataset, in the same folder as the raw pickle.
pkl_file_path = "./temp/data_clean.pkl"
# Persist the cleaned frame so the next stage can load it.
data.to_pickle(path=pkl_file_path)