### Data

In [1]:
with open('html1', 'r') as f:
    html1 = f.read()
with open('html2', 'r') as f:
    html2 = f.read()
with open('html3', 'r') as f:
    html3 = f.read()
with open('html4', 'r') as f:
    html4 = f.read()
with open('html5', 'r') as f:
    html5 = f.read()

### Parse

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [3]:
data = []

topics = ['crm', 'email-marketing', 'help-desk', 'human-resources', 'project-management']
htmls = [html1, html2, html3, html4, html5]

for z, html in enumerate(htmls):
    topic = topics[z]
    soup = BeautifulSoup(html, 'html.parser')
    tr_elements = soup.find_all('tr', attrs={'role': 'row'})

    for i in tr_elements[2:]:
        soup = BeautifulSoup(str(i), 'html.parser')

        if soup.find_all('td', attrs={'class': "text-center"})[0].text[0] not in ["$", "€"]:
            continue
        price = float(soup.find_all('td', attrs={'class': "text-center"})[0].text[1:].replace(',', ''))
        if soup.find_all('td', attrs={'class': "text-center"})[0].text[0] == "€":
            price *= 1.09
        try:
            name = soup.find('div', attrs={'class': "product-name"}).a.text
            rating_text = soup.find('div', attrs={'class': "reviews"}).get("data-rating").split(' ')
            rating = rating_text[0].split('/')[0]
            rating_count = rating_text[2]
            freq = soup.find_all('td', attrs={'class': "text-center"})[1].text
            checks = soup.find_all('span', attrs={'class': "icon-circle"})
            per_user = checks[0].get('class')[1] == "check"
            free_version = checks[1].get('class')[1] == "check"    
            free_trial = checks[2].get('class')[1] == "check"    
            comment_count = soup.find('a', attrs={"class": "reviews-count milli"}).text[1:-1]
        except:
            continue
        data.append({
            'name': name,
            'price': price,
            'rating': float(rating),
            'comment_count': int(comment_count),
            'frequency': freq,
            'per_user': int(per_user),
            'free_version': int(free_version),
            'free_trial': int(free_trial),
            'category': topic
        })

In [4]:
df = pd.DataFrame(data)

In [5]:
df = df.drop_duplicates(['name'])

In [6]:
df

Unnamed: 0,name,price,rating,comment_count,frequency,per_user,free_version,free_trial,category
0,17hats,15.000,4.4,116,Per-Month,0,1,1,crm
1,1CRM,15.000,4.3,58,Per-Month,1,1,1,crm
2,20NINE,3.161,4.9,32,Per-Month,1,0,1,crm
3,3GBusiness,1999.000,4.0,1,Per-Month,1,0,0,crm
4,8am,18.000,0.0,0,Per-Month,0,0,0,crm
...,...,...,...,...,...,...,...,...,...
2392,ZenTao,39.900,4.0,3,Per-Year,0,1,1,project-management
2393,ZEP,1.635,4.8,4,Per-Month,1,1,1,project-management
2394,ZilicusPM,25.000,4.6,32,Per-Month,1,0,1,project-management
2395,ZingProject,1.000,0.0,0,Per-Month,0,1,0,project-management


### Define Total Price (10 years)

In [7]:
df['total_price'] = df['price']

In [8]:
df.loc[df['frequency'] == 'Per-Month', 'total_price'] *= 120

In [9]:
df.loc[df['frequency'] == 'Per-Year', 'total_price'] *= 10

In [10]:
df = df.reset_index(drop=True)

In [11]:
df['buyability'] = (df['rating'] - np.mean(df['rating'])) * (df['comment_count'] - np.mean(df['comment_count']))

In [12]:
sorted_df = df.sort_values(by='buyability', ascending=False)
sorted_df = sorted_df.reset_index(drop=True)

In [13]:
sorted_df['decision'] = 0
sorted_df.loc[:50, 'decision'] = 1

In [14]:
sorted_df

Unnamed: 0,name,price,rating,comment_count,frequency,per_user,free_version,free_trial,category,total_price,buyability,decision
0,QuickBooks Time,20.0,4.7,6483,Per-Month,0,0,1,human-resources,2400.0,11135.311699,1
1,HubSpot Marketing Hub,50.0,4.5,5622,Per-Month,0,1,1,email-marketing,6000.0,8530.606469,1
2,Dynamics 365,50.0,4.4,5177,Per-Month,1,0,1,crm,6000.0,7336.661924,1
3,Zendesk Suite,49.0,4.4,3682,Per-Month,1,0,1,crm,5880.0,5198.097452,1
4,Rippling,8.0,4.9,2728,Per-Month,1,0,0,human-resources,960.0,5173.330905,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2213,DesignSmart,52.0,5.0,1,Per-Month,0,0,1,project-management,6240.0,-95.800474,0
2214,FForce,10.0,5.0,1,Per-Month,0,0,0,crm,1200.0,-95.800474,0
2215,Ensight eMarketing Suite,650.0,5.0,1,Per-Month,1,0,0,email-marketing,78000.0,-95.800474,0
2216,Firmao CRM,19.0,5.0,1,Per-Month,1,1,1,crm,2280.0,-95.800474,0


In [15]:
len(sorted_df[sorted_df['decision'] == 1])

51

In [16]:
df = sorted_df.copy()
shuffled_df = df.sample(frac=1, random_state=42)

# Reset the index of the shuffled DataFrame
df = shuffled_df.reset_index(drop=True)

In [17]:
df

Unnamed: 0,name,price,rating,comment_count,frequency,per_user,free_version,free_trial,category,total_price,buyability,decision
0,Kin HR,3.000,4.6,14,Per-Month,0,0,1,human-resources,360.00,-55.731764,0
1,WizzTime,9.990,0.0,0,Per-Month,0,1,1,project-management,1198.80,143.075270,0
2,Tempo Timesheets,10.000,4.3,193,Per-Month,1,0,1,help-desk,1200.00,192.678155,0
3,LeadsLive,25.000,0.0,0,Per-Month,0,0,1,crm,3000.00,143.075270,0
4,Averroes,6.990,5.0,2,Per-Month,1,1,1,project-management,838.80,-93.769996,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2213,myofficehub,3.000,4.4,5,Per-Month,0,1,1,human-resources,360.00,-61.769816,0
2214,Conrep,40.000,4.5,45,Per-Month,1,0,0,human-resources,4800.00,-4.868824,0
2215,Interstis,16.350,3.7,27,Per-Month,0,1,1,project-management,1962.00,-15.472431,0
2216,ezeeCRM,12.000,4.1,13,Per-Month,0,0,1,crm,1440.00,-39.771620,0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2218 entries, 0 to 2217
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           2218 non-null   object 
 1   price          2218 non-null   float64
 2   rating         2218 non-null   float64
 3   comment_count  2218 non-null   int64  
 4   frequency      2218 non-null   object 
 5   per_user       2218 non-null   int64  
 6   free_version   2218 non-null   int64  
 7   free_trial     2218 non-null   int64  
 8   category       2218 non-null   object 
 9   total_price    2218 non-null   float64
 10  buyability     2218 non-null   float64
 11  decision       2218 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 208.1+ KB


In [19]:
df[df['frequency'] == 'One-Time'].describe()

Unnamed: 0,price,rating,comment_count,per_user,free_version,free_trial,total_price,buyability,decision
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0,187.0
mean,3228.400968,1.880749,7.208556,0.481283,0.192513,0.572193,3228.400968,61.995273,0.0
std,10561.375828,2.193675,28.032514,0.500991,0.395332,0.496089,10561.375828,100.932846,0.0
min,0.01,0.0,0.0,0.0,0.0,0.0,0.01,-95.800474,0.0
25%,77.0,0.0,0.0,0.0,0.0,0.0,77.0,-48.61923,0.0
50%,290.0,0.0,0.0,0.0,0.0,1.0,290.0,143.07527,0.0
75%,1196.775,4.3,3.0,1.0,0.0,1.0,1196.775,143.07527,0.0
max,100000.0,5.0,286.0,1.0,1.0,1.0,100000.0,340.194476,0.0


In [20]:
df[df['frequency'] == 'Per-Month'].describe()

Unnamed: 0,price,rating,comment_count,per_user,free_version,free_trial,total_price,buyability,decision
count,1884.0,1884.0,1884.0,1884.0,1884.0,1884.0,1884.0,1884.0,1884.0
mean,72.920379,3.075796,54.245223,0.477176,0.337049,0.789278,8750.445535,78.096876,0.026008
std,222.020164,2.169914,308.066748,0.499611,0.472827,0.40793,26642.419682,486.558948,0.159203
min,0.01,0.0,0.0,0.0,0.0,0.0,1.2,-95.800474,0.0
25%,9.0,0.0,0.0,0.0,0.0,1.0,1080.0,-66.228968,0.0
50%,20.71,4.4,3.0,0.0,0.0,1.0,2485.2,-16.1007,0.0
75%,50.0,4.8,17.0,1.0,1.0,1.0,6000.0,143.07527,0.0
max,5000.0,5.0,6483.0,1.0,1.0,1.0,600000.0,11135.311699,1.0


In [21]:
df[df['frequency'] == 'Per-Year'].describe()

Unnamed: 0,price,rating,comment_count,per_user,free_version,free_trial,total_price,buyability,decision
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,1675.460784,2.992517,22.585034,0.428571,0.292517,0.680272,16754.607837,35.406271,0.013605
std,4566.316535,2.166434,72.53878,0.496564,0.456474,0.467965,45663.165348,139.689408,0.116242
min,0.01,0.0,0.0,0.0,0.0,0.0,0.1,-95.800474,0.0
25%,28.0705,0.0,0.0,0.0,0.0,0.0,280.705,-62.947499,0.0
50%,108.9891,4.4,2.0,0.0,0.0,1.0,1089.891,-24.49813,0.0
75%,774.5,4.7,15.0,1.0,1.0,1.0,7745.0,143.07527,0.0
max,30000.0,5.0,647.0,1.0,1.0,1.0,300000.0,1036.242628,1.0


### EDA

In [22]:
!pip install sweetviz



In [23]:
import sweetviz

In [24]:
df

Unnamed: 0,name,price,rating,comment_count,frequency,per_user,free_version,free_trial,category,total_price,buyability,decision
0,Kin HR,3.000,4.6,14,Per-Month,0,0,1,human-resources,360.00,-55.731764,0
1,WizzTime,9.990,0.0,0,Per-Month,0,1,1,project-management,1198.80,143.075270,0
2,Tempo Timesheets,10.000,4.3,193,Per-Month,1,0,1,help-desk,1200.00,192.678155,0
3,LeadsLive,25.000,0.0,0,Per-Month,0,0,1,crm,3000.00,143.075270,0
4,Averroes,6.990,5.0,2,Per-Month,1,1,1,project-management,838.80,-93.769996,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2213,myofficehub,3.000,4.4,5,Per-Month,0,1,1,human-resources,360.00,-61.769816,0
2214,Conrep,40.000,4.5,45,Per-Month,1,0,0,human-resources,4800.00,-4.868824,0
2215,Interstis,16.350,3.7,27,Per-Month,0,1,1,project-management,1962.00,-15.472431,0
2216,ezeeCRM,12.000,4.1,13,Per-Month,0,0,1,crm,1440.00,-39.771620,0


In [25]:
rep = sweetviz.analyze(df)

  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in to_process.source_counts["value_counts_without_nan"].iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():


In [26]:
rep.show_html()

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [27]:
# df.to_csv('with_total_price.csv')

### Model

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [29]:
# Split the data into features (X) and target variable (y)
# X = df.drop(['decision', 'category', 'frequency', 'name', 'buyability'], axis=1)
# y = df['decision']
X = pd.read_csv('X.csv')
y = pd.read_csv('Y.csv')


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
y

0       0
1       0
2       0
3       0
4       0
       ..
2213    0
2214    0
2215    0
2216    0
2217    0
Name: decision, Length: 2218, dtype: int64

In [31]:
# Initialize the models
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()

# Train each model
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Make predictions on the test set
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)

# Evaluate model performance
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred)
random_forest_accuracy = accuracy_score(y_test, random_forest_pred)

decision_tree_precision = precision_score(y_test, decision_tree_pred)
random_forest_precision = precision_score(y_test, random_forest_pred)

decision_tree_recall = recall_score(y_test, decision_tree_pred)
random_forest_recall = recall_score(y_test, random_forest_pred)

decision_tree_f1 = f1_score(y_test, decision_tree_pred)
random_forest_f1 = f1_score(y_test, random_forest_pred)

# Print the accuracy of each model
print("Decision Tree Accuracy:", decision_tree_accuracy)
print("Random Forest Accuracy:", random_forest_accuracy)
print("Decision Tree Precision:", decision_tree_precision)
print("Random Forest Precision:", random_forest_precision)
print("Decision Tree Recall:", decision_tree_recall)
print("Random Forest Recall:", random_forest_recall)
print("Decision Tree F1:", decision_tree_f1)
print("Random Forest F1:", random_forest_f1)

Decision Tree Accuracy: 0.9954954954954955
Random Forest Accuracy: 0.9977477477477478
Decision Tree Precision: 0.8571428571428571
Random Forest Precision: 1.0
Decision Tree Recall: 0.8571428571428571
Random Forest Recall: 0.8571428571428571
Decision Tree F1: 0.8571428571428571
Random Forest F1: 0.923076923076923


In [32]:
X

Unnamed: 0,price,rating,comment_count,per_user,free_version,free_trial,total_price
0,3.000,4.6,14,0,0,1,360.00
1,9.990,0.0,0,0,1,1,1198.80
2,10.000,4.3,193,1,0,1,1200.00
3,25.000,0.0,0,0,0,1,3000.00
4,6.990,5.0,2,1,1,1,838.80
...,...,...,...,...,...,...,...
2213,3.000,4.4,5,0,1,1,360.00
2214,40.000,4.5,45,1,0,0,4800.00
2215,16.350,3.7,27,0,1,1,1962.00
2216,12.000,4.1,13,0,0,1,1440.00


In [33]:
random_forest.predict(X[:4])

array([0, 0, 0, 0])

In [34]:
X.to_csv('X.csv')
y.to_csv('Y.csv')