## Importing libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

In [None]:
# Chrome start-up configuration: turn off the automation extension, drop the
# "enable-automation" switch, and ask the browser to open in fullscreen.
chrome_options = Options()
for option_name, option_value in (
    ("useAutomationExtension", False),
    ("excludeSwitches", ["enable-automation"]),
):
    chrome_options.add_experimental_option(option_name, option_value)
chrome_options.add_argument("--start-fullscreen")

In [None]:
# Silence every DeprecationWarning for the rest of the session.
# NOTE(review): this presumably hides Selenium's deprecation notices for the
# find_element(s)_by_* helpers used below — confirm before upgrading Selenium.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Opening Website using chrome

In [None]:
# Make sure the chromedriver binary is up to date with the installed Chrome.
path = "chromedriver.exe"
# Hand the configured chrome_options to the driver; without `options=` the
# settings built for this session would never reach the browser.
driver = webdriver.Chrome(path, options=chrome_options)
driver.get("https://marketplace.atlassian.com")
driver.maximize_window()

# Shared gesture builder bound to this browser session.
action = ActionChains(driver)

## Extracting a list of all categories

In [None]:
# Wait (up to 20 s) until at least one collapsible panel has rendered, so the
# [-1] lookup below cannot hit an empty list on a page that is still loading.
# A TimeoutException is raised if no panel ever appears.
list_of_categories = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'ReactCollapse--content'))
)

# Pick which panel to read: the first one when the last panel's text is
# shorter than 60 characters, otherwise the last one.
# NOTE(review): presumably a heuristic to find the panel that lists the
# categories — confirm against the current page layout.
c = 0 if len(list_of_categories[-1].text) < 60 else -1

In [None]:
# Each line of the selected panel's text is treated as one category name.
selected_panel = list_of_categories[c]
cat_list = selected_panel.text.split('\n')
cat_list

## Navigating through each category

In [None]:
# Extract the link of every app together with the category text shown on its card.
listOfLinks = []
category_lst = []

# Locators used on the listing page.
# NOTE(review): these look like generated CSS class names and may change when
# the site is redeployed — verify them before a new run.
MORE_RESULTS_LOCATOR = (By.CLASS_NAME, 'css-doguem')
APP_LINK_LOCATOR = (By.CSS_SELECTOR, 'a.e13wqmfi2.css-1uhn52o-linkCss-linkCss-HitLinkStyled-HitLinkStyled.e11ijshh0')
APP_CATEGORY_LOCATOR = (By.CLASS_NAME, "css-1hj6rt6-CategoriesStyled.ecweo2c0")

# implicitly_wait sets the session-wide element lookup timeout (it is not a
# pause), so configuring it once is enough.
driver.implicitly_wait(2)
wait = WebDriverWait(driver, 20)

for cat in cat_list:
    category_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, cat)))
    # Build a fresh ActionChains for every gesture: a reused chain keeps its
    # earlier moves queued and can replay them against elements that have gone
    # stale, which the loop below would misread as "no more results".
    ActionChains(driver).move_to_element(category_link).perform()
    category_link.click()

    # Keep pressing the "More results" button until it can no longer be
    # located or clicked, i.e. until the whole category is loaded.
    while True:
        try:
            more_button = wait.until(EC.presence_of_element_located(MORE_RESULTS_LOCATOR))
            ActionChains(driver).move_to_element(more_button).perform()
            more_button = wait.until(EC.element_to_be_clickable(MORE_RESULTS_LOCATOR))
            more_button.click()
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            break

    # Everything is loaded: harvest the app cards of this category.
    try:
        wait.until(EC.presence_of_element_located(APP_LINK_LOCATOR))
    except TimeoutException:
        # A category without any app card should not abort the whole crawl.
        warnings.warn(f"No apps found for category {cat!r}; skipping it.")
        driver.back()
        continue
    links = [card.get_property('href') for card in driver.find_elements(*APP_LINK_LOCATOR)]

    wait.until(EC.presence_of_element_located(APP_CATEGORY_LOCATOR))
    labels = [label.text for label in driver.find_elements(*APP_CATEGORY_LOCATOR)]

    # The two lists are paired by position later on, so report a mismatch
    # here, where the responsible category is still known.
    if len(links) != len(labels):
        warnings.warn(
            f"Category {cat!r}: found {len(links)} links but {len(labels)} category labels."
        )

    listOfLinks.extend(links)
    category_lst.extend(labels)

    driver.back()

In [None]:
# Pair every collected link with the category label collected at the same
# position; the two lists must have the same length.
df = pd.DataFrame().assign(Link=listOfLinks, Category=category_lst)
df

In [None]:
# A single app can be listed under several categories, so the same link may
# appear more than once. Keep only the first row seen for each link; the
# original index labels are retained.
is_repeated_link = df.duplicated(subset=['Link'])
df = df[~is_repeated_link]
df

## Extracting the app info

In [None]:
# Exceptions that mean "this field could not be read from the page".
SCRAPE_ERRORS = (TimeoutException, NoSuchElementException, StaleElementReferenceException)


def _text_or_nan(driver, class_name, timeout=30):
    """Return the text of the first element matching ``class_name``.

    Waits up to ``timeout`` seconds for the element to be present.
    Returns ``np.nan`` when the element cannot be found or read.
    """
    try:
        locator = (By.CLASS_NAME, class_name)
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(locator)
        ).text
    except SCRAPE_ERRORS:
        return np.nan


def _scrape_price(driver, timeout=30):
    """Return the price text of the app page currently loaded in ``driver``.

    Returns "Free" when the third listing tab is not labelled "Pricing",
    and ``np.nan`` when the tab or both price elements cannot be read.
    """
    wait = WebDriverWait(driver, timeout)
    try:
        tab = wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="app-listing-tabs-2"]'))
        )
        if tab.text != "Pricing":
            return "Free"
        tab.click()
        try:
            return wait.until(EC.presence_of_element_located(
                (By.CLASS_NAME, 'css-yfdo61-AppCostContent.eu58ut84'))).text
        except SCRAPE_ERRORS:
            # Fall back to the first entry of the pricing tier list.
            return wait.until(EC.presence_of_element_located(
                (By.CLASS_NAME, 'css-3fpr2v-PricingTierListItemContent.e1hpvo5f6'))).text
    except SCRAPE_ERRORS:
        return np.nan


all_details = []
# Iterate links and categories together: df keeps its original index labels
# after de-duplication, so looking categories up with a running counter would
# hit missing labels.
for page, category in tqdm(zip(df.Link, df.Category), total=len(df)):
    driver = webdriver.Chrome(path)
    try:
        driver.get(page)
        driver.maximize_window()
        driver.implicitly_wait(6)
        # Dict values are evaluated top to bottom, so the page is scraped in
        # this order: name, description, reviews, installs, price.
        all_details.append({
            'App_name': _text_or_nan(
                driver, "css-tz84ge-AppName-baseHeadingsCss-baseHeadingsCss.e1x0fti20"),
            'App_Description': _text_or_nan(
                driver, "css-29syww-H2-baseHeadingsCss-baseHeadingsCss.e1lut4zs2"),
            'Num_of_reviews': _text_or_nan(driver, "css-1lz6ltt-RatingsCount.e5bskoh1"),
            'Num_of_installs': _text_or_nan(driver, "css-1y9uv3s-InstallCountText.e5w14hq2"),
            'Price': _scrape_price(driver),
            'Category': category,
            'App_link': page,
        })
    finally:
        # Close this page's browser even when scraping fails; otherwise one
        # Chrome process per app would be left running.
        driver.quit()

In [None]:
# Display the raw scraped records (a list with one dict per visited app page).
all_details

## Converting the info to a dataframe

In [None]:
# Build a table from the scraped records: one row per dict, one column per key.
data = pd.DataFrame(all_details)
# Display the resulting table.
data

## Cleaning the data

In [None]:
# Display the column labels of the scraped table.
data.columns

In [None]:
# Order the rows by app URL (ascending).
data = data.sort_values(by='App_link')

In [None]:
# Write the table as it stands at this point to 'all apps.csv' in the working
# directory, without the index column.
data.to_csv('all apps.csv', index = False)

In [None]:
# Keep only the rows in which every field is present, then renumber the
# index from 0.
complete_rows = data.dropna()
data = complete_rows.reset_index(drop=True)

In [None]:
def _parse_install_count(text):
    """Convert a scraped install label into an integer count.

    Accepted shapes:
    - plain or comma-grouped numbers followed by "install", "installs" or
      "downloads", e.g. "1,234 installs" -> 1234;
    - a "k" suffix for thousands, including fractional values,
      e.g. "12k" -> 12000 and "1.5k" -> 1500;
    - labels containing "New!" or "Preinstalled", which count as 0.

    Raises ValueError for any other text.
    """
    # Check the non-numeric labels first: stripping "install" below would
    # otherwise turn "Preinstalled" into "Preed".
    if 'New!' in text or 'Preinstalled' in text:
        return 0
    value = text
    for unit in ('installs', 'install', 'downloads'):
        value = value.replace(unit, '')
    value = value.replace(',', '').strip()
    if value[-1:] in ('k', 'K'):
        # round() guards against float artefacts such as 2.3 * 1000.
        return int(round(float(value[:-1]) * 1000))
    return int(value)


# Clean the Num_of_installs column: turn the scraped labels into integers.
data['Num_of_installs'] = data.Num_of_installs.map(_parse_install_count).astype(int)
data

In [None]:
def _parse_review_count(text):
    """Convert a scraped review-count label into an integer.

    Accepts plain or comma-grouped numbers ("1,234" -> 1234) and a "k"
    suffix for thousands, including fractional values ("12k" -> 12000,
    "1.5k" -> 1500). Raises ValueError for any other text.
    """
    value = text.replace(',', '').strip()
    if value[-1:] in ('k', 'K'):
        # round() guards against float artefacts such as 2.3 * 1000.
        return int(round(float(value[:-1]) * 1000))
    return int(value)


# Clean the Num_of_reviews column: turn the scraped labels into integers.
data['Num_of_reviews'] = data.Num_of_reviews.map(_parse_review_count).astype(int)

In [None]:
# Flatten multi-line price text onto one line by turning each newline into a space.
price_text = data['Price'].str
data['Price'] = price_text.replace('\n', ' ')

## Saving dataframe to csv file

In [None]:
# Write the cleaned table to 'all apps - cleaned.csv' in the working
# directory, without the index column.
data.to_csv('all apps - cleaned.csv', index = False)

## Extracting Time tracking applications

In [None]:
# Select the apps whose category text contains the literal phrase
# 'Time tracking', and renumber the resulting rows from 0.
is_time_tracking = data['Category'].str.contains('Time tracking', regex=False)
time_tracking = data[is_time_tracking].reset_index(drop=True)

In [None]:
# Display the column labels of the time-tracking subset.
time_tracking.columns

In [None]:
# matplotlib is imported here, at its first use.
import matplotlib.pyplot as plt
# Line plot of the install counts of the time-tracking apps against their row index.
plt.plot(time_tracking['Num_of_installs'])

In [None]:
# Box plot of the install counts of all time-tracking apps.
plt.boxplot(time_tracking['Num_of_installs'].values)

In [None]:
# Restrict the time-tracking apps to those with fewer than 175 installs.
has_few_installs = time_tracking['Num_of_installs'] < 175
x = time_tracking[has_few_installs]

In [None]:
# Box plot of the install counts in the filtered subset `x`.
plt.boxplot(x.Num_of_installs)

In [None]:
# Histogram of the install counts in `x`: 10 bins, drawn as an unfilled outline.
plt.hist(x.Num_of_installs,bins = 10,histtype='step')
# Render the figure.
plt.show()