
# Semantic Search
## Part I - Data Collection
Query the wikipedia API and collect all of the articles under the following wikipedia categories:
## Machine Learning
## Business Software
    
    
The code should be modular enough that any valid category from Wikipedia can be queried by the code.
The results of the query should be written to PostgreSQL tables, page and category. Build some sort of reference between the pages and categories. (Keep in mind that a page can have many categories and a category can have many pages so a straight foreign key arrangement will not work.)

In [1]:
#collecting wikipedia data
!pip install wikipedia



In [2]:
!pip install psycopg2




In [3]:
import re
import requests
import pandas as pd
import wikipedia
import psycopg2 as pg2
from psycopg2.extras import RealDictCursor
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

#import the essential tools for lsa
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

from sqlalchemy import create_engine

In [4]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category


In [5]:
#to get the categories and category types

def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = """
            http://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}&
            cmprop=ids%7Ctitle%7Ctype&
            cmlimit=max
            """.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [6]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [7]:
#to get the category id for machine learning 
ML_category = wikipedia.page('Category:Machine learning')

In [8]:
ML_page = wikipedia.page('Machine learning')


In [9]:
print(ML_category.pageid)
print(ML_page.pageid)

706543
233488


In [10]:
ML_query = generate_query("machine learning")
ML_query

'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:machine+learning&cmprop=ids%7Ctitle%7Ctype&cmlimit=max'

In [11]:
wikipedia.page("Business_Software").pageid


'1037763'

In [12]:
wikipedia.page("Category:Business_Software").pageid

'1744470'

In [13]:
ML_df = execute_category_query("machine learning")
ML_df.head()

Unnamed: 0,ns,pageid,title,type
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,page
1,0,43385931,Data exploration,page
2,0,49082762,List of datasets for machine-learning research,page
3,0,233488,Machine learning,page
4,0,53587467,Outline of machine learning,page


In [14]:
#mask where title has categories in it.
category_mask = ML_df["title"].str.contains("Category:")

In [15]:
ML_df[category_mask].head()


Unnamed: 0,ns,pageid,title,type
200,14,33547387,Category:Applied machine learning,subcat
201,14,42936114,Category:Artificial neural networks,subcat
202,14,1718975,Category:Bayesian networks,subcat
203,14,1991254,Category:Classification algorithms,subcat
204,14,22532673,Category:Cluster analysis,subcat


In [16]:
ML_category = ML_df[category_mask]

In [17]:
ML_category.shape

(30, 4)

In [18]:
ML_category["category"] = "Machine_Learning"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
ML_category.head()

Unnamed: 0,ns,pageid,title,type,category
200,14,33547387,Category:Applied machine learning,subcat,Machine_Learning
201,14,42936114,Category:Artificial neural networks,subcat,Machine_Learning
202,14,1718975,Category:Bayesian networks,subcat,Machine_Learning
203,14,1991254,Category:Classification algorithms,subcat,Machine_Learning
204,14,22532673,Category:Cluster analysis,subcat,Machine_Learning


In [21]:
ML_category = ML_category[["title","category"]]

In [22]:
ML_category.head()


Unnamed: 0,title,category
200,Category:Applied machine learning,Machine_Learning
201,Category:Artificial neural networks,Machine_Learning
202,Category:Bayesian networks,Machine_Learning
203,Category:Classification algorithms,Machine_Learning
204,Category:Cluster analysis,Machine_Learning


In [23]:
def remove_category(category):
    category = re.sub('Category:','',category)
    return category

In [24]:
ML_df[category_mask].head()["title"].apply(remove_category).tolist()


['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis']

In [25]:
subcat_df_list_ML = []


In [26]:
categories_to_query = ML_df[category_mask]['title'].apply(remove_category).tolist()


In [27]:
categories_to_query

['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis',
 'Computational learning theory',
 'Artificial intelligence conferences',
 'Data mining and machine learning software',
 'Datasets in machine learning',
 'Dimension reduction',
 'Ensemble learning',
 'Evolutionary algorithms',
 'Genetic programming',
 'Inductive logic programming',
 'Kernel methods for machine learning',
 'Latent variable models',
 'Learning in computer vision',
 'Log-linear models',
 'Loss functions',
 'Machine learning algorithms',
 'Machine learning portal',
 'Machine learning task',
 'Markov models',
 'Machine learning researchers',
 'Semisupervised learning',
 'Statistical natural language processing',
 'Structured prediction',
 'Supervised learning',
 'Support vector machines',
 'Unsupervised learning']

In [28]:
for category in categories_to_query:
    subcat_df_list_ML.append(execute_category_query(category))

In [29]:
subcat_df_list_ML[:5]


[    ns    pageid                                   title  type
 0    0  15795950                    Activity recognition  page
 1    0  41916168                              AlchemyAPI  page
 2    0  55075082                                   BigDL  page
 3    0  53631046                        Caffe (software)  page
 4    0  49119569    Comparison of deep learning software  page
 5    0  41916447                                 Cortica  page
 6    0  34529351                      DARPA LAGR Program  page
 7    0  43169442                          Deeplearning4j  page
 8    0  38818825                                 Diffbot  page
 9    0  41184517                            Google Brain  page
 10   0  46222904                         Intel RealSense  page
 11   0  35456221                                 IRCF360  page
 12   0    705605                             Jabberwacky  page
 13   0  31663650                                  Kaggle  page
 14   0  51650259                       

In [30]:
len(subcat_df_list_ML)

30

In [31]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()
    if len(categories) > 0:
        for cat in categories:
            pages_list.append(get_all_pages_rec(cat))
    
    pages_df = pd.concat(pages_list)
    pages_df.reset_index()
    return pages_df

In [32]:
rec_ML_df = get_all_pages_rec("machine learning")
rec_ML_df.reset_index(drop =True).shape

(1612, 4)

In [33]:
rec_ML_df.sample(5)


Unnamed: 0,ns,pageid,title,type
64,0,7309022,Nearest neighbor search,page
2,0,1145683,Raymond Cattell,page
12,0,4605351,Latent Dirichlet allocation,page
138,0,42129549,OpenNN,page
69,0,1368665,Bernhard Schölkopf,page


In [34]:
rec_ML_df.reset_index(drop =True).drop_duplicates()[:5]


Unnamed: 0,ns,pageid,title,type
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,page
1,0,43385931,Data exploration,page
2,0,49082762,List of datasets for machine-learning research,page
3,0,233488,Machine learning,page
4,0,53587467,Outline of machine learning,page


In [35]:
ML_df[~category_mask].head()


Unnamed: 0,ns,pageid,title,type
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,page
1,0,43385931,Data exploration,page
2,0,49082762,List of datasets for machine-learning research,page
3,0,233488,Machine learning,page
4,0,53587467,Outline of machine learning,page


In [36]:
def get_whole_category(category):
    df = get_all_pages_rec(category)
    df = df.drop_duplicates().reset_index(drop=True)
    df['category'] = category
    return df

In [37]:
gwc_ML_df = get_whole_category("machine learning")


In [38]:
gwc_ML_df.shape


(1107, 5)

In [39]:
gwc_ML_df.head()


Unnamed: 0,ns,pageid,title,type,category
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,page,machine learning
1,0,43385931,Data exploration,page,machine learning
2,0,49082762,List of datasets for machine-learning research,page,machine learning
3,0,233488,Machine learning,page,machine learning
4,0,53587467,Outline of machine learning,page,machine learning


In [40]:
content_ML = []


In [41]:
#takes so long
not_found = []

for pageid in gwc_ML_df["pageid"]:
    try:
        page =  wikipedia.WikipediaPage(pageid = pageid)
        content = page.content
        title = page.title
        content_ML.append((pageid, content, title))
        #print(content_ml)
    except:
        not_found.append(pageid)

In [42]:
len(content_ML)
#why the length is so long?



1102

In [43]:
content_ML_df = pd.DataFrame(content_ML, columns= ("pageid","content","title"))


In [44]:
content_ML_df["category"] = "Machine_Learning"
content_ML_df.head()

Unnamed: 0,pageid,content,title,category
0,54972729,\n= Customer Intelligence Management =\n\n\n==...,User:CustIntelMngt/sandbox/Customer Intelligen...,Machine_Learning
1,43385931,Data exploration is an approach similar to ini...,Data exploration,Machine_Learning
2,49082762,These datasets are used for machine-learning r...,List of datasets for machine-learning research,Machine_Learning
3,233488,Machine learning is the subfield of computer s...,Machine learning,Machine_Learning
4,53587467,The following outline is provided as an overvi...,Outline of machine learning,Machine_Learning


In [45]:
content_ML_df.shape

(1102, 4)

In [46]:
gwc_ML_df.shape


(1107, 5)

In [47]:
#writing the dataframe into a csv file, which makes a .csv in jupyter notebook
#make sure that the path you are using in sql is the same in your notebook

content_ML_df.to_csv("machine_learning.csv", index= False)

In [48]:
!pwd

/home/jovyan/ipynb


In [49]:
!ls

machine_learning.csv	postgres-jupyter .ipynb
Part-1-project-4.ipynb	wiki_query_business_software.ipynb
part-2-project-4.ipynb	wiki-query_machine_learning .ipynb
path


## Business software¶

In [50]:
BS_query = generate_query("business software")
BS_query

'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category:business+software&cmprop=ids%7Ctitle%7Ctype&cmlimit=max'

In [51]:
BS_df = execute_category_query("business software")
BS_df.tail()

Unnamed: 0,ns,pageid,title,type
323,14,26651713,Category:Tax software,subcat
324,14,53417207,Category:Telecommunications Billing Systems,subcat
325,14,9186941,Category:Workflow technology,subcat
326,14,23739219,Category:Industry-specific XML-based standards,subcat
327,14,9622164,Category:Business software stubs,subcat


In [52]:
wikipedia.page("Category:Business_Software").pageid


'1744470'

In [53]:
category_mask_BS = BS_df["title"].str.contains("Category:")


In [54]:
BS_df[category_mask_BS].head()


Unnamed: 0,ns,pageid,title,type
296,14,20727104,Category:Administrative software,subcat
297,14,17799618,Category:Business simulation games,subcat
298,14,14541812,Category:Business software companies,subcat
299,14,37041135,Category:Business software for Linux,subcat
300,14,41517954,Category:Business software for MacOS,subcat


In [55]:
BS_df.shape


(328, 4)

In [57]:
BS_categories_to_query = BS_df[~category_mask_BS]['title'].apply(remove_category)
BS_categories_to_query

0                                      Business software
1                                            AccuSystems
2                               Active policy management
3                          Alexandria (library software)
4                                                Alteryx
5                                            Amadeus CRS
6                                     AMS Device Manager
7                                     Angelfish software
8                              Applicant tracking system
9                                 Application retirement
10     Architecture of Interoperable Information Systems
11                               Asset recovery software
12                                                Avaloq
13                                           Axess (CRS)
14                                                Ayasdi
15                                    Balanced scorecard
16                                  BatchMaster Software
17                             

In [58]:
BS_df[~category_mask_BS].shape

(296, 4)

In [59]:
BS_pages_df = BS_df[~category_mask_BS]
BS_pages_df.head()

Unnamed: 0,ns,pageid,title,type
0,0,1037763,Business software,page
1,0,41270069,AccuSystems,page
2,0,5211212,Active policy management,page
3,0,28502793,Alexandria (library software),page
4,0,44133735,Alteryx,page


In [None]:
# 

In [60]:
content_BS_0= []


In [61]:
not_found = []

for pageid in BS_df[~category_mask_BS]["pageid"]:
    try:
        page =  wikipedia.WikipediaPage(pageid = pageid)
        content = page.content
        title = page.title
        content_BS_0.append((pageid, content, title))
        #print(content_bs_0)
    except:
        not_found.append(pageid)

In [62]:
content_BS_0[:5]


[(1037763,
  "Business software or a business application is any software or set of computer programs used by business users to perform various business functions. These business applications are used to increase productivity, to measure productivity and to perform other business functions accurately.\nBy and large, business software is likely to be developed to meet the needs of a specific business, and therefore is not easily transferable to a different business environment, unless its nature and operation is identical. Due to the unique requirements of each business, off-the-shelf software is unlikely to completely address a company's needs. However, where an on-the-shelf solution is necessary, due to time or monetary considerations, some level of customization is likely to be required. Exceptions do exist, depending on the business in question, and thorough research is always required before committing to bespoke or off-the-shelf solutions.\nSome business applications are interacti

In [63]:
content_BS_df_0 = pd.DataFrame(content_BS_0, columns= ("pageid","content","title"))


In [64]:
content_BS_df_0.head()


Unnamed: 0,pageid,content,title
0,1037763,Business software or a business application is...,Business software
1,41270069,AccuSystems LLC is an American company headqua...,AccuSystems
2,5211212,Active policy management is business-oriented ...,Active policy management
3,28502793,Alexandria is browser based cross-platform lib...,Alexandria (library software)
4,44133735,Alteryx is an American computer software compa...,Alteryx


In [65]:
BS_categories_list = BS_df[category_mask_BS]['title'].str.replace("Category:", "").tolist()


In [66]:
BS_df[category_mask_BS].shape


(32, 4)

In [67]:
len(BS_categories_list)


32

In [68]:
cat_df_BS = pd.DataFrame(columns=BS_df.columns)

if len(BS_categories_list) > 0:
    for cat in BS_categories_list:
        cat_list_BS = execute_category_query(cat)
         
        cat_df_BS = cat_df_BS.append(cat_list_BS)

In [69]:
cat_df_BS.shape


(1544, 4)

In [70]:
cat_df_BS = cat_df_BS.drop_duplicates().reset_index(drop=True)


In [71]:
cat_df_BS.shape


(1396, 4)

In [72]:
cat_df_BS["ns"].value_counts()


0.0      1311
14.0       77
10.0        3
2.0         3
118.0       2
Name: ns, dtype: int64

In [73]:
cat_df_BS["ns"] = cat_df_BS["ns"].astype(int)


In [74]:
cat_df_BS["pageid"] = cat_df_BS["pageid"].astype(int)


In [75]:
cat_df_BS.head()


Unnamed: 0,ns,pageid,title,type
0,0,26722741,1DayLater,page
1,0,11595731,Act! CRM,page
2,0,3277841,Appointment scheduling software,page
3,0,13589812,Child care management software,page
4,0,4102341,Church software,page


In [76]:
cat_df_BS["ns"].value_counts()

0      1311
14       77
10        3
2         3
118       2
Name: ns, dtype: int64

In [77]:
category_mask_BS1 = cat_df_BS["title"].str.contains("Category:")


In [78]:
cat_df_BS[category_mask_BS1].head()


Unnamed: 0,ns,pageid,title,type
20,14,22516736,Category:School-administration software,subcat
21,14,44406061,Category:Time-tracking software,subcat
210,14,27721340,Category:Amusement park simulation games,subcat
211,14,44922677,Category:M.U.L.E.,subcat
212,14,13216268,Category:Roller coaster games and simulations,subcat


In [79]:
BS_category2 = cat_df_BS[category_mask_BS1]
BS_category2.head()

Unnamed: 0,ns,pageid,title,type
20,14,22516736,Category:School-administration software,subcat
21,14,44406061,Category:Time-tracking software,subcat
210,14,27721340,Category:Amusement park simulation games,subcat
211,14,44922677,Category:M.U.L.E.,subcat
212,14,13216268,Category:Roller coaster games and simulations,subcat


In [80]:
categories_BS_page = cat_df_BS[~category_mask_BS1]['title'].apply(remove_category)


In [81]:
cat_df_BS[~category_mask_BS1].shape


(1319, 4)

In [82]:
BS_pages_df1 = cat_df_BS[~category_mask_BS1]
BS_pages_df1.head()

Unnamed: 0,ns,pageid,title,type
0,0,26722741,1DayLater,page
1,0,11595731,Act! CRM,page
2,0,3277841,Appointment scheduling software,page
3,0,13589812,Child care management software,page
4,0,4102341,Church software,page


In [83]:
content_BS = []


In [84]:
not_found = []

for pageid in cat_df_BS[~category_mask_BS1]["pageid"]:
    try:
        page =  wikipedia.WikipediaPage(pageid = pageid)
        content = page.content
        title = page.title
        content_BS.append((pageid, content, title))
        
    except:
        not_found.append(pageid)

In [85]:
content_BS[:2]

[(26722741,
  '1DayLater was free, web-based software that was focused on professional invoicing and collaborative working. It was aimed at companies of any size; from solo workers to large business enterprises. The company was closed and customer data returned in October 2013.\nThe main function of 1DayLater was to help users create Invoices for clients. It could also be used to build Quotes and Estimates, to track time and other expenses, work to budgets, and to track projects. Multiple users could simultaneously work on the same projects together.\nThe software was developed by two brothers, Paul and David King; after they experienced similar frustrations while working freelance, the brothers wanted to create a product that would let them track time, expenses and business miles in a single online location.\nPCmag voted 1DayLater as one of the \'Best Free Software of 2010\' \n\n\n== History ==\n1DayLater Ltd was formed in 2009 and had one commercial product of the same name. It remai

In [86]:
content_BS_df = pd.DataFrame(content_BS, columns= ("pageid","content","title"))


In [87]:
content_BS_df.head()


Unnamed: 0,pageid,content,title
0,26722741,"1DayLater was free, web-based software that wa...",1DayLater
1,11595731,Act! (previously known as Sage ACT! 2010–2013)...,Act! CRM
2,3277841,Appointment scheduling software allows busines...,Appointment scheduling software
3,13589812,Child care management software also referred t...,Child care management software
4,4102341,Church software is any type of computer softwa...,Church software


In [88]:
all_contents_df = content_BS_df_0.append(content_BS_df, ignore_index= True)


In [89]:
all_contents_df.shape
#why this is dublicate?

(1607, 3)

In [90]:
BS_sub_categories_list = cat_df_BS[category_mask_BS1]["title"].str.replace("Category:", "").tolist()


In [91]:
len(BS_sub_categories_list)
#why the lenth is shorter?

77

In [92]:
sub_cat_df = pd.DataFrame(columns=cat_df_BS.columns)

if len(BS_sub_categories_list) > 0:
    for cat in BS_sub_categories_list:
        sub_cat_list_BS = execute_category_query(cat)
        
        sub_cat_df = sub_cat_df.append(sub_cat_list_BS)

In [93]:
sub_cat_df.shape


(2324, 4)

In [94]:
sub_cat_df = sub_cat_df.drop_duplicates().reset_index(drop=True)


In [95]:
sub_cat_df.shape


(1966, 4)

In [96]:
sub_cat_df["ns"].value_counts()


0.0      1820
14.0      132
2.0         5
6.0         4
10.0        4
118.0       1
Name: ns, dtype: int64

In [97]:
category_mask_BS2 = sub_cat_df["title"].str.contains("Category:")


In [98]:
BS_category3 = sub_cat_df[category_mask_BS2]
BS_category3.head()

Unnamed: 0,ns,pageid,title,type
112,14.0,34299255.0,Category:Association football management video...,subcat
113,14.0,20099866.0,Category:Sports management MMORPGs,subcat
114,14.0,19738196.0,Category:Sports Mogul,subcat
151,14.0,14421425.0,Category:Space trading and combat simulators,subcat
234,14.0,26988907.0,Category:Banking software companies,subcat


In [99]:
sub_categories_BS_page = sub_cat_df[~category_mask_BS2]['title'].str.replace("Category:","")


In [100]:
sub_cat_df[~category_mask_BS2].shape

(1834, 4)

In [101]:
BS_pages_df2 = sub_cat_df[~category_mask_BS2]
BS_pages_df2.head()

Unnamed: 0,ns,pageid,title,type
0,0.0,15549320.0,Automate the Schools,page
1,0.0,4926702.0,BCeSIS,page
2,0.0,40247755.0,Education Management Information System,page
3,0.0,20515903.0,Electronic grade book,page
4,0.0,24810600.0,Fedena,page


In [102]:
content_BS_1 =[]


In [103]:
not_found = []

for pageid in sub_cat_df[~category_mask_BS2]["pageid"]:
    try:
        page =  wikipedia.WikipediaPage(pageid = pageid)
        content = page.content
        title = page.title
        content_BS_1.append((pageid, content, title))
        
    except:
        not_found.append(pageid)

In [104]:
content_BS_1[:5]

[(15549320.0,
  "Automate The Schools (ATS) is the school-based administrative system used by all New York City public schools since 1988. It has many functions, including recording biographical data for all students, handling admissions, discharges, and transfers to other schools, and recording other student-specific data, such as exam scores, grade levels, attendance, and immunization records. It also provides aggregate student and human resources data to school administrators.\nAccess to the ATS system is strictly limited to school system personnel; however, much of the non-personally identifiable information is available online at the New York City Department of Education website.\n\n\n== Technical details ==\nThe software was written in under six months using the Computer Corporation of America's Model 204 database management software.\n\n\n== References ==\n\n\n== External links ==\nNew York City Department of Education website, which contains much of the non-personally identifia

In [111]:
content_BS_df_1 = pd.DataFrame(content_BS_1, columns= ("pageid","content","title"))


In [112]:
content_BS_df_1.head()


Unnamed: 0,pageid,content,title
0,15549320.0,Automate The Schools (ATS) is the school-based...,Automate the Schools
1,4926702.0,BCeSIS (the British Columbia Enterprise Studen...,BCeSIS
2,40247755.0,An Educational Management Information System (...,Education Management Information System
3,20515903.0,An electronic grade book is a teacher's online...,Electronic grade book
4,24810600.0,Fedena is an open source school management sof...,Fedena


In [114]:
all_contents_df = all_contents_df.append(content_BS_df_1, ignore_index= True)


In [115]:
all_contents_df.shape


(3429, 3)

In [116]:
all_contents_df["category"] = "Business_Software"


In [117]:
all_contents_df.head()


Unnamed: 0,pageid,content,title,category
0,1037763.0,Business software or a business application is...,Business software,Business_Software
1,41270069.0,AccuSystems LLC is an American company headqua...,AccuSystems,Business_Software
2,5211212.0,Active policy management is business-oriented ...,Active policy management,Business_Software
3,28502793.0,Alexandria is browser based cross-platform lib...,Alexandria (library software),Business_Software
4,44133735.0,Alteryx is an American computer software compa...,Alteryx,Business_Software


In [118]:
all_contents_df.shape


(3429, 4)

In [119]:
all_contents_df_drop= all_contents_df.drop_duplicates(subset=["pageid"], keep="first")


In [120]:
all_contents_df_drop.shape


(3028, 4)

In [121]:
all_contents_df_drop.head()


Unnamed: 0,pageid,content,title,category
0,1037763.0,Business software or a business application is...,Business software,Business_Software
1,41270069.0,AccuSystems LLC is an American company headqua...,AccuSystems,Business_Software
2,5211212.0,Active policy management is business-oriented ...,Active policy management,Business_Software
3,28502793.0,Alexandria is browser based cross-platform lib...,Alexandria (library software),Business_Software
4,44133735.0,Alteryx is an American computer software compa...,Alteryx,Business_Software


In [122]:
all_contents_df = all_contents_df_drop


In [123]:
all_contents_df.shape


(3028, 4)

In [127]:
all_contents_df2 = all_contents_df.append(content_ML_df, ignore_index= True)


In [128]:
all_contents_df2.shape


(4130, 4)

In [129]:
all_contents_df2["pageid"] = all_contents_df2["pageid"].astype(int)


In [130]:
all_contents_df2.head()


Unnamed: 0,pageid,content,title,category
0,1037763,Business software or a business application is...,Business software,Business_Software
1,41270069,AccuSystems LLC is an American company headqua...,AccuSystems,Business_Software
2,5211212,Active policy management is business-oriented ...,Active policy management,Business_Software
3,28502793,Alexandria is browser based cross-platform lib...,Alexandria (library software),Business_Software
4,44133735,Alteryx is an American computer software compa...,Alteryx,Business_Software


In [131]:
all_BS_category1 = BS_category1.append(BS_category2)
all_BS_category1 = all_BS_category1.append(BS_category3)
all_BS_category1["category"] = "Business_Software"

In [132]:
all_BS_category1 = all_BS_category1[["title","category"]]
all_BS_category1.head()

Unnamed: 0,title,category
296,Category:Administrative software,Business_Software
297,Category:Business simulation games,Business_Software
298,Category:Business software companies,Business_Software
299,Category:Business software for Linux,Business_Software
300,Category:Business software for MacOS,Business_Software


In [133]:
ML_category.head()


Unnamed: 0,title,category
200,Category:Applied machine learning,Machine_Learning
201,Category:Artificial neural networks,Machine_Learning
202,Category:Bayesian networks,Machine_Learning
203,Category:Classification algorithms,Machine_Learning
204,Category:Cluster analysis,Machine_Learning


In [134]:
all_categories_1 =  all_BS_category1.append(ML_category ,ignore_index= True)
all_categories_1.shape

(271, 2)

In [135]:
all_categories_1.head()


Unnamed: 0,title,category
0,Category:Administrative software,Business_Software
1,Category:Business simulation games,Business_Software
2,Category:Business software companies,Business_Software
3,Category:Business software for Linux,Business_Software
4,Category:Business software for MacOS,Business_Software


## Contents .csv

In [136]:
all_contents_df2.to_csv("contents.csv", index= False)
