In [None]:
# Data source: https://dumps.wikimedia.org/tcywiki/20230320/ -- download the dump files first, then replace the file paths used in this notebook accordingly.

import pandas as pd

In [None]:
def contains_only_ascii(text):
    """Report whether every character of ``text`` is an ASCII character.

    Args:
        text: An iterable of single characters (typically a ``str``).

    Returns:
        ``True`` when all characters have a code point below 128 (this
        includes the empty string), ``False`` as soon as one does not.
    """
    for char in text:
        if ord(char) >= 128:
            return False
    return True

In [None]:
# cl_from: Stores the page.page_id of the article where the link was placed.

# cl_to: Stores the name of the desired category in the page_title format (that is, with _ and excluding namespace prefix).

# cl_sortkey: Stores the title by which the page should be sorted in a category list.
# This is the binary sortkey, that depending on $wgCategoryCollation may or may not be readable by a human
# (but should sort in correct order when comparing as a byte string), and is not valid UTF-8 whenever the database truncates the sortkey in the middle of a multi-byte sequence.

# cl_timestamp: Stores the time at which that link was last updated in the table.

# cl_sortkey_prefix: This is either the empty string if a page is using the default sortkey (aka the sortkey is unspecified).

# Otherwise it is the human readable version of cl_sortkey. Needed mostly so that cl_sortkey can be easily updated in certain situations without re-parsing the entire page.
# More recently added values are valid UTF-8 (see change 449280 on Gerrit).

# cl_collation: What collation is in use. Used so that if the collation changes, the updateCollation.php script knows what rows need to be fixed in db.

# cl_type: What type of page is this (file, subcat (subcategory) or page (normal page)). Used so that the different sections on a category page
# can be paged independently in an efficient manner.




# CL.CL_FROM -> PAGE.PAGE_ID -> PAGE_TITLE where PAGE_NAMESPACE is 0
# Load the categorylinks table; rows that pandas cannot parse are skipped
# silently (on_bad_lines='skip'), so the frame may hold fewer rows than the file.
cl_df = pd.read_csv('/content/drive/MyDrive/Research/tcy_wiki/categorylinks.csv', on_bad_lines='skip')

# Keep only the links whose category name (cl_to) has at least one non-ASCII
# character, i.e. drop the categories named purely in ASCII.
is_ascii_only = cl_df['cl_to'].apply(contains_only_ascii).astype(bool)
cl_df = cl_df.loc[~is_ascii_only].drop(
    columns=['cl_sortkey', 'cl_sortkey_prefix', 'cl_timestamp', 'cl_collation']
)
cl_df

Unnamed: 0,cl_from,cl_to,cl_type
1,6,ಊರುಲು,page
2,6,ಜಿಲ್ಲೆಲು,page
3,6,ತುಳುನಾಡ್,page
17,24,ಟೆಂಪ್ಲೇಟುಲು,page
20,78,ವಿಜ್ಞಾನ,subcat
...,...,...,...
8690,13806,CS1_ಇಂಗ್ಲಿಷ್-language_sources_(en),page
8691,13806,ಪೊಂಜೊವುಲು,page
8692,13806,ವ್ಯಕ್ತಿಲು,page
8693,13807,ಸ್ತ್ರೀವಾದೊ_ಬುಕ್ಕೊ_ಜಾನಪದೊ_2023,page


In [None]:
# Columns of the page table that are discarded right after loading.
dropped_page_columns = [
    'page_is_new', 'page_random', 'page_touched', 'page_links_updated',
    'page_latest', 'page_len', 'page_content_model', 'page_lang',
]

# Rows that pandas cannot parse are skipped silently (on_bad_lines='skip').
page_df = pd.read_csv('/content/drive/MyDrive/Research/tcy_wiki/page.csv', on_bad_lines='skip')

# Keep only the pages in namespace 0 and remove the columns listed above.
in_namespace_zero = page_df['page_namespace'] == 0
page_df = page_df.loc[in_namespace_zero].drop(columns=dropped_page_columns)
page_df

Unnamed: 0,page_id,page_namespace,page_title,page_is_redirect
0,1,0,ಮುಖ್ಯ_ಪುಟ,0
5,6,0,ಕಾಸರಗೋಡ್,0
86,92,0,ತೊಪ್ಪುಲು,0
168,3572,0,'ವಿಕಿಪೀಡಿಯ:ಚಾವಡಿ'/ತುಳು_ವಿಕಿಪೀಡಿಯ_ಕಜ್ಜಕೊಟ್ಯ,1
169,3573,0,.ಅಂಬೇಡ್ಕರ್,1
...,...,...,...,...
9766,13791,0,ಪ್ರಮೋದಾ_ಕೆ_ಸುವರ್ಣ,0
9773,13798,0,ಸುಕೀರ್ತಿ_ಕಾಂಡ್ಪಾಲ್,0
9777,13802,0,ರತಿ_ಪಾಂಡೆ,0
9780,13805,0,ಜನ್ನತ್_ಜುಬೈರ್_ರಹಮಾನಿ,0


In [None]:
# List the distinct link types present in the filtered category links.
print(cl_df['cl_type'].unique())

['page' 'subcat' 'file']


In [None]:
# Inspect the category links whose type is 'file'.
is_file_link = cl_df['cl_type'] == 'file'
cl_df.loc[is_file_link]

Unnamed: 0,cl_from,cl_to,cl_type
8414,13424,ತುಳು_ಚಿತ್ರ_ನಟೆರ್,file
8415,13424,ತುಳು_ಸಿನಿಮೊ,file
8416,13424,ತುಳುವೆರ್,file


In [None]:
# Attach page metadata to every category link: cl_from stores the page_id of
# the page on which the link was placed.
#
# An inner join is used on purpose. Links whose source page is absent from
# page_df are discarded below anyway, and a left join would first fill those
# rows with NaN, which upcasts the integer page columns (e.g. page_is_redirect)
# to float and writes values such as 0.0 instead of 0 to title.csv.
merged_df = pd.merge(cl_df, page_df, left_on='cl_from', right_on='page_id', how='inner')

# Keep only pages in namespace 0, even if page_df was not filtered beforehand.
merged_df = merged_df[merged_df['page_namespace'] == 0]

# page_id duplicates cl_from and page_namespace is constant at this point.
title_df = merged_df.drop(['page_id', 'page_namespace'], axis=1)

# Persist the link -> page title table (written as UTF-16, index included).
title_df.to_csv('title.csv', encoding='utf-16')

In [None]:
# https://github.com/attardi/wikiextractor
!pip install wikiextractor
!python -m wikiextractor.WikiExtractor "/content/drive/MyDrive/Research/tcy_wiki/tcywiki-20230320-pages-articles.xml.bz2"

INFO: Preprocessing '/content/drive/MyDrive/Research/tcy_wiki/tcywiki-20230320-pages-articles.xml.bz2' to collect template definitions: this may take some time.
INFO: Loaded 1828 templates in 3.3s
INFO: Starting page extraction from /content/drive/MyDrive/Research/tcy_wiki/tcywiki-20230320-pages-articles.xml.bz2.
INFO: Using 1 extract processes.
INFO: Finished 1-process extraction of 2360 articles in 3.9s (608.2 art/s)


In [None]:
import xml.etree.ElementTree as ET
import os
import pandas as pd

# Directory containing the extracted article files that are parsed below.
directory = '/content/text/AA'

# os.listdir returns names in arbitrary order; sorting makes the row order of
# article_df reproducible from one run to the next.
file_names = sorted(os.listdir(directory))

# One [title, text] pair per <doc> element found in the files.
all_results = []

for file_name in file_names:
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the extractor wrote the files as UTF-8 -- confirm.
    with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
        content = file.read()

    # Files that do not begin with a <doc ...> element are skipped.
    if not content.startswith('<doc'):
        continue

    # A file holds several sibling <doc> elements; wrap them in a single root
    # element so the content can be parsed as one XML document.
    root = ET.fromstring(f'<root>{content}</root>')

    for doc_element in root.findall('doc'):
        title = doc_element.get('title')
        # Element.text is None for an element without text content; treat
        # that as empty text instead of failing on None.strip().
        innertext = (doc_element.text or '').strip()
        all_results.append([title, innertext])

article_df = pd.DataFrame(all_results, columns=['title', 'text'])

article_df

Unnamed: 0,title,text
0,ಗದಗ್,ಗದಗ್\n\nಗದಗ ಕರ್ನಾಟಕ ರಾಜ್ಯತ ಒಂಜಿ ಜಿಲ್ಲೆ. ಉ೦ದು ಪ...
1,ಗರ್ಗೊ,ಗರ್ಗೊ\n\n \nಗರ್ಗೊ ಪನ್ಪುನವು ಒಂಜಿ ಎಲ್ಯ ಮರ್ದ್ ದಯಿ...
2,ಗಿಡಿ,ಗಿಡಿ\n\nಗರುಡ (ಹಕ್ಕಿ) ಪಕ್ಕಿಲೆಡೇ ಒಂಜಿ ಮಲ್ಲ ಪಕ್ಕಿ...
3,ಗರುಢ ಪಕ್ಕಿ,ಗರುಢ ಪಕ್ಕಿ
4,ಗಾಣ,"ಗಾಣ\n\nಗಾಣ ಪಂಡ್ಂಡ ತಾರಾಯಿ, ಎನ್ಮೆ, ಕಡ್ಲೆ ಇಂಚಿತ್ತ..."
...,...,...
2354,ಬಲ,"ಬಲ\n\nಬಲ ಪಂಡ ಅಲೆ, ಪೇರ್‌ನ್ ಬುಕ್ಕ ತೆನಸ್‌ನ್ ಎತ್ತರ..."
2355,ಬಾರ್‌ದ ಮುಡಿ,ಬಾರ್‌ದ ಮುಡಿ\n\nಬೈತ ಸೂಡಿಲೆನ್ ಒಟ್ಟು ಮಲ್ತ್‌‍ದ್ ಬಾ...
2356,ಕಿಷ್ಕಿಂದ,ಕಿಷ್ಕಿಂದ\n\nತ್ರೇತಾಯುಗಟ್ ಶ್ರೀರಾಮೆ ವನವಾಸ ಮಲ್ತೊಂದ...
2357,ಮಾತುಂಗ ಬೆಟ್ಟೊ,ಮಾತುಂಗ ಬೆಟ್ಟೊ\n\nಮಾತುಂಗ ಬೆಟ್ಟೊ ಹಂಪಿದ ಕೈತಲ್ ಉಪ್...


In [None]:
# OPTIONAL - for one hot encoding

import pandas as pd

# Extracted article titles separate words with spaces, whereas page_title uses
# the underscore format. Joining on the raw title therefore never matches a
# multi-word article, so the join key is normalised to underscores first.
article_title_keys = article_df['title'].str.replace(' ', '_', regex=False)

combined_df = pd.merge(
    article_df.assign(_page_title_key=article_title_keys),
    title_df,
    left_on='_page_title_key',
    right_on='page_title',
    how='left',
).drop(columns='_page_title_key')  # helper key is not part of the result

# One row per (title, text) and one column per category; each cell counts the
# links between that article and that category, 0 when there is none.
# NOTE(review): articles without any matched category have a missing cl_to and
# appear to be absent from the pivot -- confirm that this is intended.
pivot_df = combined_df.pivot_table(index=['title', 'text'], columns='cl_to', aggfunc='size', fill_value=0)

pivot_df = pivot_df.reset_index()
pivot_df

cl_to,title,text,Articles_with_dead_external_links_from_ಆಗಸ್ಟ್_2021,CS1_errors:_URL–wikilink_conflict,CS1_ಅಮೆರಿಕನ್_ಇಂಗ್ಲಿಷ್-language_sources_(en-us),CS1_ಇಂಗ್ಲಿಷ್-language_sources_(en),CS1_ಇಂಡೋನೇಶಿಯನ್-language_sources_(id),CS1_ಕನ್ನಡ-language_sources_(kn),CS1_ಗುಜರಾತಿ-language_sources_(gu),CS1_ಹಿಂದಿ-language_sources_(hi),...,ಹಳೆಗನ್ನಡ_ಕವಿ,ಹಳೆಗನ್ನಡ_ಕವಿಕುಲು,ಹಳ್ಳಿಮರ್ದ್,ಹವ್ಯಾಸಿ_ರಂಗಭೂಮಿ_ಕಲಾವಿದೆ,ಹಿಂದು_ದೇವತೆಲು,ಹಿಂದುಲೆನ_ಆಚರಣೆ,ಹಿಂದೂ_ಧರ್ಮ,ಹಿಂದೂ_ಧರ್ಮಗ್ರಂಥೊಲು,ಹಿಂದೂ_ಪಂಚಾಂಗ,ಹಿಂದೂ_ಸಿದ್ಧಾಂತ
0,ಅ,ಅ\n\nತುಳು ಅಕ್ಷರಮಾಲೆ:ಅ ತುಳುತ ಸುರುತ ಅಕ್ಷರ ಆತ್ಂಡ್...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ಅಂ,ಅಂ\n\nಅಂ ತುಳು ಅಕ್ಷರಮಾಲೆದ ಅನುಸ್ವಾರ ಅಕ್ಷರೊ. ಭಾಷಾ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ಅಂಕೋಲ,ಅಂಕೋಲ\n\nಅಂಕೋಲಾ: ಉಂದು ಕರ್ನಾಟಕೊ ರಾಜ್ಯೊದ ಕರಾವಳಿದ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ಅಂಗಾರಕ,ಅಂಗಾರಕ\n\nಮಂಗಳ (ಗುರ್ತ: ) - ಸೂರ್ಯನ ಸೌರಮಂಡಲದ ನಾಲ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ಅಂಗಿಲೊ,ಅಂಗಿಲೊ\n\nಉಂದು ಉದ್ದೊನ್ ಅಲಪುನವು. \nಅಲತೆ.\nರಡ್ಡ್...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,ಹುಲಿಪುರ,ಹುಲಿಪುರ\n\nಹುಲಿಪುರ ಬಾಂಗ್ಲಾದೇಶೊದ ಕುರಿಗ್ರಾಮ ಜಿಲ್...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1006,ಹೊಯಿಲು,ಹೊಯಿಲು\n\n'ಹೊಯಿಲು' (ಒಯಿಲ್) ಗ್ರಾಮೀಣ ಜನೊಕುಲೆಡ ಬಳ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1007,ಹೊರನಾಡು,ಹೊರನಾಡು\n\nಉಡುಪಿಡುದು ೧೩೫ ಕಿ. ಮೀ. ದೂರ ಉಪ್ಪುನ ಒಂ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1008,ಹೊಸಗುಂದ,ಹೊಸಗುಂದ\n\nಹೊಸಗುಂದ ಪನ್ಪುನ ಜಾಗೆ ಪಿರಾಕ್‌ದ ಕಾಲೊಡು...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
