Get Wikipedia's n-grams based on the dump data

# Download dump

In [None]:
import os
import subprocess

# Working directory that receives the downloaded dump.
The_path=r"C:/Users/Reza/Desktop/core"
Dump_URL =u'https://dumps.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2'
# Download the latest dump.
# wget treats every positional argument as a URL, so the destination file has to be
# passed with -O. The argument-list form avoids shell quoting problems, and
# check=True raises if the download fails instead of continuing silently.
subprocess.run(
    ['wget', '-O', The_path+"/fawiki-latest-pages-articles.xml.bz2", Dump_URL],
    check=True,
)

# Feature extraction 

This code extracts features from articles:
* Article text file
* Category of the article
* Internal links inside the articles
* Files and images in the articles


In [1]:
import pickle
import sys , re
# Make The_path importable first; presumably the pywikibot package lives there
sys.path.append(The_path)
from pywikibot import xmlreader

# Open the compressed dump stored under The_path
dump_file = The_path + r"/fawiki-latest-pages-articles.xml.bz2"
dump = xmlreader.XmlDump(dump_file)

# Letters kept by flat_text: the Persian alphabet plus a few Arabic letter forms,
# followed by ZWNJ. The zero-width non-joiner is spelled as an explicit escape
# because an invisible character inside a literal is easily lost when the source
# is copied or reformatted.
farsi_char='ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوآؤئيإأةكژ'+'\u200c'
# Regex alternation of the media file extensions recognised in file references
suported_extenstions=r'(?:[gt]iff?|png|jpe?g|web[mp]|xcf|pdf|mid|og[avg]|svg|djvu|flac|opus|wav|mp3)'

def flat_text (text):
    """Strip wiki markup from an article and keep only Persian text.

    Templates, category/file links, HTML-style tags, section headings and
    leftover template parameters are removed, piped links are reduced to their
    label, and finally every run of characters that is neither in
    ``farsi_char`` nor a full stop is replaced by a single space.

    Args:
        text: raw wikitext of one article.

    Returns:
        The cleaned text, with words separated by single spaces and sentences
        separated by '. '.
    """
    # All patterns are raw strings: sequences such as '\[' or '\.' in a normal
    # string literal are invalid escapes and trigger a warning on current Python.
    text=text.replace('\r','')
    text=re.sub(r'\{\{[^\}]+\}\}',' ',text)# remove (non-nested) wiki templates
    text=re.sub(r'\[\[(?:رده|پرونده|تصویر)\:.*?\]\]','',text) # remove categories and images
    text=re.sub(r'\<[^\>]+\>',' ',text) # remove anything between angle brackets (tags, comments)
    text=re.sub(r'\n={1,}[^\n]+\={1,}',' ',text) # remove section headings
    text=re.sub(r'\|[^\=\]\n]+\=',' ',text) # remove remaining template parameter names
    text=re.sub(r'\[\[[^\|\]]+\|','',text) # drop the target of piped links, e.g. This is a [[foo|bar]] > This is a bar]]
    text=re.sub(r'\.'+suported_extenstions,'',text)# remove file extensions so their dot is not taken as a sentence end
    text=text.replace('&ndash;',' ')# replace the dash entity with a space so the words around it stay separate
    text=text.replace('[[',' ').replace(']]','')# ']]' becomes empty so a suffix stays attached, e.g. [[book]]s > books
    
    text=re.sub(r'[^'+farsi_char+r'\.]+',' ',text)# replace non-Persian characters, except '.', which marks sentence ends
    text=re.sub(r'\.{1,}','.',text)# collapse runs of dots
    text=re.sub(r'(\. ){1,}','. ',text)# collapse repeated '. '
    
    text=re.sub(r'\s+',' ',text)# replace newlines and repeated spaces with one space
    text=text.replace(' . ','. ')
    return text

def get_images(text):
    """Return the file references found in an article's wikitext.

    Each result is the whole matched reference, from the namespace prefix up to
    and including a supported extension (e.g. 'پرونده:عکس.jpg').

    Args:
        text: raw wikitext of one article.

    Returns:
        A list of matched references; empty when there are none.
    """
    # 'تصویر' is accepted as well, matching the prefixes that flat_text and
    # get_links already treat as image links.
    # NOTE(review): extensions are matched case-sensitively, so '.JPG' is not recognised.
    # re.findall always returns a list, so no empty-result branch is needed.
    return re.findall(r'(?:[Ff]ile|[Ii]mage|پرونده|تصویر)\:.*?\.'+suported_extenstions,text)

def get_categories(text):
    """Collect the category names linked from an article's wikitext.

    Only links of the exact form [[Category:name]] / [[رده:name]] are matched;
    the name may not contain Latin letters, ASCII digits, a pipe or other
    markup characters.

    Args:
        text: raw wikitext of one article.

    Returns:
        A list with the captured category names; empty when there are none.
    """
    category_pattern=r'\[\[(?:[Cc]ategory|رده)\:([^\}\{\|\]\[\n\<\>\#\\\:a-zA-Z0-9]+)\]\]'
    return list(re.findall(category_pattern,text))

def get_links(text):
    """Return the targets of the internal links in an article's wikitext.

    Category and file/image links are removed first. Both plain links
    ([[target]]) and piped links ([[target|label]]) are recognised; only the
    target is returned. Targets containing Latin letters, ASCII digits, ':' or
    markup characters are not matched.

    Args:
        text: raw wikitext of one article.

    Returns:
        A list of link targets in order of appearance; empty when there are none.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    text=re.sub(r'\[\[(?:رده|پرونده|تصویر)\:.*?\]\]','',text)

    # The '|label' part is optional; with a mandatory pipe, plain [[target]]
    # links were never captured.
    return re.findall(r'\[\[([^\}\{\|\]\[\n\<\>\#\\\:a-zA-Z0-9]+)(?:\|.*?)?\]\]',text)

# Per-article feature tables, all keyed by the page id taken from the dump
category_dict={}
links_dict={}
image_dict={}
title_dict={}
text_dict={}

counter=0
for entry in dump.parse():
    # Only namespace-'0' pages that are not redirects are processed
    if entry.ns !='0' or entry.isredirect:
        continue
    counter+=1
    page_id=entry.id
    wikitext=entry.text
    text_dict[page_id]=flat_text(wikitext)
    title_dict[page_id]=entry.title
    image_dict[page_id]=get_images(wikitext)
    links_dict[page_id]=get_links(wikitext)
    category_dict[page_id]=get_categories(wikitext)
    # Progress report every 50000 processed articles
    if counter % 50000==0:
        print(counter)


50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000


## Save Data

In [2]:
# Write each feature table to its own pickle file in the working directory
tables_to_save = (
    ('text_dict.pickle', text_dict),
    ('title_dict.pickle', title_dict),
    ('image_dict.pickle', image_dict),
    ('links_dict.pickle', links_dict),
    ('category_dict.pickle', category_dict),
)
for pickle_name, table in tables_to_save:
    with open(pickle_name, 'wb') as handle:
        pickle.dump(table, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Load data

In [1]:
import pickle
# Load the cleaned article texts.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this notebook.
# The with-block closes the file handle once the data has been read.
with open("text_dict.pickle","rb") as pickle_in:
    text_dict = pickle.load(pickle_in)
# Copy the texts into a list: a dict view would keep the whole dictionary
# alive, so the `del` at the end of this cell would release nothing.
text_list=list(text_dict.values())

# Loading of the remaining tables is disabled; the statements are kept for reference:
#
# total_text=' '.join(text_list)
#
# pickle_in = open("title_dict.pickle","rb")
# title_dict = pickle.load(pickle_in)
#
# pickle_in = open("image_dict.pickle","rb")
# image_dict = pickle.load(pickle_in)
#
# pickle_in = open("links_dict.pickle","rb")
# links_dict = pickle.load(pickle_in)
#
# pickle_in = open("category_dict.pickle","rb")
# category_dict = pickle.load(pickle_in)

del text_dict

# Unigrams (word frequencies)

Get the most frequent words in the articles.

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import operator

In [3]:
# Join all article texts into one string
my_text=' '.join(text_list)
# Remove the full stops that separate sentences
my_text=my_text.replace('.','')
# Split on whitespace. split() without an argument skips the empty strings that
# split(' ') produces for consecutive, leading or trailing spaces, so no empty
# "word" is counted.
words=my_text.split()
# Count how often each distinct word occurs
df=pd.DataFrame(pd.Series(words),columns=['C'])
df=pd.DataFrame(df.C.value_counts())
df.to_csv('reza.csv', sep=',', encoding='utf-8')
df.head()

Unnamed: 0,C
در,3756517
و,3577758
به,2307853
از,2167464
که,1531830


## N-grams

In [4]:
# Collects one ranked n-gram list per processed slice of articles
total_List = []
# Turn the texts into an indexable list so they can be sliced
text_list = [*text_list]

def get_ngram(my_list, ngram_range=(2,4), top_n=20000):
    """Count word n-grams over a list of documents.

    Args:
        my_list: list of text documents.
        ngram_range: (min_n, max_n) sizes of the n-grams to count.
        top_n: maximum number of n-grams to return.

    Returns:
        A list of (n-gram, count) tuples sorted by descending count and
        truncated to ``top_n`` entries; an empty list when the documents yield
        no vocabulary.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    try:
        # fit_transform learns the vocabulary and counts it in one pass
        vector_count = vectorizer.fit_transform(my_list)
    except ValueError as error:
        # An empty vocabulary (no documents, or no token the vectorizer keeps)
        # means there is nothing to count; any other ValueError is a real problem.
        if 'empty vocabulary' in str(error):
            return []
        raise
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()
    # Column sums give the total count of every n-gram over all documents
    totals = np.asarray(vector_count.sum(axis=0))[0]
    my_dict=dict(zip(features, totals))
    sorted_x = sorted(my_dict.items(), key=operator.itemgetter(1),reverse=True)
    # Slicing already copes with fewer than top_n entries
    return sorted_x[:top_n]

# Walk over the articles in steps of 100. The upper bound is the real number of
# articles rather than a hard-coded count, so the loop stays correct when the
# dump changes size.
# NOTE(review): the slice i:i+1 takes a single article per step, so only about
# 1% of the articles is processed. If batches of 100 were intended, the slice
# should be i:i+100. Confirm.
for i in range(0,len(text_list),100):
    text_list2=text_list[i:i+1]
    my_dict=get_ngram (text_list2)
    if my_dict:
        total_List.append(my_dict)
    # Progress report
    if i%100000==0:
        print (i)

0
100000
200000
300000
400000
500000
600000


### Join N-grams

In [12]:
# Flatten the per-slice results into one list of (n-gram, count) pairs
new_list=[pair for chunk in total_List for pair in chunk]
# Convert the flat list to a data frame
df=pd.DataFrame(new_list,columns=['N-garm','N'])
# Total count of every n-gram, repeated on each of its rows
df['Frequency'] = df.groupby(['N-garm'])['N'].transform('sum')
# Keep one row per n-gram, order by total frequency and drop the partial count
new_df = (
    df.drop_duplicates(subset=['N-garm'])
      .sort_values(['Frequency'], ascending=[False])
      .drop(columns='N')
)
# Save the n-grams (written before the index is renumbered)
new_df.to_csv('n-grams.csv', sep=',', encoding='utf-8')
# Renumber the rows from 0 for display
new_df = new_df.reset_index().drop(columns='index')
new_df.head()

Unnamed: 0,N-garm,Frequency
0,است که,4121
1,که در,3982
2,شده است,3606
3,در سال,3331
4,می شود,2676
