# **ANÁLISIS ITERATIVO DE LAS CATEGORÍAS IDEOLÓGICAS EN WIKIPEDIA**
#### Ramón Zamora, carné 10233

## INTRODUCCIÓN
El presente proyecto utiliza herramientas de análisis de texto computacional para llevar a cabo un exploración de los datos de los artículos vinculados a las categorías de distintas ideologías que se pueden encontrar en la enciclopedia digial Wikipedia. Para llevar a cabo este ejercicio se utilizarán las librerías de ...

In [1]:
import wikipediaapi
import pickle
from pprint import pprint
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from dcss.text import bigram_process, preprocess, bow_to_df
from dcss.plotting import format_axes_commas, custom_seaborn
from dcss.utils import sparse_groupby
custom_seaborn()
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import scipy
from itertools import islice

In [2]:
wiki_wiki = wikipediaapi.Wikipedia('Ciencias Sociales Computacionales (lup22776@uvg.edu.gt)', 'en')

In [3]:
#1. Busqueda de Anarquismo
cat_anarchism = wiki_wiki.page("Category:Anarchism")
print("Anarchism exist: %s" % cat_anarchism.exists())
#2. Busqueda de Autoritarismo
cat_authoritarianism = wiki_wiki.page("Category:Authoritarianism")
print("Authoritarianism exist: %s" % cat_authoritarianism.exists())
# 3. Busqueda de Pensamiento comunitario
cat_communitarianism = wiki_wiki.page("Category:Communitarianism")
print ("Communitarianism exist: %s" % cat_communitarianism.exists())
# 4. Busqueda de Comunismo
cat_communism = wiki_wiki. page ("Category:Communism")
print ("Communism exist: %s" % cat_communism.exists())
# 5. Busqueda de Conservadurismo
cat_conservatism = wiki_wiki.page("Category:Conservatism")
print ("Conservatism exist: %s" % cat_conservatism.exists())
# 6. Busqueda de Corporativismo
cat_corporatism = wiki_wiki.page("Category:Corporatism")
print ("Corporatism exist: %s" % cat_corporatism.exists())
# 7. Busqueda de Ambiantalismo
cat_environmentalism = wiki_wiki.page("Category:Environmentalism")
print ("Environmentalism exist: %s" % cat_environmentalism.exists())
# 8. Busqueda de Facismo
cat_fascism = wiki_wiki.page("Category:Fascism")
print ("Fascism exist: %s" % cat_fascism.exists())
# 9. Fascism de Liberalismo
cat_liberalism = wiki_wiki.page("Category:Liberalism")
print ("Liberalism exist: %s" % cat_liberalism.exists())
# 10. Busqueda de Libterarismo
cat_libertarianism = wiki_wiki.page("Category:Libertarianism")
print ("Libertarianism exist: %s" % cat_libertarianism.exists())
# 11. Busqueda de Nacionalismo
cat_nationalism = wiki_wiki.page("Category:Nationalism")
print ("Nationalism exist: %s" % cat_nationalism.exists())
# 12. Busqueda de Populismo
cat_populism = wiki_wiki.page("Category:Populism")
print ("Populism exist: %s" % cat_populism.exists())

Anarchism exist: True
Authoritarianism exist: True
Communitarianism exist: True
Communism exist: True
Conservatism exist: True
Corporatism exist: True
Environmentalism exist: True
Fascism exist: True
Liberalism exist: True
Libertarianism exist: True
Nationalism exist: True
Populism exist: True


In [4]:
data = []
pages_names = cat_anarchism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_anarchism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_authoritarianism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_authoritarianism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_communitarianism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_communitarianism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_communism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_communism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_conservatism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_conservatism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_corporatism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_corporatism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_environmentalism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_environmentalism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_fascism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_fascism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_liberalism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_liberalism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_libertarianism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_libertarianism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_nationalism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_nationalism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

pages_names = cat_populism.categorymembers
for key, value in islice (pages_names.items(), 1, None):
    page_py = wiki_wiki.page(key)
    data.append(
        {
            'Category': cat_populism,
            'Namespace': page_py.ns,
            'Title': page_py.title,
            'Summary': page_py.summary,
            'Content': page_py.sections
        }
    )

df = pd.DataFrame(data)

In [5]:
df.to_csv('ideologias.csv')

In [6]:
columns = ['Category', 'Namespace', 'Title', 'Summary', 'Content']
wiki_df = pd.read_csv("ideologias.csv")
wiki_df.dropna(subset = ['Title', 'Summary'], inplace=True)
wiki_df = wiki_df.query('Namespace < 1')
wiki_df['Category'].value_counts()

Category
Category:Authoritarianism (id: 36055803, ns: 14)    150
Category:Environmentalism (id: 737047, ns: 14)      124
Category:Liberalism (id: 724440, ns: 14)            108
Category:Communism (id: 722710, ns: 14)              95
Category:Nationalism (id: 846456, ns: 14)            87
Category:Conservatism (id: 780171, ns: 14)           75
Category:Corporatism (id: 21722509, ns: 14)          73
Category:Fascism (id: 889991, ns: 14)                68
Category:Populism (id: 19997168, ns: 14)             62
Category:Anarchism (id: 780754, ns: 14)              44
Category:Libertarianism (id: 1543366, ns: 14)        39
Category:Communitarianism (id: 49526307, ns: 14)     16
Name: count, dtype: int64

In [7]:
content_keep = [
    'Category:Authoritarianism (id: 36055803, ns: 14)',
    'Category:Environmentalism (id: 737047, ns: 14)',
    'Category:Liberalism (id: 724440, ns: 14)',
    'Category:Communism (id: 722710, ns: 14)',
    'Category:Nationalism (id: 846456, ns: 14)',
    'Category:Conservatism (id: 780171, ns: 14)',
    'Category:Corporatism (id: 21722509, ns: 14)',
    'Category:Fascism (id: 889991, ns: 14)',
    'Category:Populism (id: 19997168, ns: 14)',
    'Category:Anarchism (id: 780754, ns: 14)',
    'Category:Libertarianism (id: 1543366, ns: 14)',
    'Category:Communitarianism (id: 49526307, ns: 14)'
]
content_subset = wiki_df[wiki_df['Category'].isin(content_keep)].copy()
content_subset.reset_index(drop=True, inplace=True)
total_content_counts = content_subset['Category'].value_counts()
total_content_counts

Category
Category:Authoritarianism (id: 36055803, ns: 14)    150
Category:Environmentalism (id: 737047, ns: 14)      124
Category:Liberalism (id: 724440, ns: 14)            108
Category:Communism (id: 722710, ns: 14)              95
Category:Nationalism (id: 846456, ns: 14)            87
Category:Conservatism (id: 780171, ns: 14)           75
Category:Corporatism (id: 21722509, ns: 14)          73
Category:Fascism (id: 889991, ns: 14)                68
Category:Populism (id: 19997168, ns: 14)             62
Category:Anarchism (id: 780754, ns: 14)              44
Category:Libertarianism (id: 1543366, ns: 14)        39
Category:Communitarianism (id: 49526307, ns: 14)     16
Name: count, dtype: int64

In [8]:
sampled_contents = content_subset.groupby('Category').sample(replace=False, frac=.5, random_state=23)
len(sampled_contents)

472

In [9]:
with open('sampled_wiki_ideologias_content.pkl', 'wb') as fp:
    pickle.dump(sampled_contents, fp)
sampled_contents_counts = sampled_contents['Category'].value_counts()
sample_sizes = pd.DataFrame(zip(total_content_counts, sampled_contents_counts), columns=['Total', 'Sample'], index=content_keep)

In [10]:
sampled_contents['content_len'] = sampled_contents['Content'].apply(lambda x: len(x.split()))

In [1]:
categories = sampled_contents.groupby('Category')
def categoria_subplot(subgroup, title, position):
    sns.kdeplot(ax = position, data=subgroup, x='content_len', log_scale=True, fill=True, alpha=.5, linewidth=0, color='black')
    position.set(xlabel='Number of tokens (log scale)', title=title)
fig, ax = plt.subplots(3, 4, sharex=True, sharey=True, figsize(10, 6))

categoria_subplot(categories.get_group('Category:Authoritarianism (id: 36055803, ns: 14)'), 'Authoritarianism', ax[0,0])
categoria_subplot(categories.get_group('Category:Environmentalism (id: 737047, ns: 14)'), 'Environmentalism', ax[0,1])
categoria_subplot(categories.get_group('Category:Liberalism (id: 724440, ns: 14)'), 'Liberalism', ax[0,2])
categoria_subplot(categories.get_group('Category:Communism (id: 722710, ns: 14)'), 'Communism', ax[0,3])
categoria_subplot(categories.get_group('Category:Nationalism (id: 846456, ns: 14)'), 'Nationalism', ax[1,0])
categoria_subplot(categories.get_group('Category:Conservatism (id: 780171, ns: 14)'), 'Conservatism', ax[1,1])
categoria_subplot(categories.get_group('Category:Corporatism (id: 21722509, ns: 14)'), 'Corporatism', ax[1,2])
categoria_subplot(categories.get_group('Category:Fascism (id: 889991, ns: 14)'), 'Fascism', ax[1,3])
categoria_subplot(categories.get_group('Category:Populism (id: 19997168, ns: 14)'), 'Populism', ax[2,0])
categoria_subplot(categories.get_group('Category:Anarchism (id: 780754, ns: 14)'), 'Anarchism', ax[2,1])
categoria_subplot(categories.get_group('Category:Libertarianism (id: 1543366, ns: 14)'), 'Libertarianism', ax[2,2])
categoria_subplot(categories.get_group('Category:Communitarianism (id: 49526307, ns: 14)'), 'Communitarianism', ax[2,3])

plt.tight_layout()
plt.show()
fig.savefig("graficadensidadWiki")

SyntaxError: unmatched ')' (2551215278.py, line 18)