In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import copy, string, os

In [2]:
class dense_graph:
    def __init__(self, nodes):
        self.nodes = sorted(nodes)
        self.nodes.append("?") # A
        self.node_set = set(nodes)
        self.edges = pd.DataFrame(index=self.nodes, columns=self.nodes)
        self.edges.fillna(1, inplace=True) # B

    def _repr_html_(self):
        with pd.option_context('display.max_columns', None): # C
            return self.edges._repr_html_()

    def inc(self, src, dst):
        if not src in self.node_set:
            src = "?"
        if not dst in self.node_set:
            dst = "?"
        self.edges.at[src, dst] += 1
        
    def get_edge(self, src, dst):
        if not src in self.node_set:
            src = "?"
        if not dst in self.node_set:
            dst = "?"
        return self.edges.at[src, dst] # D
        
    def to_prob(self):
        # E
        g = copy.deepcopy(self)
        row_sums = g.edges.sum(axis=1)
        g.edges = g.edges.div(row_sums, axis=0)
        return g

g = dense_graph("ABC") # F

In [3]:
g.inc("A", "B")
g.inc("A", "B")
g.inc("A", "B")
g.inc("A", "C")
g.inc("B", "D")
g.inc("E", "A")
g

Unnamed: 0,A,B,C,?
A,1,4,2,1
B,1,1,1,2
C,1,1,1,1
?,2,1,1,1


In [4]:
probs = g.to_prob()
probs

Unnamed: 0,A,B,C,?
A,0.125,0.5,0.25,0.125
B,0.2,0.2,0.2,0.4
C,0.25,0.25,0.25,0.25
?,0.4,0.2,0.2,0.2


In [5]:
probs.get_edge("A", "B")

0.5

In [9]:
#Language Examples
#We'll download and extract the text of 7 wikipedia articles:

#5 pages describing the Python programming language, in English, Spanish, German, French, and Italian. We'll later create a model based on each of these.
#2 pages (in English and Spanish) about giant pandas, the animal. We'll test our models to see if we can automatically detect what language these are in.
urls = {
    "english": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "spanish": "https://es.wikipedia.org/wiki/Python",
    "german": "https://de.wikipedia.org/wiki/Python_(Programmiersprache)",
    "french": "https://fr.wikipedia.org/wiki/Python_(langage)",
    "italian": "https://it.wikipedia.org/wiki/Python",
    "english-test": "https://en.wikipedia.org/wiki/Giant_panda",
    "spanish-test": "https://es.wikipedia.org/wiki/Ailuropoda_melanoleuca",
}

texts = {}

for lang, url in urls.items():
    path = lang + ".txt"
    
    # have we downloaded it before?
    if not os.path.exists(path):
        r = requests.get(url)
        r.raise_for_status()
        page = BeautifulSoup(r.text)
        with open(path, "w") as f:
            f.write(page.get_text())
    
    # for simplicity, strip out everything except lower
    # case English letters, periods, and commas
    with open(path) as f:
        valid = string.ascii_lowercase + " .,"
        text = []
        for c in f.read().lower():
            if c in valid:
                text.append(c)
            else:
                text.append("?")
        texts[lang] = "".join(text)

In [10]:
list(texts.keys())

['english',
 'spanish',
 'german',
 'french',
 'italian',
 'english-test',
 'spanish-test']

In [11]:
print(texts["english"][:1000])

????python ?programming language? ? wikipedia?document.documentelement.classname??client?js??rlconf???wgbreakframes????,?wgseparatortransformtable?????,???,?wgdigittransformtable?????,???,?wgdefaultdateformat???dmy?,?wgmonthnames?????,?january?,?february?,?march?,?april?,?may?,?june?,?july?,?august?,?september?,?october?,?november?,?december??,?wgmonthnamesshort?????,?jan?,?feb?,?mar?,?apr?,?may?,?jun?,?jul?,?aug?,?sep?,?oct?,?nov?,?dec??,?wgrequestid???xk??xgpaaeaaacb??n?aaadj?,?wgcspnonce????,?wgcanonicalnamespace????,?wgcanonicalspecialpagename????,?wgnamespacenumber???,?wgpagename???python??programming?language??,?wgtitle???python ?programming language??,?wgcurrevisionid???????????,?wgrevisionid???????????,?wgarticleid???????,?wgisarticle????,?wgisredirect????,?wgaction???view?,?wgusername??null,?wgusergroups???????,?wgcategories????articles with short description?,?use dmy dates from august ?????,?all articles with unsourced statements?,?articles with unsourced statements from dec

In [12]:
valid = string.ascii_lowercase + '.,?'
valid

'abcdefghijklmnopqrstuvwxyz.,?'

In [13]:
print(texts["spanish"][:5000])

????python ? wikipedia, la enciclopedia libre?document.documentelement.classname??client?js??rlconf???wgbreakframes????,?wgseparatortransformtable????,?t.?,???t,??,?wgdigittransformtable?????,???,?wgdefaultdateformat???dmy?,?wgmonthnames?????,?enero?,?febrero?,?marzo?,?abril?,?mayo?,?junio?,?julio?,?agosto?,?septiembre?,?octubre?,?noviembre?,?diciembre??,?wgmonthnamesshort?????,?ene?,?feb?,?mar?,?abr?,?may?,?jun?,?jul?,?ago?,?sep?,?oct?,?nov?,?dic??,?wgrequestid???xkv?sqpaaeyaaehg?zsaaabi?,?wgcspnonce????,?wgcanonicalnamespace????,?wgcanonicalspecialpagename????,?wgnamespacenumber???,?wgpagename???python?,?wgtitle???python?,?wgcurrevisionid???????????,?wgrevisionid???????????,?wgarticleid??????,?wgisarticle????,?wgisredirect????,?wgaction???view?,?wgusername??null,?wgusergroups???????,?wgcategories????wikipedia?art?culos con datos por trasladar a wikidata?,?wikipedia?art?culos destacados en la wikipedia en ruso?,?wikipedia?art?culos buenos en la wikipedia en alem?n?,??wikipedia?art?cul

In [14]:
class LangProfile:
    def __init__(self, name, text):
        self.name = name

        g = dense_graph(valid)
        for i in range(len(text)-1):
            g.inc(text[i], text[i+1])
        self.graph = g.to_prob()

    def prob(self, text):
        p = 1
        for i in range(len(text)-1):
            p *= self.graph.get_edge(text[i], text[i+1])
        return p

In [15]:
english = LangProfile("english", texts["english"])
spanish = LangProfile("spanish", texts["spanish"])
spanish.graph.edges.iloc[:8,:8]

AttributeError: 'BlockManager' object has no attribute 'T'

In [None]:
class dense_graph1:
    def __init__(self, nodes):
        self.nodes = sorted(nodes)
        self.edges = pd.DataFrame(index = self. nodes, columns = self.nodes)
        self.edges = self.edges.fillna(0)
    def get_edge(self, nodeA, nodeB):
        assert nodeA in self.nodes
        assert nodeB in self.nodes
        return self.edges.at[nodeA, nodeB]
    
    def inc(self, nodeA, nodeB):
        assert nodeA in self.nodes
        assert nodeB in self.nodes
        self.edges.at[nodeA, nodeB] += 1
        
    def _repr_html_(self):
        with pd.option_context('display.max_columns', None): # C
            return self.edges._repr_html_()
        
    def to_prob(self):
        sums = self.edges.sum(axis =1)
        self.edges = self.edges.div(sums, axis = 1)
        
dgraph = dense_graph1(valid)
dgraph.inc("a", "b")
dgraph.inc("a", "c")

dgraph.to_prob()
#dgraph

In [None]:
class LangProfile1:
    def __init__(self):
        self.makov = None
        
    def fit(self, text):
        self.markov = dense_graph1(valid)
        for i in range(len(text)-1):
            first =text[i]
            second = text[i+1]
            self.markov.inc(first, second)
        self.markov.to_prob()
            
    def prob(self, text):
        p =1
        for i in range(len(text)-1):
            first =text[i]
            second = text[i+1]
            print(first, second, self.markov.get_edge(first, second))
        p *= self.markov.get_edge(first, second)
        return p

p  = LangProfile1()
p.fit("this is some example text")
p.prob("hellow")

In [None]:
english = LangProfile(english, texts["english"])
spanish = LangProfile(spanish, texts["spanish"])
english.prob("house"), spanish.prob("house")

In [None]:
class lang_predictor:
    def __init__(self, profiles):
        self.profiles = profiles
    def predict(self, text):
        best_profile = max(self.profiles, key = lambda prof: prof.prob(text))
        return best_profile.name
predictior = lang_predictor(profiles)
predictor.predict("hola amigos")