In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import copy, string, os

In [3]:
class dense_graph:
    """Directed graph whose edge weights live in a dense pandas matrix.

    ``self.edges`` is a square DataFrame: rows are source nodes, columns are
    destination nodes.  Every weight starts at 1 (not 0), so after row
    normalization by ``to_prob`` no edge has probability 0.  One extra
    catch-all node, ``"?"``, absorbs every node that was not passed to the
    constructor.
    """

    # Label of the catch-all node used for anything outside the known nodes.
    UNKNOWN = "?"

    def __init__(self, nodes):
        """Build the graph.

        Args:
            nodes: iterable of hashable, mutually sortable node labels (a
                string works: each character becomes a node).  Duplicates
                are ignored, and a supplied ``"?"`` is merged into the
                catch-all node instead of creating a duplicate label.
        """
        # Materialize once so one-shot iterables work, and drop duplicates:
        # duplicate index labels would make the scalar ``.at`` accessor fail.
        known = set(nodes)
        known.discard(self.UNKNOWN)
        self.nodes = sorted(known)
        self.nodes.append(self.UNKNOWN) # A: catch-all node always comes last
        self.node_set = known
        # B: start every weight at 1.  Passing the scalar to the constructor
        # yields an integer frame directly, instead of filling an all-NaN
        # object frame and relying on fillna's implicit downcast.
        self.edges = pd.DataFrame(1, index=self.nodes, columns=self.nodes)

    def _resolve(self, node):
        """Return ``node`` if it is a known node, otherwise the catch-all label."""
        return node if node in self.node_set else self.UNKNOWN

    def _repr_html_(self):
        """Render the full weight matrix as HTML for notebook display."""
        with pd.option_context('display.max_columns', None): # C: never elide columns
            return self.edges._repr_html_()

    def inc(self, src, dst):
        """Add 1 to the weight of edge ``src -> dst`` (unknown nodes map to "?")."""
        self.edges.at[self._resolve(src), self._resolve(dst)] += 1

    def get_edge(self, src, dst):
        """Return the weight of edge ``src -> dst`` (unknown nodes map to "?")."""
        return self.edges.at[self._resolve(src), self._resolve(dst)] # D

    def to_prob(self):
        """Return a new graph whose rows are normalized to sum to 1.

        The receiver is left untouched; the returned graph holds
        ``weight / row_sum`` for every edge.
        """
        # E: work on a deep copy so the raw counts in ``self`` stay intact.
        normalized = copy.deepcopy(self)
        row_totals = normalized.edges.sum(axis=1)
        normalized.edges = normalized.edges.div(row_totals, axis=0)
        return normalized

# Small demo graph over the nodes "A", "B", "C"; the constructor also adds the "?" node.
g = dense_graph("ABC") # F

In [4]:
# Record six transitions.  "D" and "E" are not nodes of g, so those
# endpoints are tallied under the catch-all "?" node.
observed = ["AB", "AB", "AB", "AC", "BD", "EA"]
for src, dst in observed:
    g.inc(src, dst)
g

Unnamed: 0,A,B,C,?
A,1,4,2,1
B,1,1,1,2
C,1,1,1,1
?,2,1,1,1


In [5]:
# Row-normalize a copy of the counts (g itself is left unchanged) and display it.
probs = g.to_prob()
probs

Unnamed: 0,A,B,C,?
A,0.125,0.5,0.25,0.125
B,0.2,0.2,0.2,0.4
C,0.25,0.25,0.25,0.25
?,0.4,0.2,0.2,0.2


In [6]:
# Look up a single normalized weight: the A -> B entry of probs.
probs.get_edge("A", "B")

0.5

In [7]:
urls = {
    "english": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "spanish": "https://es.wikipedia.org/wiki/Python",
    "german": "https://de.wikipedia.org/wiki/Python_(Programmiersprache)",
    "french": "https://fr.wikipedia.org/wiki/Python_(langage)",
    "italian": "https://it.wikipedia.org/wiki/Python",
    "english-test": "https://en.wikipedia.org/wiki/Giant_panda",
    "spanish-test": "https://es.wikipedia.org/wiki/Ailuropoda_melanoleuca",
}

# For simplicity the text is lower-cased and only lower-case English letters,
# spaces, periods, and commas are kept; every other character is replaced
# (not removed) by "?".  Defined once, outside the loop, because later cells
# read this name as well.
valid = string.ascii_lowercase + " .,"

texts = {}

for lang, url in urls.items():
    path = lang + ".txt"

    # Download each page only once; later runs reuse the cached text file.
    if not os.path.exists(path):
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Name the parser explicitly so the extracted text does not depend on
        # which optional parsers happen to be installed.
        page = BeautifulSoup(r.text, "html.parser")
        # Explicit UTF-8: the pages contain non-ASCII characters that some
        # platform default encodings cannot represent.
        with open(path, "w", encoding="utf-8") as f:
            f.write(page.get_text())

    # errors="replace" tolerates cache files written earlier with another
    # encoding; undecodable bytes turn into characters outside ``valid``
    # and are therefore mapped to "?" below anyway.
    with open(path, encoding="utf-8", errors="replace") as f:
        raw = f.read().lower()
    texts[lang] = "".join(c if c in valid else "?" for c in raw)

In [8]:
# Preview the first 5000 characters of the normalized English text.
print(texts["english"][:5000])

????python ?programming language? ? wikipedia?document.documentelement.classname??client?js??rlconf???wgbreakframes????,?wgseparatortransformtable?????,???,?wgdigittransformtable?????,???,?wgdefaultdateformat???dmy?,?wgmonthnames?????,?january?,?february?,?march?,?april?,?may?,?june?,?july?,?august?,?september?,?october?,?november?,?december??,?wgmonthnamesshort?????,?jan?,?feb?,?mar?,?apr?,?may?,?jun?,?jul?,?aug?,?sep?,?oct?,?nov?,?dec??,?wgrequestid???xk??xgpaaeaaacb??n?aaadj?,?wgcspnonce????,?wgcanonicalnamespace????,?wgcanonicalspecialpagename????,?wgnamespacenumber???,?wgpagename???python??programming?language??,?wgtitle???python ?programming language??,?wgcurrevisionid???????????,?wgrevisionid???????????,?wgarticleid???????,?wgisarticle????,?wgisredirect????,?wgaction???view?,?wgusername??null,?wgusergroups???????,?wgcategories????articles with short description?,?use dmy dates from august ?????,?all articles with unsourced statements?,?articles with unsourced statements from dec

In [9]:
# Preview the first 5000 characters of the normalized Spanish text.
print(texts["spanish"][:5000])

????python ? wikipedia, la enciclopedia libre?document.documentelement.classname??client?js??rlconf???wgbreakframes????,?wgseparatortransformtable????,?t.?,???t,??,?wgdigittransformtable?????,???,?wgdefaultdateformat???dmy?,?wgmonthnames?????,?enero?,?febrero?,?marzo?,?abril?,?mayo?,?junio?,?julio?,?agosto?,?septiembre?,?octubre?,?noviembre?,?diciembre??,?wgmonthnamesshort?????,?ene?,?feb?,?mar?,?abr?,?may?,?jun?,?jul?,?ago?,?sep?,?oct?,?nov?,?dic??,?wgrequestid???xkv?sqpaaeyaaehg?zsaaabi?,?wgcspnonce????,?wgcanonicalnamespace????,?wgcanonicalspecialpagename????,?wgnamespacenumber???,?wgpagename???python?,?wgtitle???python?,?wgcurrevisionid???????????,?wgrevisionid???????????,?wgarticleid??????,?wgisarticle????,?wgisredirect????,?wgaction???view?,?wgusername??null,?wgusergroups???????,?wgcategories????wikipedia?art?culos con datos por trasladar a wikidata?,?wikipedia?art?culos destacados en la wikipedia en ruso?,?wikipedia?art?culos buenos en la wikipedia en alem?n?,??wikipedia?art?cul

In [10]:
class LangProfile:
    """Character-bigram profile of a language, estimated from a sample text.

    ``self.graph`` holds, for every pair of characters, the row-normalized
    weight of "first character is followed by second character" as counted
    in the sample (on top of the graph's initial weight of 1 per pair).
    """

    def __init__(self, name, text, alphabet=string.ascii_lowercase + " .,"):
        """Count adjacent character pairs of ``text`` and normalize them.

        Args:
            name: label for this profile (stored as ``self.name``).
            text: sample text to learn from.
            alphabet: characters that get their own node; any other
                character is counted under the graph's catch-all node.  The
                default is spelled out here so the class does not depend on
                a global variable left behind by another cell.
        """
        self.name = name

        g = dense_graph(alphabet)
        # Each pair of neighbouring characters is one observed transition.
        for src, dst in zip(text, text[1:]):
            g.inc(src, dst)
        self.graph = g.to_prob()

    def prob(self, text):
        """Return the product of the transition weights along ``text``.

        Texts shorter than two characters have no transitions and give 1.
        Because every factor is below 1, the product underflows to 0.0 for
        texts of a few hundred characters; use ``log_prob`` for long inputs.
        """
        p = 1
        for src, dst in zip(text, text[1:]):
            p *= self.graph.get_edge(src, dst)
        return p

    def log_prob(self, text):
        """Return the natural log of ``prob(text)`` without underflowing.

        Sums the logs of the individual transition weights instead of
        multiplying the weights, so long texts remain comparable.  Texts
        shorter than two characters give 0.0.
        """
        import math  # local import: keeps this class self-contained

        total = 0.0
        for src, dst in zip(text, text[1:]):
            total += math.log(self.graph.get_edge(src, dst))
        return total

In [11]:
# Fit one profile per language from its downloaded text, then show the
# top-left 8x8 corner of the Spanish transition matrix.
english, spanish = (
    LangProfile(language, texts[language]) for language in ("english", "spanish")
)
spanish.graph.edges.iloc[:8, :8]

Unnamed: 0,Unnamed: 1,",",.,a,b,c,d,e
,0.060115,0.000303,0.003937,0.048455,0.016051,0.067686,0.103725,0.095548
",",0.634473,0.001486,0.001486,0.004458,0.004458,0.001486,0.001486,0.001486
.,0.2301,0.002488,0.130597,0.007463,0.002488,0.029851,0.00995,0.007463
a,0.213549,0.014289,0.010849,0.002382,0.030431,0.06245,0.081503,0.002117
b,0.017575,0.00703,0.010545,0.096661,0.001757,0.035149,0.024605,0.050967
c,0.007567,0.00291,0.004075,0.134459,0.000582,0.041327,0.003492,0.07858
d,0.02314,0.002755,0.001653,0.120661,0.002755,0.001102,0.001653,0.423691
e,0.226351,0.008559,0.009685,0.022523,0.006532,0.03536,0.046396,0.004505


In [12]:
# For each profile, print the weight of "o" followed by a space and of
# "a" followed by a space.
for label, profile in (
    ("English O/A ending:", english),
    ("Spanish O/A ending:", spanish),
):
    print(label, profile.prob("o "), profile.prob("a "))

English O/A ending: 0.058126536999776435 0.05953878406708595
Spanish O/A ending: 0.19805081796032023 0.21354855781952897


In [13]:
# Compare how each profile scores the string "house".
english.prob("house"), spanish.prob("house")

(0.000194984372424379, 3.959762532559481e-05)

In [14]:
# Compare how each profile scores the string "casa".
english.prob("casa"), spanish.prob("casa")

(9.689361588272094e-05, 0.0003442342641258013)

In [15]:
# Score the same sentence under both profiles (English first, then Spanish).
sentence = "this is an example of a sentence in english, can we detect that?"
for profile in (english, spanish):
    print(profile.prob(sentence))

7.970516733754027e-67
7.35821600875311e-72


In [16]:
# Build a long input by repeating one short sentence 20 times, show it,
# then score it under both profiles (English first, then Spanish).
long_str = "".join("this is a sentence. " for _ in range(20))
print(long_str)
for profile in (english, spanish):
    print(profile.prob(long_str))

this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. this is a sentence. 
0.0
0.0
