# Polyglot with multi file reader

In [1]:
import glob
import re
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfTransformer as TF
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.pipeline import Pipeline

In [2]:
def read_multiple_files(folder, file_ext):
    files = glob.glob('{}/*.{}'.format(folder, file_ext))
    data = []
    for file in files:
        with open(file, encoding='latin_1') as f:
            data.append(f.read())
    return data

### ====================     TRAIN     ============================

In [3]:
file_exts = {'Ruby': 'jruby*', 'Clojure': 'clojure*', 'Scala': 'scala*', 'Python': 'py*', 
             'OCaml': 'ocaml*', 'Scheme': 'racket*', 'Common Lisp': 'sbcl*', 'Perl': 'perl*', 'PHP': 'hack*', 
             'C#': 'csharp*', 'Java': 'java', 'JavaScript': 'javascript*', 'C': 'gcc*', 'Haskell': 'ghc*'}
f_e_again = {'PHP': 'php*', 'Ruby': 'yarv*'}

In [4]:
X_language_train = []
y_language_train = []

X_language_test = []
y_language_test = []

In [5]:
for lang, ext in file_exts.items():
    get_data = read_multiple_files('bench',ext)
    X_language_train += get_data
    y_language_train += ([lang]*len(get_data))

    if lang == 'PHP':
        get_data = read_multiple_files('bench','php*')
        X_language_train += get_data
        y_language_train += ([lang]*len(get_data))     #can't use .append

    if lang == 'Ruby':
        get_data = read_multiple_files('bench','yarv*')
        X_language_train += get_data
        y_language_train += ([lang]*len(get_data))


In [6]:
# X_language_train

In [7]:
# y_language_train

In [8]:
py_pipeline = Pipeline([("count", CV()), ("multi", MNB())])

In [9]:
# ("tfid", TF()),

In [10]:
py_pipeline.fit(X_language_train, y_language_train)


Pipeline(steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multi', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
py_pipeline.score(X_language_train,y_language_train)

0.99206349206349209

### ====================     TEST    ============================

In [12]:
for lang, ext in file_exts.items():
    get_data = read_multiple_files('bench2',ext)
    X_language_test += get_data
    y_language_test += ([lang]*len(get_data))

    if lang == 'PHP':
        get_data = read_multiple_files('bench2','php*')
        X_language_test += get_data
        y_language_test += ([lang]*len(get_data))     #can't use .append bc pipelines are fucking stupid

    if lang == 'Ruby':
        get_data = read_multiple_files('bench2','yarv*')
        X_language_test += get_data
        y_language_test += ([lang]*len(get_data))


In [13]:
py_pipeline.score(X_language_test,y_language_test)

0.94894894894894899

## TEST A SINGLE PROGRAM  ----------------------------

In [14]:
def test_one_program(file):
    data = []
    with open(file, encoding='latin_1') as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(f.read())
    return data

In [15]:
X_language_test = test_one_program('bench2/mandelbrot.clojure-5.clojure')
# X_language_test

In [16]:
py_pipeline.predict(X_language_test)   # this is a real Clojure program

array(['Clojure'], 
      dtype='<U11')

In [17]:
X_single_language_test = ''

for lang, ext in file_exts.items():
    get_data = read_multiple_files('test_one_each',ext)
    X_single_language_test = get_data

    if lang == 'PHP':
        print(lang,py_pipeline.predict(X_single_language_test))
        get_data = read_multiple_files('test_one_each','php*')
        X_single_language_test = get_data

    if lang == 'Ruby':
        print(lang,py_pipeline.predict(X_single_language_test))
        get_data = read_multiple_files('test_one_each','yarv*')
        X_single_language_test = get_data

    print(lang,py_pipeline.predict(X_single_language_test))

Scala ['Scala']
Python ['Clojure']
Clojure ['Clojure']
Scheme ['Scheme']
C# ['C#']
Common Lisp ['Common Lisp']
PHP ['PHP']
PHP ['PHP']
Java ['Java']
Ruby ['Ruby']
Ruby ['Ruby']
JavaScript ['JavaScript']
Perl ['Perl']
OCaml ['OCaml']
C ['C']
Haskell ['Haskell']


In [23]:
## I picked one random program from each language - all were BIG programs

##    TEST SINGLE LINES OF CODE   -----------------------

#### run 1 single line of code thru 

In [18]:
# X_line_of_language_test = "mult3 = filter(lambda x: x % 3 == 0)"
X_line_of_language_test = ["v2df Civ = { y*inverse_h-1.0, y*inverse_h-1.0 };"]
prob = np.mean(py_pipeline.predict_proba(X_line_of_language_test))
print(prob)

0.0714285714286


####  run thousands of lines thru

In [19]:
def read_multiple_files_but_single_lines(folder, file_ext):
    prob = 0
    num_correct = 0
    total_correct = 0
    files = glob.glob('{}/*.{}'.format(folder, file_ext))
    data = []
    for file in files:
        with open(file, encoding='latin_1') as f:
            reader = csv.reader(f, delimiter = '\n')
        
            for row in reader:
                try:
                    prob = np.mean(py_pipeline.predict_proba(row))
                except:
                    continue
#                 print(file_ext,":  ",py_pipeline.predict(row), prob)
                guess = str("".join(py_pipeline.predict(row)))

                if file_ext == file_exts[guess]:
                    num_correct += 1
                total_correct += 1
                
    return num_correct / total_correct

In [20]:
for lang, ext in file_exts.items():
    avg = read_multiple_files_but_single_lines('bench2', ext)
    print(lang, "Percentage of correct guesses:         ", avg)

Scala Percentage of correct guesses:          0.2685442284807616
Python Percentage of correct guesses:          0.147117296222664
Clojure Percentage of correct guesses:          0.42528735632183906
Scheme Percentage of correct guesses:          0.4810855263157895
C# Percentage of correct guesses:          0.2343954604975993
Common Lisp Percentage of correct guesses:          0.5647216633132126
PHP Percentage of correct guesses:          0.25161812297734626
Java Percentage of correct guesses:          0.31076312307013676
Ruby Percentage of correct guesses:          0.641319285387082
JavaScript Percentage of correct guesses:          0.06523534269199009
Perl Percentage of correct guesses:          0.34307692307692306
OCaml Percentage of correct guesses:          0.44659685863874343
C Percentage of correct guesses:          0.29395667046750285
Haskell Percentage of correct guesses:          0.3546187228766274


## correct around 32%

## guessing programs is much easier than just lines, for sure.

In [21]:
# ===========  get percetnages for programs...as opposed to just lines
def read_multiple_files_b(folder, file_ext):
    prob = 0
    num_correct = 0
    total_correct = 0
    files = glob.glob('{}/*.{}'.format(folder, file_ext))
    data = []
    for file in files:
        with open(file, encoding='latin_1') as f:
            data.append(f.read())
            prob = np.mean(py_pipeline.predict_proba(data))
            guess = str("".join(py_pipeline.predict(data)))
            if file_ext == file_exts[guess]:
                num_correct += 1
            total_correct += 1
            data = []
    return num_correct / total_correct

In [22]:
for lang, ext in file_exts.items():
    avg = read_multiple_files_b('bench2', ext)
    print(lang, ":   Percentage of correct guesses:               {:.4f}".format(avg))

Scala :   Percentage of correct guesses:               1.0000
Python :   Percentage of correct guesses:               0.8750
Clojure :   Percentage of correct guesses:               1.0000
Scheme :   Percentage of correct guesses:               1.0000
C# :   Percentage of correct guesses:               1.0000
Common Lisp :   Percentage of correct guesses:               1.0000
PHP :   Percentage of correct guesses:               0.8667
Java :   Percentage of correct guesses:               1.0000
Ruby :   Percentage of correct guesses:               0.9524
JavaScript :   Percentage of correct guesses:               0.6429
Perl :   Percentage of correct guesses:               0.8947
OCaml :   Percentage of correct guesses:               1.0000
C :   Percentage of correct guesses:               0.9697
Haskell :   Percentage of correct guesses:               1.0000


## much better

## correct around 87 %