In [14]:
import csv
import re
import pandas as pd
import numpy as np
import random
import glob

from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

* C (.gcc, .c)
* C#
* Common Lisp (.sbcl)
* Clojure
* Haskell
* Java
* JavaScript
* OCaml
* Perl
* PHP (.hack, .php)
* Python
* Ruby (.jruby, .yarv)
* Scala
* Scheme (.racket)

In [15]:
def read_code(directory):
    files = glob.glob('data/{}/*.*'.format(directory))
    sample = []
    for file in files:
        with open(file,) as f:
            sample.append(f.read())
    return sample

In [16]:
glob.glob('data/C/*.*')[:5]

['data/C/binarytrees.gcc',
 'data/C/binarytrees.gcc-2.gcc',
 'data/C/binarytrees.gcc-3.gcc',
 'data/C/binarytrees.gcc-5.gcc',
 'data/C/binarytrees.gcc-7.gcc']

In [17]:
c_sample = read_code('C')
csharp_sample = read_code('C#')
common_lisp_sample = read_code('Common_Lisp')
clojure_sample = read_code('Clojure')
haskell_sample = read_code('Haskell')
java_sample = read_code('Java')
javascript_sample = read_code('JavaScript')
# ocaml_sample = read_code('OCaml')
perl_sample = read_code('Perl')
php_sample = read_code('PHP')
python_sample = read_code('Python')
ruby_sample = read_code('Ruby')
scala_sample = read_code('Scala')
scheme_sample = read_code('Scheme')


In [18]:
classifier = MultinomialNB()
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
vectorizer.fit(c_sample + csharp_sample + common_lisp_sample + clojure_sample + haskell_sample + java_sample + javascript_sample + perl_sample + php_sample + python_sample + ruby_sample + scala_sample + scheme_sample)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
X_train = vectorizer.transform(c_sample + csharp_sample + common_lisp_sample\
                               + clojure_sample + haskell_sample + java_sample\
                               + javascript_sample + perl_sample + php_sample\
                               + python_sample + ruby_sample + scala_sample\
                               + scheme_sample)

In [21]:
y_train = ['c'] * len(c_sample) + ['csharp'] * len(csharp_sample) + ['common_lisp'] \
        * len(common_lisp_sample) + ['clojure'] * len(clojure_sample) + ['haskell'] \
        * len(haskell_sample) + ['java'] * len(java_sample) + ['javascript'] * \
        len(javascript_sample) + ['perl'] * len(perl_sample) + ['php'] * \
        len(php_sample) + ['python'] * len(python_sample) + ['ruby'] * len(ruby_sample) + \
        ['scala'] * len(scala_sample) + ['scheme'] * len(scheme_sample)

In [22]:
X_train

<556x8123 sparse matrix of type '<class 'numpy.int64'>'
	with 64060 stored elements in Compressed Sparse Row format>

In [23]:
len(y_train)

556

In [24]:
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
classifier.score(X_train, y_train)

0.9730215827338129

In [26]:
with open('test.csv') as f:
        y_open = f.read()

In [27]:
y_open = y_open.split('\n')

In [28]:
y_test = []
for pair in y_open:
    y_test.append(pair.split(','))

In [29]:
y_test = list(pd.DataFrame(y_test).pop(1))[:-1]

In [30]:
X_test = []
files = glob.glob('test/*')
for file in files:
    with open(file) as f:
        X_test.append(f.read())

In [31]:
X_test = vectorizer.transform(X_test)

In [32]:
classifier.fit(X_test, y_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
classifier.score(X_test, y_test)

0.875

In [34]:
print(classification_report(classifier.predict(X_test), y_test))

             precision    recall  f1-score   support

    clojure       1.00      0.57      0.73         7
    haskell       1.00      1.00      1.00         3
       java       0.00      0.00      0.00         0
 javascript       1.00      1.00      1.00         4
      ocaml       0.50      0.50      0.50         2
        php       0.67      1.00      0.80         2
     python       1.00      1.00      1.00         4
       ruby       1.00      1.00      1.00         3
      scala       1.00      1.00      1.00         2
     scheme       1.00      1.00      1.00         3
        tcl       1.00      1.00      1.00         2

avg / total       0.95      0.88      0.90        32



  'recall', 'true', average, warn_for)
