In [1]:
import csv
import re
import pandas as pd
import numpy as np
import random
import glob

from collections import Counter, defaultdict, namedtuple
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

from programming_language_classifier import *

In [2]:
vectorizer = CountVectorizer()

In [3]:
def read_code(directory):
    files = glob.glob('data/{}/*.*'.format(directory))
    sample = []
    for file in files:
        with open(file) as f:
            sample.append(f.read())
    return sample

In [4]:
glob.glob('data/C/*.*')[:5]

['data/C/binarytrees.gcc',
 'data/C/binarytrees.gcc-2.gcc',
 'data/C/binarytrees.gcc-3.gcc',
 'data/C/binarytrees.gcc-5.gcc',
 'data/C/binarytrees.gcc-7.gcc']

In [5]:
c_sample = read_code('C')
csharp_sample = read_code('C#')
common_lisp_sample = read_code('Common_Lisp')
clojure_sample = read_code('Clojure')
haskell_sample = read_code('Haskell')
java_sample = read_code('Java')
javascript_sample = read_code('JavaScript')
# ocaml_sample = read_code('OCaml')
perl_sample = read_code('Perl')
php_sample = read_code('PHP')
python_sample = read_code('Python')
ruby_sample = read_code('Ruby')
scala_sample = read_code('Scala')
scheme_sample = read_code('Scheme')

In [6]:
X_train = (c_sample + csharp_sample + common_lisp_sample + clojure_sample \
           + haskell_sample + java_sample + javascript_sample + perl_sample + php_sample \
           + python_sample + ruby_sample + scala_sample + scheme_sample)

In [7]:
y_train = ['c'] * len(c_sample) + ['csharp'] * len(csharp_sample) + ['common_lisp'] \
        * len(common_lisp_sample) + ['clojure'] * len(clojure_sample) + ['haskell'] \
        * len(haskell_sample) + ['java'] * len(java_sample) + ['javascript'] * \
        len(javascript_sample) + ['perl'] * len(perl_sample) + ['php'] * \
        len(php_sample) + ['python'] * len(python_sample) + ['ruby'] * len(ruby_sample) + \
        ['scala'] * len(scala_sample) + ['scheme'] * len(scheme_sample)

In [8]:
with open('test.csv') as f:
        y_data = f.read()

In [9]:
y_data = y_data.split('\n')

In [10]:
y_test = []
for x in y_data:
    y_test.append(x.split(','))

In [11]:
y_test = list(pd.DataFrame(y_test).pop(1))[:-1]

In [12]:
def read_test():
    X_data = []
    for x in range(1, 33):
         with open('test/{}'.format(x)) as f:
            X_data.append((f.read(), x))
    return X_data

In [13]:
X_test = [x[0] for x in read_test()]

In [14]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
        
        return np.array(feature_vectors)

In [15]:
from textblob import TextBlob

from collections import Counter


class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [17]:
lang_featurizer = make_union(
    BagOfWordsFeaturizer(100), 
    FunctionFeaturizer(percentage_of_parenthesis, percentage_of_punctuation,
                       percentage_of_colon, percentage_of_def, percentage_of_at,
                       percentage_of_star, percent_start_end_parenthesis, percentage_end_sc,
                       word_function, word_nil, word_null, word_var, percentage_ds, word_elsif,
                       word_tl, word_bool, hashes, word_final, percentage
                      ))


In [18]:
pipe = make_pipeline(lang_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

1.0

Testing

In [19]:
pipe.score(X_test, y_test)

0.53125

In [20]:
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

          c       0.00      0.00      0.00         1
    clojure       0.25      0.50      0.33         2
     csharp       0.00      0.00      0.00         3
    haskell       0.33      1.00      0.50         1
       java       0.00      0.00      0.00         0
 javascript       0.75      0.75      0.75         4
      ocaml       0.00      0.00      0.00         0
        php       1.00      0.60      0.75         5
     python       0.50      1.00      0.67         2
       ruby       1.00      0.30      0.46        10
      scala       1.00      1.00      1.00         2
     scheme       0.67      1.00      0.80         2
        tcl       0.00      0.00      0.00         0

avg / total       0.72      0.53      0.55        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [21]:
message_python = ['''from pkgutil import iter_modules
from subprocess import call

dependencies = {
    "Crypto": "crypto",
    "dpkt": "dpkt",
    "IPy": "ipy",
    "pcap": "pypcap"
}

installed, missing_pkgs = [pkg[1] for pkg in iter_modules()], []

for module, pkg in dependencies.items():
    if module not in installed:
        print("dshell requires {}".format(module))
        missing_pkgs.append("python-{}".format(pkg))
    else:
        print("{} is installed".format(module))

if missing_pkgs:
    cmd = ["sudo", "apt-get", "install"] + missing_pkgs

    print(" ".join(cmd))
    call(cmd)

call(["make", "all"])''']

In [22]:
pipe.predict(message_python)

array(['python'], 
      dtype='<U11')

In [23]:
message_clojure = ['''(ns my-cli.core)

(defn -main [& args]
  (println "My CLI received arguments:" args))

(defn add-main [& args]
  (->> (map #(Integer/parseInt %) args)
       (reduce + 0)
       (println "The sum is:")))''']

In [24]:
pipe.predict(message_clojure)

array(['clojure'], 
      dtype='<U11')

In [25]:
message_js = ['''function errorHandler(context) {
  return function(error) {
    trace('Failure in ' + context + ': ' + error.toString);
  }
}

function successHandler(context) {
  return function() {
    trace('Success in ' + context);
  }
}

function noAction() {
}


function VideoPipe(stream, handler) {
  var servers = null;
  var pc1 = new RTCPeerConnection(servers);
  var pc2 = new RTCPeerConnection(servers);

  pc1.addStream(stream);
  pc1.onicecandidate = function(event) {
    if (event.candidate) {
      pc2.addIceCandidate(new RTCIceCandidate(event.candidate),
                          noAction, errorHandler('AddIceCandidate'));
    }
  }
  pc2.onicecandidate = function(event) {
    if (event.candidate) {
      pc1.addIceCandidate(new RTCIceCandidate(event.candidate),
                          noAction, errorHandler('AddIceCandidate'));
    }
  }
  pc2.onaddstream = function(e) {
    handler(e.stream);
  }
  pc1.createOffer(function(desc) {
    pc1.setLocalDescription(desc);
    pc2.setRemoteDescription(desc);
    pc2.createAnswer(function(desc2) {
      pc2.setLocalDescription(desc2);
      pc1.setRemoteDescription(desc2);
    }, errorHandler('pc2.createAnswer'));
  }, errorHandler('pc1.createOffer'));
  this.pc1 = pc1;
  this.pc2 = pc2;
}

VideoPipe.prototype.close = function() {
  this.pc1.close();
  this.pc2.close();
}''']

In [26]:
pipe.predict(message_js)

array(['javascript'], 
      dtype='<U11')

In [27]:
message_ruby = ['''module ActiveJob
  module Core
    extend ActiveSupport::Concern

    included do
      # Job arguments
      attr_accessor :arguments
      attr_writer :serialized_arguments

      # Timestamp when the job should be performed
      attr_accessor :scheduled_at

      # Job Identifier
      attr_accessor :job_id

      # Queue in which the job will reside.
      attr_writer :queue_name
    end

    # These methods will be included into any Active Job object, adding
    # helpers for de/serialization and creation of job instances.
    module ClassMethods
      # Creates a new job instance from a hash created with +serialize+
      def deserialize(job_data)
        job = job_data['job_class'].constantize.new
        job.deserialize(job_data)
        job
      end

      # Creates a job preconfigured with the given options. You can call
      # perform_later with the job arguments to enqueue the job with the
      # preconfigured options
      #
      # ==== Options
      # * <tt>:wait</tt> - Enqueues the job with the specified delay
      # * <tt>:wait_until</tt> - Enqueues the job at the time specified
      # * <tt>:queue</tt> - Enqueues the job on the specified queue
      #
      # ==== Examples
      #
      #    VideoJob.set(queue: :some_queue).perform_later(Video.last)
      #    VideoJob.set(wait: 5.minutes).perform_later(Video.last)
      #    VideoJob.set(wait_until: Time.now.tomorrow).perform_later(Video.last)
      #    VideoJob.set(queue: :some_queue, wait: 5.minutes).perform_later(Video.last)
      #    VideoJob.set(queue: :some_queue, wait_until: Time.now.tomorrow).perform_later(Video.last)
      def set(options={})
        ConfiguredJob.new(self, options)
      end
    end

    # Creates a new job instance. Takes the arguments that will be
    # passed to the perform method.
    def initialize(*arguments)
      @arguments  = arguments
      @job_id     = SecureRandom.uuid
      @queue_name = self.class.queue_name
    end

    # Returns a hash with the job data that can safely be passed to the
    # queueing adapter.
    def serialize
      {
        'job_class'  => self.class.name,
        'job_id'     => job_id,
        'queue_name' => queue_name,
        'arguments'  => serialize_arguments(arguments)
      }
    end

    # Attaches the stored job data to the current instance. Receives a hash
    # returned from +serialize+
    #
    # ==== Examples
    #
    #    class DeliverWebhookJob < ActiveJob::Base
    #      def serialize
    #        super.merge('attempt_number' => (@attempt_number || 0) + 1)
    #      end
    #
    #      def deserialize(job_data)
    #        super
    #        @attempt_number = job_data['attempt_number']
    #      end
    #
    #      rescue_from(TimeoutError) do |exception|
    #        raise exception if @attempt_number > 5
    #        retry_job(wait: 10)
    #      end
    #    end
    def deserialize(job_data)
      self.job_id               = job_data['job_id']
      self.queue_name           = job_data['queue_name']
      self.serialized_arguments = job_data['arguments']
    end

    private
      def deserialize_arguments_if_needed
        if defined?(@serialized_arguments) && @serialized_arguments.present?
          @arguments = deserialize_arguments(@serialized_arguments)
          @serialized_arguments = nil
        end
      end

      def serialize_arguments(serialized_args)
        Arguments.serialize(serialized_args)
      end

      def deserialize_arguments(serialized_args)
        Arguments.deserialize(serialized_args)
      end
  end
end''']

In [28]:
pipe.predict(message_ruby)

array(['ruby'], 
      dtype='<U11')