In [1]:
import csv
import re
import pandas as pd
import numpy as np
import random
import glob

from collections import Counter, defaultdict, namedtuple
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

* C (.gcc, .c)
* C#
* Common Lisp (.sbcl)
* Clojure
* Haskell
* Java
* JavaScript
* OCaml
* Perl
* PHP (.hack, .php)
* Python
* Ruby (.jruby, .yarv)
* Scala
* Scheme (.racket)

In [2]:
def read_code(directory):
    files = glob.glob('data/{}/*.*'.format(directory))
    sample = []
    for file in files:
        with open(file) as f:
            sample.append(f.read())
    return sample

In [3]:
glob.glob('data/C/*.*')[:5]

['data/C/binarytrees.gcc',
 'data/C/binarytrees.gcc-2.gcc',
 'data/C/binarytrees.gcc-3.gcc',
 'data/C/binarytrees.gcc-5.gcc',
 'data/C/binarytrees.gcc-7.gcc']

In [4]:
c_sample = read_code('C')
csharp_sample = read_code('C#')
common_lisp_sample = read_code('Common_Lisp')
clojure_sample = read_code('Clojure')
haskell_sample = read_code('Haskell')
java_sample = read_code('Java')
javascript_sample = read_code('JavaScript')
ocaml_sample = read_code('OCaml')
perl_sample = read_code('Perl')
php_sample = read_code('PHP')
python_sample = read_code('Python')
ruby_sample = read_code('Ruby')
scala_sample = read_code('Scala')
scheme_sample = read_code('Scheme')


In [5]:
classifier = MultinomialNB()
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
vectorizer.fit(c_sample + csharp_sample + common_lisp_sample + clojure_sample + haskell_sample + java_sample + javascript_sample + ocaml_sample + perl_sample + php_sample + python_sample + ruby_sample + scala_sample + scheme_sample)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
X_train = vectorizer.transform(c_sample + csharp_sample + common_lisp_sample\
                               + clojure_sample + haskell_sample + java_sample\
                               + javascript_sample + ocaml_sample + perl_sample + php_sample\
                               + python_sample + ruby_sample + scala_sample\
                               + scheme_sample)

In [8]:
y_train = ['c'] * len(c_sample) + ['csharp'] * len(csharp_sample) + ['common_lisp'] \
        * len(common_lisp_sample) + ['clojure'] * len(clojure_sample) + ['haskell'] \
        * len(haskell_sample) + ['java'] * len(java_sample) + ['javascript'] * \
        len(javascript_sample) + ['ocaml'] * len(ocaml_sample) + \
        ['perl'] * len(perl_sample) + ['php'] * len(php_sample) + ['python']\
        * len(python_sample) + ['ruby'] * len(ruby_sample) + ['scala'] * len(scala_sample)\
        + ['scheme'] * len(scheme_sample)

In [9]:
# X_train

In [10]:
len(y_train)

587

In [11]:
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
classifier.score(X_train, y_train)

0.97444633730834751

In [13]:
print(classification_report(classifier.predict(X_train), y_train))

             precision    recall  f1-score   support

          c       1.00      0.98      0.99        59
    clojure       1.00      1.00      1.00        38
common_lisp       1.00      1.00      1.00        34
     csharp       0.98      0.98      0.98        41
    haskell       1.00      1.00      1.00        33
       java       0.98      0.91      0.94        55
 javascript       0.80      0.80      0.80        30
      ocaml       1.00      1.00      1.00        31
       perl       1.00      1.00      1.00        34
        php       0.95      0.96      0.95        55
     python       0.97      1.00      0.99        35
       ruby       0.97      1.00      0.99        71
      scala       0.98      1.00      0.99        42
     scheme       1.00      1.00      1.00        29

avg / total       0.97      0.97      0.97       587



In [14]:
len(vectorizer.get_feature_names())

8450

In [15]:
vectorizer.get_feature_names()[5000:5020]

['mutable',
 'mutablelist',
 'mutant',
 'mutates',
 'mutation',
 'mutex',
 'mux',
 'mv',
 'mvar',
 'mvbv',
 'mvbvs',
 'mvv',
 'mvvs',
 'mx',
 'my',
 'my_id',
 'my_lock_acquire',
 'my_param',
 'mykola',
 'myname']

### Testing

Testing using manual input

In [16]:
message_python = ['''from pkgutil import iter_modules
from subprocess import call

dependencies = {
    "Crypto": "crypto",
    "dpkt": "dpkt",
    "IPy": "ipy",
    "pcap": "pypcap"
}

installed, missing_pkgs = [pkg[1] for pkg in iter_modules()], []

for module, pkg in dependencies.items():
    if module not in installed:
        print("dshell requires {}".format(module))
        missing_pkgs.append("python-{}".format(pkg))
    else:
        print("{} is installed".format(module))

if missing_pkgs:
    cmd = ["sudo", "apt-get", "install"] + missing_pkgs

    print(" ".join(cmd))
    call(cmd)

call(["make", "all"])''']

In [17]:
vectorizer.transform(message_python)
classifier.predict(vectorizer.transform(message_python))

array(['python'], 
      dtype='<U11')

In [18]:
message_clojure = ['''defn cf-settings
  "Setup settings for campfire. Required information is your api-token, ssl connection
  true or false, and your campfire sub-domain."
  [token ssl sub-domain]
  {:api-token token, :ssl ssl, :sub-domain sub-domain})

(defn room
  "Sets up the room to send events too. Pass in the settings created with cf-settings
  and the room name"
  [settings room-name]
  (cf/room-by-name settings room-name))

(defn campfire_message
  "Formats an event into a string"
  [e]
  (str (join " " ["Riemann alert on" (str (:host e)) "-" (str (:service e)) "is" (upper-case (str (:state e))) "- Description:" (str (:description e))])))

(defn campfire
  "Creates an adaptor to forward events to campfire. The campfire event will
  contain the host, state, service, metric and description.
  Tested with:
  (streams
    (by [:host, :service]
      (let [camp (campfire \"token\", true, \"sub-domain\", \"room\")]
        camp)))"
  [token ssl sub-domain room-name]
  (fn [e]
    (let [message_string (campfire_message e)
          settings (cf-settings token ssl sub-domain)]
      (cf/message (room settings room-name) message_string))))''']

In [19]:
vectorizer.transform(message_clojure)
classifier.predict(vectorizer.transform(message_clojure))

array(['clojure'], 
      dtype='<U11')

In [20]:
message_js = ['''function errorHandler(context) {
  return function(error) {
    trace('Failure in ' + context + ': ' + error.toString);
  }
}

function successHandler(context) {
  return function() {
    trace('Success in ' + context);
  }
}

function noAction() {
}


function VideoPipe(stream, handler) {
  var servers = null;
  var pc1 = new RTCPeerConnection(servers);
  var pc2 = new RTCPeerConnection(servers);

  pc1.addStream(stream);
  pc1.onicecandidate = function(event) {
    if (event.candidate) {
      pc2.addIceCandidate(new RTCIceCandidate(event.candidate),
                          noAction, errorHandler('AddIceCandidate'));
    }
  }
  pc2.onicecandidate = function(event) {
    if (event.candidate) {
      pc1.addIceCandidate(new RTCIceCandidate(event.candidate),
                          noAction, errorHandler('AddIceCandidate'));
    }
  }
  pc2.onaddstream = function(e) {
    handler(e.stream);
  }
  pc1.createOffer(function(desc) {
    pc1.setLocalDescription(desc);
    pc2.setRemoteDescription(desc);
    pc2.createAnswer(function(desc2) {
      pc2.setLocalDescription(desc2);
      pc1.setRemoteDescription(desc2);
    }, errorHandler('pc2.createAnswer'));
  }, errorHandler('pc1.createOffer'));
  this.pc1 = pc1;
  this.pc2 = pc2;
}

VideoPipe.prototype.close = function() {
  this.pc1.close();
  this.pc2.close();
}''']

In [21]:
vectorizer.transform(message_js)
classifier.predict(vectorizer.transform(message_js))

array(['javascript'], 
      dtype='<U11')

In [22]:
message_ruby = ['''module ActiveJob
  module Core
    extend ActiveSupport::Concern

    included do
      # Job arguments
      attr_accessor :arguments
      attr_writer :serialized_arguments

      # Timestamp when the job should be performed
      attr_accessor :scheduled_at

      # Job Identifier
      attr_accessor :job_id

      # Queue in which the job will reside.
      attr_writer :queue_name
    end

    # These methods will be included into any Active Job object, adding
    # helpers for de/serialization and creation of job instances.
    module ClassMethods
      # Creates a new job instance from a hash created with +serialize+
      def deserialize(job_data)
        job = job_data['job_class'].constantize.new
        job.deserialize(job_data)
        job
      end

      # Creates a job preconfigured with the given options. You can call
      # perform_later with the job arguments to enqueue the job with the
      # preconfigured options
      #
      # ==== Options
      # * <tt>:wait</tt> - Enqueues the job with the specified delay
      # * <tt>:wait_until</tt> - Enqueues the job at the time specified
      # * <tt>:queue</tt> - Enqueues the job on the specified queue
      #
      # ==== Examples
      #
      #    VideoJob.set(queue: :some_queue).perform_later(Video.last)
      #    VideoJob.set(wait: 5.minutes).perform_later(Video.last)
      #    VideoJob.set(wait_until: Time.now.tomorrow).perform_later(Video.last)
      #    VideoJob.set(queue: :some_queue, wait: 5.minutes).perform_later(Video.last)
      #    VideoJob.set(queue: :some_queue, wait_until: Time.now.tomorrow).perform_later(Video.last)
      def set(options={})
        ConfiguredJob.new(self, options)
      end
    end

    # Creates a new job instance. Takes the arguments that will be
    # passed to the perform method.
    def initialize(*arguments)
      @arguments  = arguments
      @job_id     = SecureRandom.uuid
      @queue_name = self.class.queue_name
    end

    # Returns a hash with the job data that can safely be passed to the
    # queueing adapter.
    def serialize
      {
        'job_class'  => self.class.name,
        'job_id'     => job_id,
        'queue_name' => queue_name,
        'arguments'  => serialize_arguments(arguments)
      }
    end

    # Attaches the stored job data to the current instance. Receives a hash
    # returned from +serialize+
    #
    # ==== Examples
    #
    #    class DeliverWebhookJob < ActiveJob::Base
    #      def serialize
    #        super.merge('attempt_number' => (@attempt_number || 0) + 1)
    #      end
    #
    #      def deserialize(job_data)
    #        super
    #        @attempt_number = job_data['attempt_number']
    #      end
    #
    #      rescue_from(TimeoutError) do |exception|
    #        raise exception if @attempt_number > 5
    #        retry_job(wait: 10)
    #      end
    #    end
    def deserialize(job_data)
      self.job_id               = job_data['job_id']
      self.queue_name           = job_data['queue_name']
      self.serialized_arguments = job_data['arguments']
    end

    private
      def deserialize_arguments_if_needed
        if defined?(@serialized_arguments) && @serialized_arguments.present?
          @arguments = deserialize_arguments(@serialized_arguments)
          @serialized_arguments = nil
        end
      end

      def serialize_arguments(serialized_args)
        Arguments.serialize(serialized_args)
      end

      def deserialize_arguments(serialized_args)
        Arguments.deserialize(serialized_args)
      end
  end
end''']

In [23]:
vectorizer.transform(message_ruby)
classifier.predict(vectorizer.transform(message_ruby))

array(['ruby'], 
      dtype='<U11')

Testing using the test data

In [24]:
with open('test.csv') as f:
        y_open = f.read()

In [25]:
y_open = y_open.split('\n')

In [26]:
y_test = []
for x in y_open:
    y_test.append(x.split(','))

In [27]:
y_test = list(pd.DataFrame(y_test).pop(1))[:-1]

In [28]:
def read_test():
    X_data = []
    for x in range(1, 33):
         with open('test/{}'.format(x)) as f:
            X_data.append((f.read(), x))
    return X_data

In [29]:
X_data = [x[0] for x in read_test()]

In [30]:
len(X_data)

32

In [31]:
X_test = vectorizer.transform(X_data)

In [32]:
X_test

<32x8450 sparse matrix of type '<class 'numpy.int64'>'
	with 1701 stored elements in Compressed Sparse Row format>

In [33]:
# classifier.fit(X_test, y_test)

In [34]:
classifier.score(X_test, y_test)

0.65625

In [35]:
print(classification_report(classifier.predict(X_test), y_test))

             precision    recall  f1-score   support

          c       0.00      0.00      0.00         2
    clojure       1.00      0.50      0.67         8
    haskell       0.67      1.00      0.80         2
       java       0.50      1.00      0.67         1
 javascript       0.50      0.67      0.57         3
      ocaml       1.00      1.00      1.00         2
        php       0.33      0.50      0.40         2
     python       0.50      0.67      0.57         3
       ruby       0.67      0.67      0.67         3
      scala       1.00      1.00      1.00         2
     scheme       1.00      0.75      0.86         4
        tcl       0.00      0.00      0.00         0

avg / total       0.73      0.66      0.66        32



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
