In [0]:
# import necessary packages
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder

import re
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, classification_report, accuracy_score
from sklearn.pipeline import FeatureUnion

**Data Preparation**

In [0]:
# load data
def load_data(file_name='./sampleCode.txt'):
    """Read the raw sample file and return its lines.

    Parameters
    ----------
    file_name : str, optional
        Path of the input code file. Defaults to ``'./sampleCode.txt'``,
        the location the notebook has always read from.

    Returns
    -------
    list of str
        Every line of the file, each still carrying its line terminator.
    """
    # The context manager closes the handle even if reading raises, so the
    # notebook no longer leaks an open file on every re-run of this cell.
    with open(file_name, 'r') as data:
        return data.readlines()

# clean data
def clean_data(input_code):
    """Extract labelled code snippets from ``<pre lang="...">`` blocks.

    Parameters
    ----------
    input_code : str
        Text containing zero or more ``<pre ...> ... </pre>`` blocks.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(snippets, languages)``; the two lists are index-aligned.
        ``snippets[i]`` is the block's text with ``&lt;``/``&gt;`` decoded and
        the ``<pre lang="...">`` / ``</pre>`` tags and newlines removed, and
        ``languages[i]`` is the value of that block's ``lang`` attribute.
        Blocks without a ``<pre lang="...">`` tag cannot be labelled and are
        skipped, so they never appear in either list.
    """
    pre_block_pattern = r'<pre[\s\S]*?<\/pre>'
    lang_tag_pattern = r'<pre lang="(.*?)">'

    snippets = []
    languages = []
    for block in re.findall(pre_block_pattern, input_code):
        # Decode the two escaped angle brackets, then drop the closing tag
        # and flatten the snippet onto a single line.
        # NOTE(review): newlines are deleted rather than replaced by a space,
        # so tokens on adjacent unindented lines get glued together -- confirm
        # this is what the vectorizer should see.
        text = (
            block.replace('&lt;', '<')
            .replace('&gt;', '>')
            .replace('</pre>', '')
            .replace('\n', '')
        )

        lang_match = re.search(lang_tag_pattern, text)
        if lang_match is None:
            # Previously this case raised IndexError and aborted the whole
            # run; an unlabelled block is useless for training, so skip it.
            continue

        languages.append(lang_match.group(1))
        # Strip every opening tag so only the code itself remains.
        snippets.append(re.sub(lang_tag_pattern, '', text))

    return (snippets, languages)

In [0]:
# Collapse the file's lines into one string so the <pre> regex can span lines,
# then split it into index-aligned snippet and language lists.
all_samples = ''.join(load_data())
cleaned_data, languages = clean_data(all_samples)

# Assemble the frame in a single step; column order is language, then data.
df = pd.DataFrame({'language': languages, 'data': cleaned_data})

In [0]:
# Display the assembled frame: one row per extracted snippet with its language tag.
df

Unnamed: 0,language,data
0,cpp,#include <iostream> using namespace std; i...
1,Swift,@objc func handleTap(sender: UITapGestureRecog...
2,Java,import java.applet.*;import java.awt.*;public ...
3,JavaScript,"var my_dataset = [ { id: ""1"", te..."
4,cpp,#include <iostream> using namespac...
5,Java,class Integers { public static void m...
6,JavaScript,"function myFunction() { var str = ""Hello Code..."
7,JavaScript,"function palin(){var a,no,b,temp=0;no=Number(d..."
8,Java,class Condition { public static void ...
9,cpp,#include <iostream> using namespac...


In [0]:
# Learn the set of language names first, then map each row's language to its
# integer code and store it alongside the original string column.
label_enc = LabelEncoder().fit(df['language'])
df['language_label'] = label_enc.transform(df['language'])

In [0]:
# Spot-check the first rows to confirm the new language_label column lines up
# with the language column.
df.head()

Unnamed: 0,language,data,language_label
0,cpp,#include <iostream> using namespace std; i...,3
1,Swift,@objc func handleTap(sender: UITapGestureRecog...,2
2,Java,import java.applet.*;import java.awt.*;public ...,0
3,JavaScript,"var my_dataset = [ { id: ""1"", te...",1
4,cpp,#include <iostream> using namespac...,3


In [0]:
# classes_ lists the language names in label order: position i is the name
# encoded as integer i in the language_label column.
label_enc.classes_

array(['Java', 'JavaScript', 'Swift', 'cpp'], dtype=object)

In [0]:
#create models
def create_models(random_state=None):
    """Build one fresh, unfitted instance of every classifier to compare.

    Parameters
    ----------
    random_state : int or None, optional
        Seed forwarded to every estimator that accepts one, so repeated runs
        can produce the same fitted models. ``None`` (the default) leaves each
        estimator on its own default, which is the previous behaviour.

    Returns
    -------
    dict
        Maps a display name to its estimator. Insertion order is LinearSVC,
        LogisticRegression, RandomForestClassifier, DecisionTreeClassifier,
        MultinomialNB.
    """
    return {
        'LinearSVC': LinearSVC(random_state=random_state),
        'LogisticRegression': LogisticRegression(random_state=random_state),
        'RandomForestClassifier': RandomForestClassifier(random_state=random_state),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=random_state),
        # MultinomialNB has no random_state parameter, so nothing to forward.
        'MultinomialNB': MultinomialNB(),
    }

In [0]:
# shuffle and split dataset
# train_test_split shuffles again on its own, so seeding only shuffle() left
# the final partition different on every run. One seed now drives both steps.
SPLIT_SEED = 7
X_input, y_input = shuffle(df['data'], df['language_label'], random_state=SPLIT_SEED)

# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only 30%.
# If a 70/30 train/test split was intended this should be test_size=0.3 -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_input, y_input, test_size=0.7, random_state=SPLIT_SEED
)

In [0]:
# function to calculate accuracy
def calculate_accuracy(actual_y, predicted_y, model_name, train_time, predict_time):
    """Print a timing and quality summary for one model.

    Writes the model name, the train and predict durations rounded to two
    decimals, the accuracy to four decimals, and the full classification
    report (four digits) to stdout, followed by a separator line.
    Nothing is returned.
    """
    header = 'Model Name: ' + model_name
    print(header)

    # Both durations are reported the same way, so loop over the pair.
    for label, seconds in (('Train time: ', train_time), ('Predict time: ', predict_time)):
        print(label, round(seconds, 2))

    score = accuracy_score(actual_y, predicted_y)
    print('Model Accuracy: {:.4f}'.format(score))
    print('')

    report = classification_report(actual_y, predicted_y, digits=4)
    print(report)
    print("=======================================================")

In [0]:

def test_models(X_train, y_train, X_test, y_test, models):
    """Vectorise the text, then fit, time and report every model.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw code snippets for training and evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.
    models : dict
        Maps a display name to an unfitted estimator. Each estimator is
        fitted in place, so the caller's objects are modified.

    Returns
    -------
    tuple
        ``(trained_models, vectorizer)``: a dict with the same names mapped to
        the now-fitted estimators, and the fitted feature extractor needed to
        transform new text the same way.
    """
    vectorizer = FeatureUnion([('tfidf_vect', TfidfVectorizer())])

    # Learn the vocabulary from the training text only, then reuse it for the
    # test text. Separate names keep the raw inputs intact.
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    trained_models = {}
    for model_name, model in models.items():
        # perf_counter is monotonic and high resolution; time.time() can jump
        # with clock adjustments and is too coarse for sub-second fits.
        fit_start = time.perf_counter()
        model.fit(train_features, y_train)
        fit_end = time.perf_counter()
        predicted_y = model.predict(test_features)
        predict_end = time.perf_counter()

        calculate_accuracy(
            y_test,
            predicted_y,
            model_name,
            fit_end - fit_start,
            predict_end - fit_end,
        )
        trained_models[model_name] = model

    return (trained_models, vectorizer)

In [0]:
# Build the candidate classifiers, then fit and report each one on the split.
# Keep both the fitted models and the fitted vectorizer that test_models returns.
models = create_models()
trained_models, fitted_vectorizer = test_models(X_train, y_train, X_test, y_test, models)

Model Name: LinearSVC
Train time:  0.0
Predict time:  0.0
Model Accuracy: 0.3000

              precision    recall  f1-score   support

           0     0.1429    1.0000    0.2500         1
           1     0.0000    0.0000    0.0000         4
           2     0.0000    0.0000    0.0000         3
           3     0.6667    1.0000    0.8000         2

    accuracy                         0.3000        10
   macro avg     0.2024    0.5000    0.2625        10
weighted avg     0.1476    0.3000    0.1850        10

Model Name: LogisticRegression
Train time:  0.0
Predict time:  0.0
Model Accuracy: 0.2000

              precision    recall  f1-score   support

           0     0.1111    1.0000    0.2000         1
           1     0.0000    0.0000    0.0000         4
           2     0.0000    0.0000    0.0000         3
           3     1.0000    0.5000    0.6667         2

    accuracy                         0.2000        10
   macro avg     0.2778    0.3750    0.2167        10
weighted avg

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
