# Programming Language and File Format Detection

In [1]:
import pandas as pd

df = pd.read_csv('dataset.csv')

df.head()

Unnamed: 0,id,file_path,file_size,line_count,extenstion,language
0,1,./dataset\Markdown\000001.md,34784,572,md,Markdown
1,2,./dataset\XML\000002.props,3013,44,props,XML
2,3,./dataset\Text\000003.txt,1076,21,txt,Text
3,4,./dataset\Markdown\000004.md,8105,84,md,Markdown
4,5,./dataset\Markdown\000005.md,2763,41,md,Markdown


### exploring number of classes

In [2]:
print(len(df['extenstion'].unique()))
print(len(df['language'].unique()))

129
77


### Combining the 2 types of classes

In [3]:
df['extenstion/language'] = df['extenstion'] + '/' + df['language']
len(df['extenstion/language'].unique())

130

### Finding the "extra" class

In [4]:
extensions = {}
for el in df['extenstion/language'].unique():
    if el.split('/')[0] not in extensions:
        extensions[el.split('/')[0]] = el
    else:
        extensions[el.split('/')[0]] += '\n' + el
        print(extensions[el.split('/')[0]])


h/C
h/C++


### Checking the available data

In [5]:
df['extenstion/language'].value_counts()

dart/Dart         15345
rs/Rust           14128
cs/C#              9985
go/Go              9124
json/JSON          5545
                  ...  
nuspec/XML            1
cgi/Shell             1
iml/XML               1
mm/XML                1
escript/Erlang        1
Name: extenstion/language, Length: 130, dtype: int64

### Removing classes with insufficient samples

In [6]:
df = df[(df['extenstion/language'].value_counts() > 20)[df['extenstion/language']].values]
len(df['extenstion/language'].unique())

60

### Loading the actual data

In [7]:
def readfile(row):
    filepath = row['file_path']
    try:
        with open(filepath) as f:
            text = f.read()
        return text
    except:
        print(filepath)
        return ''

raw_samples = df.apply(readfile, axis=1)

samples = pd.DataFrame()

samples['text'] = raw_samples[raw_samples != '']
samples['extenstion/language'] = df['extenstion/language'][raw_samples != '']
samples['file_path'] = df['file_path'][raw_samples != '']



./dataset\JSON\001531.json
./dataset\JSON\001534.json
./dataset\JSON\002325.json
./dataset\C#\002449.cs
./dataset\C#\002454.cs
./dataset\JSON\002471.json
./dataset\JSON\002578.json
./dataset\JSON\002582.json
./dataset\JSON\003081.json
./dataset\C#\003808.cs
./dataset\C#\004404.cs
./dataset\C#\005094.cs
./dataset\JSON\011572.json
./dataset\JSON\011790.json
./dataset\Markdown\015536.md
./dataset\Markdown\015549.md
./dataset\Markdown\015781.md
./dataset\C\016130.c
./dataset\Markdown\016150.md
./dataset\C++\016164.cpp
./dataset\C++\016256.cpp
./dataset\Markdown\016503.md
./dataset\Markdown\017070.md
./dataset\Markdown\017071.md
./dataset\Markdown\017072.md
./dataset\Markdown\017073.md
./dataset\Markdown\017075.md
./dataset\Markdown\017076.md
./dataset\Markdown\017077.md
./dataset\Markdown\017078.md
./dataset\Markdown\017079.md
./dataset\Markdown\017080.md
./dataset\Markdown\017081.md
./dataset\Markdown\017082.md
./dataset\Markdown\017083.md
./dataset\Markdown\017086.md
./dataset\Markdown\0

### Sampling the data for training

In [30]:
from sklearn.utils import resample
def generate_samples(samples, n_samples=20):
    sample_list = []
    for ex in samples['extenstion/language'].unique():
        sample_list.append(resample(samples[samples['extenstion/language'] == ex], n_samples=n_samples))

    return pd.concat(sample_list)

small_sample = generate_samples(samples, 100)
small_sample

Unnamed: 0,text,extenstion/language,file_path
42181,This is a living document and at times it will...,md/Markdown,./dataset\Markdown\042182.md
69847,"# Lint levels\n\nIn `rustc`, lints are divided...",md/Markdown,./dataset\Markdown\069848.md
47426,# printf() and stdio in the Julia runtime\n\n#...,md/Markdown,./dataset\Markdown\047427.md
17474,# Pattern Matching for C# 7\n\nPattern matchin...,md/Markdown,./dataset\Markdown\017475.md
47446,# [Conversion and Promotion](@id conversion-an...,md/Markdown,./dataset\Markdown\047447.md
...,...,...,...
84222,#version 450\n\nstruct Test\n{\n int empty_...,vert/GLSL,./dataset\GLSL\084223.vert
84334,; SPIR-V\n; Version: 1.0\n; Generator: Khronos...,vert/GLSL,./dataset\GLSL\084335.vert
84337,; SPIR-V\n; Version: 1.0\n; Generator: Khronos...,vert/GLSL,./dataset\GLSL\084338.vert
83049,#version 100\n\nstruct UBO\n{\n int func_ar...,vert/GLSL,./dataset\GLSL\083050.vert


### Vectorisation for training

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
vec.fit(small_sample['text'])
small_sample = generate_samples(samples, 20)
X = vec.transform(small_sample['text'])
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())



Unnamed: 0,00,000,0000,000000,00000000,0000000000,000000000000000,00000000000000000000000000000000000000000000000000,0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,00000000000000001111111111111111,...,ðÿ,ðÿœ,ðÿž,ðÿž¾,òî,úlo,ûm,ülo,œâ,ƒâ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training the classifier

In [53]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(X, small_sample['extenstion/language'])
for i in range(20):
    small_sample = generate_samples(samples, 20)
    X = vec.transform(small_sample['text'])
    model.partial_fit(X, small_sample['extenstion/language'])

### Testing on random samples from the dataset

In [60]:
from sklearn.metrics import accuracy_score
test_sample = generate_samples(samples, 10)

predicted = model.predict(vec.transform(test_sample['text']))
predicted
accuracy_score(test_sample['extenstion/language'], predicted)

0.92

### testing how many lines of code needed for correct detection

In [61]:
test_file = ''
count = 1   
with open('dataset/Python/017741.py') as f:
    while True:
        line = f.readline()
        if not line:
            break
        test_file += line
        print(count, model.predict(vec.transform([test_file]))[0])
        count += 1

1 dart/Dart
2 go/Go
3 go/Go
4 go/Go
5 go/Go
6 go/Go
7 go/Go
8 py/Python
9 py/Python
10 py/Python
11 py/Python
12 py/Python
13 py/Python
14 py/Python
15 py/Python
16 py/Python
17 py/Python
18 py/Python
19 py/Python
20 py/Python
21 py/Python
22 py/Python
23 py/Python
24 py/Python
25 py/Python
26 py/Python
27 py/Python
28 py/Python
29 py/Python
30 py/Python
31 py/Python
32 py/Python
33 py/Python
34 py/Python
35 py/Python
36 py/Python
37 py/Python
38 py/Python
39 py/Python
40 py/Python
41 py/Python
42 py/Python
43 py/Python
44 py/Python
45 py/Python
46 py/Python
47 py/Python
48 py/Python
49 py/Python
50 py/Python
51 py/Python
52 py/Python
53 py/Python
54 py/Python
55 py/Python
56 py/Python
57 py/Python
58 py/Python


In [63]:
def count_lines(test_sample):
    test_file = ''
    count = 1   
    with open(test_sample['file_path']) as f:
        while True:
            line = f.readline()
            if not line:
                break
            test_file += line
            if model.predict(vec.transform([test_file]))[0] == test_sample['extenstion/language']:
                return count
            count += 1
    return count

test_sample_2 = test_sample[test_sample['extenstion/language'] == predicted]
results = test_sample_2.apply(count_lines, axis=1)

In [64]:
import numpy as np
print(np.min(results))
print(np.mean(results))
print(np.max(results))

1
3.335144927536232
128


In [65]:

from joblib import dump, load
dump(model, 'model.joblib')
dump(vec, 'vec.joblib')

['vec.joblib']