In [7]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np

In [8]:
# Load model and features
model = CatBoostClassifier()
model.load_model("stylometry_classifier.cbm")

<catboost.core.CatBoostClassifier at 0x106e47f20>

In [9]:
samples = pd.read_csv("test_samples.csv", index_col='user_id')

In [10]:
samples

Unnamed: 0_level_0,javaKeywords_enum,ASTNodeTypesTF_EnumConstantDeclaration,ASTNodeBigramsTF_ClassDeclaration_EnumDeclaration,ASTNodeBigramsTF_SwitchStatementCase_LocalVariableDeclaration,ASTNodeBigramsTF_Assignment_TernaryExpression,ASTNodeTypesTF_SwitchStatement,ASTNodeBigramsTF_SwitchStatementCase_ReturnStatement,ASTNodeBigramsTF_ConstructorDeclaration_LocalVariableDeclaration,ASTNodeBigramsTF_ClassCreator_This,ASTNodeBigramsTF_SwitchStatementCase_SwitchStatement,...,ASTNodeBigramsTF_CompilationUnit_ClassDeclaration,ASTNodeBigramsTF_VariableDeclarator_ClassCreator,ASTNodeBigramsTF_BlockStatement_LocalVariableDeclaration,whiteSpaceRatio,ASTNodeBigramsTF_FormalParameter_ReferenceType,ASTNodeBigramsTF_MethodDeclaration_FormalParameter,ASTNodeTypesTF_MethodDeclaration,ASTNodeTypesTF_FormalParameter,ASTNodeBigramsTF_ClassDeclaration_MethodDeclaration,ln(numSpaces/length)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59-274,,,,,,,,,,,...,0.003774,0.003774,0.022642,0.413545,0.011321,0.015094,0.011278,0.015038,0.011321,0.065749
54-119,,,,,,0.005291,0.026596,,,,...,0.005319,0.010638,0.031915,0.703499,0.005319,0.026596,0.021164,0.026455,0.021277,0.380541
13-779,,,,0.006897,,0.006873,,,,,...,0.003448,0.003448,0.044828,1.382022,0.003448,0.003448,0.003436,0.003436,0.003448,0.501814
75-597,,,,,,,,,,,...,0.004444,0.004444,0.004444,0.523985,0.004444,0.004444,0.004425,0.004425,0.004444,0.19209
67-736,,,,,,,,,,,...,0.008403,0.008403,0.033613,0.274691,0.008403,0.008403,0.008333,0.008333,0.008403,0.09201
32-49,,,,,,,,,,,...,0.004082,0.004082,0.012245,1.111619,0.016327,0.020408,0.01626,0.020325,0.016327,0.457106
52-337,,,,,0.002584,,,,,,...,0.002584,0.010336,0.010336,0.472488,0.020672,0.03876,0.018041,0.043814,0.018088,0.115353
68-97,,,,,,,,,,,...,0.004386,0.004386,0.048246,0.4266,0.004386,0.004386,0.004367,0.004367,0.004386,0.098505


In [11]:
model.predict(samples)

array([[59],
       [54],
       [13],
       [75],
       [67],
       [32],
       [52],
       [68]])

In [716]:
# Get feature importance
feature_importance = model.get_feature_importance(type="PredictionValuesChange")

In [717]:
# Map feature importance to feature names
feature_names = samples.columns  # Assuming X is a pandas DataFrame
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Display top features
top_features = sorted_features[:10]  # Change 10 to the desired number of top features
top_features

[('tabsLeadLines', 2.9709930778314684),
 ('javaKeywords_throws', 2.8120819926236806),
 ('WordUnigramTF_io', 2.740204039279866),
 ('stdDevLineLength', 2.6384530163904905),
 ('ASTNodeBigramsTF_ClassDeclaration_FieldDeclaration', 2.6270131536549353),
 ('javaKeywords_private', 2.5478591296779345),
 ('WordUnigramTF_close', 2.311729848159934),
 ('ln(numEmptyLines/length)', 2.1799077091221797),
 ('ASTNodeBigramsTF_BlockStatement_BreakStatement', 2.104966457426089),
 ('ln(num_void/length)', 1.9854183801058114)]

In [718]:
vector1 = samples.iloc[0].fillna(0).values
vector2 = samples.iloc[1].fillna(0).values

In [719]:
from scipy.spatial.distance import cosine
cosine_similarity = 1 - cosine(vector1, vector2)
cosine_similarity

0.9176818238807076

In [720]:
# Initialize an empty list to store the decrease in cosine similarity for each feature
decreases = []

In [721]:
# Loop over each feature index
for i in range(len(vector1)):
    # Create copies of the vectors
    temp_vector1 = vector1.copy()
    temp_vector2 = vector2.copy()
    
    # Zero out the current feature
    temp_vector1[i] = 0
    temp_vector2[i] = 0
    
    # Compute the new cosine similarity
    temp_cosine_similarity = 1 - cosine(temp_vector1, temp_vector2)
    
    # Handle potential division by zero (cosine distance may return NaN)
    if np.isnan(temp_cosine_similarity):
        temp_cosine_similarity = 0
    
    # Calculate the decrease in cosine similarity
    decrease = cosine_similarity - temp_cosine_similarity
    
    # Append the result as a tuple (feature index, decrease)
    decreases.append((i, decrease))

In [722]:
# Assuming you have a DataFrame 'df' from which you extracted 'vector1' and 'vector2'
# Extract the feature names (column names)
feature_names = samples.columns.tolist()

# Convert the list of decreases into a DataFrame
decrease_df = pd.DataFrame(decreases, columns=['Feature Index', 'Decrease in Cosine Similarity'])

# Add feature names to the DataFrame
decrease_df['Feature Name'] = decrease_df['Feature Index'].apply(lambda x: feature_names[x])

# Reorder columns for better readability
decrease_df = decrease_df[['Feature Index', 'Feature Name', 'Decrease in Cosine Similarity']]

In [723]:
# Sort the DataFrame to find the features causing the biggest decrease
decrease_df_sorted = decrease_df.sort_values(by='Decrease in Cosine Similarity', ascending=False)
top_10_features = decrease_df_sorted.head(10)
top_10_features

Unnamed: 0,Feature Index,Feature Name,Decrease in Cosine Similarity
1428,1428,avgLineLength,0.036234
1446,1446,stdDevLineLength,0.018676
1366,1366,MaxDepthASTNode,0.008835
1394,1394,ln(num_class/length),0.002095
1369,1369,ln(num_private/length),0.002035
1451,1451,ln(num_for/length),0.001737
1483,1483,ln(num_else/length),0.001591
1404,1404,ln(num_void/length),0.001543
1452,1452,ln(num_static/length),0.001339
1430,1430,ln(numFunctions/length),0.001339


In [724]:
# Assuming 'samples' is a DataFrame and 'top_10_features' is defined as described
first_two_samples = samples[top_10_features['Feature Name'].to_list()].iloc[[0,1]]
first_two_samples

Unnamed: 0_level_0,avgLineLength,stdDevLineLength,MaxDepthASTNode,ln(num_class/length),ln(num_private/length),ln(num_for/length),ln(num_else/length),ln(num_void/length),ln(num_static/length),ln(numFunctions/length)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
59-274,23.848101,18.586158,18.0,-7.58172,-6.888572,-6.195425,-5.972282,-7.58172,-6.483107,-6.483107
54-119,29.344262,21.702825,16.0,-7.522941,-7.522941,-7.522941,-7.522941,-6.829794,-6.136647,-6.136647


In [725]:
from features import calculate_features_for_files, build_dataset

In [726]:
snippet_119 = '\nimport java.io.BufferedReader;\nimport java.io.InputStreamReader;\nimport java.util.Scanner;\nimport java.util.HashMap;\n\npublic class Solution {\n    public static void main(String[] args) {\n        Scanner in = new Scanner(new BufferedReader(new InputStreamReader(System.in)));\n        int t = in.nextInt(); // Scanner has functions to read ints, longs, strings, chars, etc.\n        for (int i = 1; i <= t; ++i) {\n            HashMap<String, Boolean> map = new HashMap<>();\n            int plotCount = 0;\n            int area = in.nextInt();\n            submitLocation(getXGuess(plotCount), getYGuess(plotCount));\n            int x = in.nextInt();\n            int y = in.nextInt();\n            map.put(x + "," + y, true);\n            plotCount++;\n            boolean failed = false;\n            while(x != 0 && !failed) {\n                if (x == -1) {\n                    failed = true;\n                } else {\n                    if (!map.containsKey(x + "," + y)) {\n                        map.put(x + "," + y, true);\n                        plotCount++;\n                    }\n\n                    submitLocation(getXGuess(plotCount), getYGuess(plotCount));\n                    x = in.nextInt();\n                    y = in.nextInt();\n                }\n            }\n\n            if (failed) {\n                break;\n            }\n        }\n    }\n    \n    public static int getXGuess(int plotCount) {\n        return (int) (Math.floor(plotCount / 9) * 3) + 2;\n    }\n\n    public static int getYGuess(int plotCount) {\n        switch (plotCount / 9) {\n            case 0: return 500;\n            case 1: return 500;\n            case 2: return 500;\n            case 3: return 500;\n            case 4: return 500;\n        }\n        return 500;\n    }\n\n    private static void submitLocation(int x, int y) {\n        System.out.println(x + " " + y);\n    }\n}'

In [None]:
modified_snippet_119 = '''
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Scanner;

public class Solution {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(new BufferedReader(new InputStreamReader(System.in)));
        int testCases = scanner.nextInt();
        for (int i = 1; i <= testCases; i++) {
            int area = scanner.nextInt();
            int plotCount = 0;
            boolean failed = false;

            while (!failed) {
                int xGuess = calculateX(plotCount);
                int yGuess = calculateY();
                System.out.println(xGuess + " " + yGuess);

                int x = scanner.nextInt();
                int y = scanner.nextInt();

                if (x == -1 && y == -1) {
                    failed = true;
                } else if (x == 0 && y == 0) {
                    break;
                } else {
                    plotCount++;
                }

                if (plotCount % 50 == 0) {
                    System.out.println("Checkpoint reached at " + plotCount + " iterations.");
                }
            }

            if (failed) {
                System.out.println("Case #" + i + " failed.");
                break;
            } else {
                System.out.println("Case #" + i + " completed successfully.");
            }
        }
    }

    private static int calculateX(int count) {
        return (count / 9) * 3 + 2;
    }

    private static int calculateY() {
        return 500;
    }
}
'''

In [728]:
modified_set = calculate_features_for_files([(0, snippet_119, 0) , (-1, modified_snippet_119, -1)])
X_new = build_dataset([sample[1] for sample in modified_set])
X_new

Unnamed: 0,ASTNodeBigramsTF_Assignment_Literal,ASTNodeBigramsTF_Assignment_MemberReference,ASTNodeBigramsTF_Assignment_MethodInvocation,ASTNodeBigramsTF_BinaryOperation_BinaryOperation,ASTNodeBigramsTF_BinaryOperation_Cast,ASTNodeBigramsTF_BinaryOperation_Literal,ASTNodeBigramsTF_BinaryOperation_MemberReference,ASTNodeBigramsTF_BinaryOperation_MethodInvocation,ASTNodeBigramsTF_BlockStatement_BreakStatement,ASTNodeBigramsTF_BlockStatement_IfStatement,...,ln(num_return/length),ln(num_static/length),ln(num_switch/length),ln(num_void/length),ln(num_while/length),newLineBeforeOpenBrace,stdDevLineLength,stdDevNumParams,tabsLeadLines,whiteSpaceRatio
0,0.005319,0.015957,0.010638,0.026596,0.005319,0.053191,0.079787,0.005319,0.005319,0.015957,...,-5.57649,-6.136106,-7.5224,-6.829253,-7.5224,0.0,21.546977,0.433013,0.0,0.702578
1,0.006849,0.006849,,0.075342,,0.109589,0.089041,,0.013699,0.020548,...,-6.636603,-6.231137,,-7.32975,-7.32975,0.0,22.902054,0.471405,0.0,0.744851


In [729]:
missing_cols = set(samples.columns) - set(X_new.columns)
for col in missing_cols:
    X_new[col] = np.NaN

# Ensure the order of columns matches the training set
X_new = X_new[samples.columns]

# Step 4: Handle any potential NaN or infinite values
X_new = X_new.replace([np.inf, -np.inf], np.nan)

  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[col] = np.NaN
  X_new[co

In [730]:
X_new

Unnamed: 0,WordUnigramTF_maxAlpha,WordUnigramTF_caseN,WordUnigramTF_miw,WordUnigramTF_Solution1,WordUnigramTF_odds,WordUnigramTF_end_row,WordUnigramTF_getNextMove,WordUnigramTF_mult,WordUnigramTF_totalChanges,WordUnigramTF_perSwitchDamage,...,WordUnigramTF_nextInt,ASTNodeBigramsTF_BlockStatement_LocalVariableDeclaration,whiteSpaceRatio,WordUnigramTF_in,ASTNodeBigramsTF_MethodDeclaration_FormalParameter,ASTNodeBigramsTF_FormalParameter_ReferenceType,ASTNodeTypesTF_MethodDeclaration,ASTNodeTypesTF_FormalParameter,ASTNodeBigramsTF_ClassDeclaration_MethodDeclaration,ln(numSpaces/length)
0,,,,,,,,,,,...,0.061856,0.031915,0.702578,0.082474,0.026596,0.005319,0.021164,0.026455,0.021277,0.380746
1,,,,,,,,,,,...,0.054795,0.047945,0.744851,0.013699,0.013699,0.006849,0.020408,0.013605,0.020548,0.392787


In [731]:
model.predict(X_new)

array([[54],
       [55]])

In [732]:
# modified_first_two_samples = modified_set[]
# modified_first_two_samples
top_10_features['Feature Name'].to_list()
X_new_top_10_features = X_new[top_10_features['Feature Name'].to_list()]
X_new_top_10_features

Unnamed: 0,avgLineLength,stdDevLineLength,MaxDepthASTNode,ln(num_class/length),ln(num_private/length),ln(num_for/length),ln(num_else/length),ln(num_void/length),ln(num_static/length),ln(numFunctions/length)
0,29.833333,21.546977,16.0,-7.5224,-7.5224,-7.5224,-7.5224,-6.829253,-6.136106,-6.136106
1,27.792453,22.902054,14.0,-7.32975,-6.636603,-7.32975,-6.231137,-7.32975,-6.231137,-6.231137


In [733]:
first_two_samples

Unnamed: 0_level_0,avgLineLength,stdDevLineLength,MaxDepthASTNode,ln(num_class/length),ln(num_private/length),ln(num_for/length),ln(num_else/length),ln(num_void/length),ln(num_static/length),ln(numFunctions/length)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
59-274,23.848101,18.586158,18.0,-7.58172,-6.888572,-6.195425,-5.972282,-7.58172,-6.483107,-6.483107
54-119,29.344262,21.702825,16.0,-7.522941,-7.522941,-7.522941,-7.522941,-6.829794,-6.136647,-6.136647


In [734]:
row_from_X = X_new_top_10_features.iloc[1]  # Second row
row_from_samples = first_two_samples.iloc[0]  # First row
sample_difference = pd.concat([row_from_X, row_from_samples], axis=1).T
sample_difference.index = [119, 274]
sample_difference

Unnamed: 0,avgLineLength,stdDevLineLength,MaxDepthASTNode,ln(num_class/length),ln(num_private/length),ln(num_for/length),ln(num_else/length),ln(num_void/length),ln(num_static/length),ln(numFunctions/length)
119,27.792453,22.902054,14.0,-7.32975,-6.636603,-7.32975,-6.231137,-7.32975,-6.231137,-6.231137
274,23.848101,18.586158,18.0,-7.58172,-6.888572,-6.195425,-5.972282,-7.58172,-6.483107,-6.483107


In [735]:
vector1 = X_new.iloc[1].fillna(0).values
vector2 = samples.iloc[0].fillna(0).values

In [736]:
cosine_similarity = 1 - cosine(vector1, vector2)
cosine_similarity

0.9242739124382444

In [737]:
sample_difference['cosine_similarity'] = cosine_similarity
sample_difference

Unnamed: 0,avgLineLength,stdDevLineLength,MaxDepthASTNode,ln(num_class/length),ln(num_private/length),ln(num_for/length),ln(num_else/length),ln(num_void/length),ln(num_static/length),ln(numFunctions/length),cosine_similarity
119,27.792453,22.902054,14.0,-7.32975,-6.636603,-7.32975,-6.231137,-7.32975,-6.231137,-6.231137,0.924274
274,23.848101,18.586158,18.0,-7.58172,-6.888572,-6.195425,-5.972282,-7.58172,-6.483107,-6.483107,0.924274


In [738]:
print(sample_difference)

     avgLineLength  stdDevLineLength  MaxDepthASTNode  ln(num_class/length)  \
119      27.792453         22.902054             14.0              -7.32975   
274      23.848101         18.586158             18.0              -7.58172   

     ln(num_private/length)  ln(num_for/length)  ln(num_else/length)  \
119               -6.636603           -7.329750            -6.231137   
274               -6.888572           -6.195425            -5.972282   

     ln(num_void/length)  ln(num_static/length)  ln(numFunctions/length)  \
119             -7.32975              -6.231137                -6.231137   
274             -7.58172              -6.483107                -6.483107   

     cosine_similarity  
119           0.924274  
274           0.924274  
