In [1]:
import pandas as pd
import os
import pickle
df = pd.read_pickle(r"C:\Users\susha\OneDrive\Desktop\ai debugger\buggy_dataset\bugfixes_train.pickle")
print(df.columns)
df.head()
print(df.shape)

Index(['after_merge', 'before_merge', 'filename', 'full_file_code_after_merge',
       'full_file_code_before_merge', 'function_name', 'url',
       'source code and errors', 'full_traceback', 'traceback_type',
       'before_merge_without_docstrings', 'after_merge_without_docstrings',
       'before_merge_docstrings', 'after_merge_docstrings',
       'path_to_snippet_before_merge', 'path_to_snippet_after_merge'],
      dtype='object')
(14118, 16)


In [2]:
# Extract just the buggy code
buggy_cls_df = df[['before_merge']].copy()
buggy_cls_df = buggy_cls_df.rename(columns={'before_merge': 'code'})
buggy_cls_df['label'] = 1  # Mark as buggy


In [3]:
def load_buggy_pickle(path):
    with open(path, "rb") as f:
        data = pickle.load(f, encoding="latin1")
    df = pd.DataFrame(data)
    df = df[['before_merge']].copy()
    df = df.rename(columns={'before_merge': 'code'})
    df['label'] = 1
    return df

# Paths to val/test buggy data
buggy_val_path = r"C:\Users\susha\OneDrive\Desktop\ai debugger\buggy_dataset\bugfixes_valid.pickle"
buggy_test_path = r"C:\Users\susha\OneDrive\Desktop\ai debugger\buggy_dataset\bugfixes_test.pickle"



In [None]:
import pandas as pd

def load_buggy_pickle(path):
    df = pd.read_pickle(path)
    print(f"Loaded {len(df)} rows from {path}")
    return df




In [5]:
buggy_val_df = load_buggy_pickle(buggy_val_path)
buggy_test_df = load_buggy_pickle(buggy_test_path)


Loaded 9457 rows from C:\Users\susha\OneDrive\Desktop\ai debugger\buggy_dataset\bugfixes_valid.pickle
Loaded 161 rows from C:\Users\susha\OneDrive\Desktop\ai debugger\buggy_dataset\bugfixes_test.pickle


In [6]:

# Combine all buggy
all_buggy_df = pd.concat([buggy_cls_df, buggy_val_df, buggy_test_df], ignore_index=True)


In [7]:
stable_path = r"C:\Users\susha\OneDrive\Desktop\ai debugger\stable_dataset\stable_code_train.pickle"

with open(stable_path, "rb") as f:
    stable_data = pd.read_pickle(f)

stable_df = pd.DataFrame(stable_data)
print(stable_df.columns)


Index(['before_merge', 'repo_name', 'filename', 'function_name',
       'path_to_source_file', 'commit', 'path_to_snippet_before_merge',
       'times'],
      dtype='object')


In [8]:
# Use 'before_merge' as the clean code column
stable_cls_df = stable_df[['before_merge']].copy()
stable_cls_df = stable_cls_df.rename(columns={'before_merge': 'code'})
stable_cls_df['label'] = 0  # Mark as clean


In [9]:
def load_stable_pickle(path):
    with open(path, "rb") as f:
        data = pd.read_pickle(f)
    df = pd.DataFrame(data)
    df = df[['before_merge']].copy()
    df = df.rename(columns={'before_merge': 'code'})
    df['label'] = 0
    return df

# Paths
val_path = r"C:\Users\susha\OneDrive\Desktop\ai debugger\stable_dataset\stable_code_valid.pickle"
test_path = r"C:\Users\susha\OneDrive\Desktop\ai debugger\stable_dataset\stable_code_test.pickle"

# Load remaining stable data
stable_val_df = load_stable_pickle(val_path)
stable_test_df = load_stable_pickle(test_path)

# Combine all clean code
all_stable_df = pd.concat([stable_cls_df, stable_val_df, stable_test_df], ignore_index=True)

In [10]:
# Combine and shuffle
combined_df_all = pd.concat([all_buggy_df, all_stable_df], ignore_index=True)
combined_df_all = combined_df_all.sample(frac=1, random_state=42).reset_index(drop=True)

print(combined_df_all['label'].value_counts())
print(combined_df_all.sample(5))


# Decode any byte strings
combined_df_all['code'] = combined_df_all['code'].apply(
    lambda x: x.decode('utf-8', errors='ignore') if isinstance(x, bytes) else x
)

# Preview
print(combined_df_all.sample(3)['code'].values[0])


label
0.0    2655238
1.0      14118
Name: count, dtype: int64
                                                      code  label after_merge  \
1596892  b'    def redraw(self, *args):\n        self.d...    0.0         NaN   
1191105  b'    def command(self):\n        """The comma...    0.0         NaN   
724202   b'    def __call__(self, data):\n        if is...    0.0         NaN   
599775   b"def downgrade():\n    '''\n    Downgrade the...    0.0         NaN   
777538   b'    def remove_all(self):\n        """\n    ...    0.0         NaN   

        before_merge filename full_file_code_after_merge  \
1596892          NaN      NaN                        NaN   
1191105          NaN      NaN                        NaN   
724202           NaN      NaN                        NaN   
599775           NaN      NaN                        NaN   
777538           NaN      NaN                        NaN   

        full_file_code_before_merge function_name  url source code and errors  \
1596892  

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1) Ensure 'code' exists and is string; ensure 'label' exists
assert 'code' in combined_df_all.columns, "Missing 'code' column"
assert 'label' in combined_df_all.columns, "Missing 'label' column"

# 2) Normalize label values to {0,1} and drop NaNs
#    (handles cases like 'buggy'/'clean', True/False, etc.)
label_map = {
    'buggy': 1, 'Buggy': 1, 'BUGGY': 1, True: 1, 1: 1,
    'clean': 0, 'stable': 0, 'Clean': 0, 'STABLE': 0, False: 0, 0: 0
}
combined_df_all['label'] = combined_df_all['label'].map(label_map).astype('float')  # map unknowns to NaN

# 3) Basic cleaning
combined_df_all['code'] = combined_df_all['code'].astype(str)
combined_df_all = combined_df_all.dropna(subset=['code', 'label']).copy()

# 4) Optional: drop empties/whitespace-only code
combined_df_all = combined_df_all[combined_df_all['code'].str.strip().ne('')]

# 5) Make labels integer
combined_df_all['label'] = combined_df_all['label'].astype(int)

# 6) Quick sanity checks
print("Label counts (after cleaning):")
print(combined_df_all['label'].value_counts(dropna=False))
assert set(combined_df_all['label'].unique()) <= {0,1}, "Labels must be 0/1 only"

# 7) If either class is too small, adjust stratify usage
min_class = combined_df_all['label'].value_counts().min()

X = combined_df_all['code'].reset_index(drop=True)
y = combined_df_all['label'].reset_index(drop=True)

if min_class < 2:
    # Not enough samples to stratify; fall back to regular split
    print("‚ö†Ô∏è Too few samples in one class for stratify. Falling back to non-stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )
else:
    # Safe to stratify
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

print(X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts())


Label counts (after cleaning):
label
0    2655238
1      14118
Name: count, dtype: int64
(2135484,) (533872,) label
0    2124190
1      11294
Name: count, dtype: int64 label
0    531048
1      2824
Name: count, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    combined_df_all['code'], 
    combined_df_all['label'], 
    test_size=0.2, 
    stratify=combined_df_all['label'], 
    random_state=42
)


In [16]:
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1, 2),  # unigrams + bigrams
    max_features=50000
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [30]:
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_tfidf, y_train)


In [31]:
y_pred = clf.predict(X_test_tfidf)

print("üîç Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("üî¢ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


üîç Classification Report:
              precision    recall  f1-score   support

           0     0.9978    0.9411    0.9686    531048
           1     0.0524    0.6133    0.0966      2824

    accuracy                         0.9393    533872
   macro avg     0.5251    0.7772    0.5326    533872
weighted avg     0.9928    0.9393    0.9640    533872

üî¢ Confusion Matrix:
[[499757  31291]
 [  1092   1732]]


In [32]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train_tfidf, y_train)

y_pred = rf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0     0.9950    0.9996    0.9973    531048
           1     0.4278    0.0598    0.1050      2824

    accuracy                         0.9946    533872
   macro avg     0.7114    0.5297    0.5511    533872
weighted avg     0.9920    0.9946    0.9926    533872

[[530822    226]
 [  2655    169]]


In [17]:
# 1. Install if needed (uncomment below if not installed)
# !pip install xgboost

# 2. Import
from xgboost import XGBClassifier

# 3. Train XGBoost
xgb = XGBClassifier(
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), 
    use_label_encoder=False, 
    eval_metric='logloss',
    n_estimators=100, 
    max_depth=6, 
    random_state=42,
    verbosity=1
)
xgb.fit(X_train_tfidf, y_train)

# 4. Predict
y_pred_xgb = xgb.predict(X_test_tfidf)

# 5. Evaluation
from sklearn.metrics import classification_report, confusion_matrix

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.84      0.91    531048
           1       0.02      0.64      0.04      2824

    accuracy                           0.84    533872
   macro avg       0.51      0.74      0.48    533872
weighted avg       0.99      0.84      0.91    533872

XGBoost Confusion Matrix:
[[445441  85607]
 [  1010   1814]]
