# LightGBM model

In [15]:
import os
import sys
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import openai
import category_encoders as ce
from tempfile import TemporaryDirectory
from sklearn.metrics import roc_auc_score, log_loss

# go up three directories so the project package (codecompasslib) can be imported
sys.path.append('../../../')

from codecompasslib.API.drive_operations import get_creds_drive, list_shared_drive_contents, download_csv_as_pd_dataframe, upload_df_to_drive_as_csv
from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos

In [16]:
# Identifiers of the shared drive and of the folder inside it that holds the datasets.
DRIVE_ID = "0AL1DtB4TdEWdUk9PVA"
DATA_FOLDER = "13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx"

# Obtain drive credentials, then list the data folder; the listing prints each file name
# with its file id, which is what the download calls in later cells take as input.
creds = get_creds_drive()
list_shared_drive_contents(creds=creds, folder_id=DATA_FOLDER, drive_id=DRIVE_ID)


Files in the folder:
df_embedded_3103.csv (1V7P-bjQCLmFg_7ffG-s-caI6Il6B7Zvp)
test.csv (1hAP9CD6iP4FSZP4RSRm2CYUrS2KF_Lhf)
Embedded_Shell_dataset.csv (11emrksL5Wtkxz74F4ZNG7aVGlFEO9CJ5)
Embedded_Ruby_dataset.csv (1NqgktN6-inwI2kjhOZcqK2z8yPMQGz1I)
Embedded_Python_dataset.csv (1eIRiKdPDhyJdWyXmo0sI-x8AfA_qEhCl)
Embedded_PHP_dataset.csv (1DPOA1sTfewo1J9-y9ScW8_Br_2xwFBDE)
Embedded_Jupyter Notebook_dataset.csv (1-5LumCPIn9zSQOz2B95YjxHE3CSsF6u4)
Embedded_Java_dataset.csv (17P4T41NxcBlJ4d5ZRp-TiMHIDC9GPM4N)
Embedded_JavaScript_dataset.csv (1rOTrBEO3jpTs8O8XHeH87N_lly8l96CV)
Embedded_C_dataset.csv (1J4Ke7ovrVArP9gN99gRNEFwyt64drNre)
Embedded_C++_dataset.csv (1exN3p8ElxD_rDFvKPh1Ojf2Az4zV4NF1)
Embedded_C#_dataset.csv (1LbSVpDfCi-6f2uz0w-R05wPatNI0o1p7)
df_with_embeddings.csv (1ob1LmG5vjvkbhhx7ZHkimXFMNXOvSpMq)
uploaded_dataset.csv (1WSgwAhzNbSqC6e_RRBDHpgpQCnGZvVcc)
dataset.csv (1AdJGrqauyeOzjyaT0752hTJrkQjbEvfb)
allReposCleaned.csv (1jIYBQQJNo2s1bo3LHlYgKzUNNM0ueuhQ)


True

### Load the datasets

In [17]:
# The embedded dataset is large and slow to retrieve; waiting on Maud for the best
# database option to make retrieval faster.

# Load the non-embedded dataset (file id listed in the folder output as uploaded_dataset.csv)
# and the embedded dataset (listed as df_embedded_3103.csv).
df_non_embedded = download_csv_as_pd_dataframe(creds=creds, file_id="1WSgwAhzNbSqC6e_RRBDHpgpQCnGZvVcc")
df_embedded = download_csv_as_pd_dataframe(creds=creds, file_id="1V7P-bjQCLmFg_7ffG-s-caI6Il6B7Zvp")


Download 11%.

Download 23%.

Download 35%.

Download 47%.

Download 59%.

Download 71%.

Download 83%.

Download 95%.

Download 100%.


  return read_csv(fh)



Download 4%.

Download 8%.

Download 12%.

Download 16%.

Download 20%.

Download 24%.

Download 28%.

Download 32%.

Download 36%.

Download 40%.

Download 45%.

Download 49%.

Download 53%.

Download 57%.

Download 61%.

Download 65%.

Download 69%.

Download 73%.

Download 77%.

Download 81%.

Download 86%.

Download 90%.

Download 94%.

Download 98%.

Download 100%.


### Prepare the dataframe for the LightGBM model

Getting starred repos

In [22]:
# Choose the target user to generate recommendations for; the repos this user has starred
# become the positive labels of the training data.
target_user = 'Rameshwar0852'

In [23]:
# Fetch the repos starred by the target user. The first element of the returned value is
# iterated as a collection of repo records, each exposing an 'id' key.
starred_by_target = get_stared_repos(target_user)
# Keep the ids in a set: every later use is a membership test (`in`), which is a hash lookup
# on a set instead of a linear scan on a list. The stray `ids` alias of the original chained
# assignment was not used anywhere and is dropped.
starred_repo_ids = {item['id'] for item in starred_by_target[0]}

### Grabbing needed columns

In [25]:
# Attach the `stars` and `language` columns of the non-embedded dataset to the embedded rows
# (extend the column selection if a model needs more features). The left join keeps every
# row of `df_embedded`.
df_merged = pd.merge(df_embedded, df_non_embedded[['id', 'stars', 'language']], on='id', how='left')
# Cast stars to a 64-bit integer column in one vectorized step rather than a per-row lambda.
# NOTE(review): ids missing from `df_non_embedded` would leave NaN stars after the left join
# and make this cast raise (as the original per-row int() did) — confirm every id is covered.
df_merged['stars'] = df_merged['stars'].astype('int64')

In [28]:
# Add the binary label: 1 if the target user has starred the repo, 0 otherwise.
# `isin` performs the membership test for the whole column in one vectorized pass instead of
# running a Python lambda (with a linear list scan) once per row.
df_merged['target'] = df_merged['id'].isin(starred_repo_ids).astype('int64')

### Train the LightGBM model on the data

In [29]:
# Hyper-parameters referenced by the `params` dict and the training call in the cells below.
MAX_LEAF = 64                 # passed as "num_leaves"
MIN_DATA = 20                 # passed as "min_data"
NUM_OF_TREES = 100            # passed as num_boost_round (upper bound on boosting rounds)
TREE_LEARNING_RATE = 0.15     # passed as "learning_rate"
EARLY_STOPPING_ROUNDS = 20    # patience handed to lgb.early_stopping
METRIC = "auc"                # passed as "metric" (evaluated on the validation set)
SIZE = "sample"               # NOTE(review): not referenced by any cell shown here — confirm it is still needed

In [30]:
# LightGBM configuration for the binary "starred / not starred" classifier; the tunable
# values come from the constants cell above.
params = dict(
    task="train",
    boosting_type="gbdt",
    num_class=1,
    objective="binary",
    metric=METRIC,
    num_leaves=MAX_LEAF,
    min_data=MIN_DATA,
    boost_from_average=True,
    # Hard-coded thread count: adjust it to the number of CPU cores of the machine.
    num_threads=20,
    feature_fraction=0.8,
    learning_rate=TREE_LEARNING_RATE,
)

In [31]:
# Separate the label from the features; the identifier columns are not model inputs.
y = df_merged['target']
X = df_merged.drop(['target', 'id', 'owner_user'], axis=1)

# Hold out 10% as the test set, then 10% of the remainder as the validation set.
# Both splits are stratified on the label and use a fixed seed.
X_combined, X_test, y_combined, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_combined, y_combined, test_size=0.1, stratify=y_combined, random_state=42
)

# Re-attach each label column to its feature frame so every split travels as one dataframe.
train_data, valid_data, test_data = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [32]:
# Numeric features: the 256 embedding dimensions followed by the star count.
nume_cols = ["embedding_{}".format(i) for i in range(256)]
nume_cols.append("stars")
# Categorical features (handled by the ordinal encoder).
cate_cols = ["language"]
# Name of the binary label column.
label_col = "target"

In [33]:
# Ordinal encoder restricted to the categorical columns; it is fitted on the training split
# and then reused unchanged for the validation, test and full-dataset frames.
ord_encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)

def encode_csv(df, encoder, label_col, typ="fit"):
    """Encode a dataframe and split it into features and labels.

    Args:
        df: Dataframe holding the feature columns and the label column.
        encoder: Object exposing `fit_transform(df)` and `transform(df)`.
        label_col: Name of the label column to separate from the features.
        typ: "fit" fits the encoder on `df` before transforming; any other
            value applies the already-fitted encoder.

    Returns:
        A `(features, labels)` tuple: the encoded dataframe without the label
        column, and the label values as an array.
    """
    run_encoder = encoder.fit_transform if typ == "fit" else encoder.transform
    encoded = run_encoder(df)
    # pop() removes the label column from the encoded frame and hands it back in one step.
    labels = encoded.pop(label_col).values
    return encoded, labels

# Fit the encoder on the training split only, then apply the fitted mapping to the
# validation and test splits.
train_x, train_y = encode_csv(train_data, ord_encoder, label_col, typ="fit")
valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, typ="transform")
test_x, test_y = encode_csv(test_data, ord_encoder, label_col, typ="transform")

# Report the feature / label shapes of the three splits.
print(
    "Train Data Shape: X: {trn_x_shape}; Y: {trn_y_shape}.\nValid Data Shape: X: {vld_x_shape}; Y: {vld_y_shape}.\nTest Data Shape: X: {tst_x_shape}; Y: {tst_y_shape}.\n".format(
        **{
            "trn_x_shape": train_x.shape,
            "trn_y_shape": train_y.shape,
            "vld_x_shape": valid_x.shape,
            "vld_y_shape": valid_y.shape,
            "tst_x_shape": test_x.shape,
            "tst_y_shape": test_y.shape,
        }
    )
)

# Preview the encoded training features.
train_x.head()

Train Data Shape: X: (974068, 258); Y: (974068,).
Valid Data Shape: X: (108230, 258); Y: (108230,).
Test Data Shape: X: (120256, 258); Y: (120256,).



Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_248,embedding_249,embedding_250,embedding_251,embedding_252,embedding_253,embedding_254,embedding_255,stars,language
66436,-0.02661,-0.03397,-0.0584,0.00207,-0.08496,-0.07605,-0.02669,0.10345,-0.05478,0.1516,...,-0.010445,-0.1337,-0.129,0.00489,0.02968,0.06305,-0.05823,0.03165,0,1
70728,-0.06226,-0.0877,-0.03235,0.0757,0.007385,0.1084,-0.01454,0.1733,0.01468,-0.01625,...,0.0589,0.04074,-0.07135,-0.03992,-0.03546,0.0843,-0.0769,0.01101,2,2
275250,-0.0655,-0.09564,-0.05142,-0.04962,-0.02959,-0.0681,-0.02458,0.103,-0.05774,0.1504,...,-0.0552,-0.0329,-0.0228,0.0195,-0.04843,-0.037,-0.0442,0.093,1,1
825590,-0.0509,0.03032,-0.0242,0.0654,-0.000988,-0.05905,0.04416,0.01932,-0.00618,0.12274,...,0.04037,-0.07117,-0.02058,0.02441,-0.0527,-0.05798,0.04056,0.04953,0,3
725083,-0.01985,-0.02382,-0.05206,0.03555,0.03558,-0.01424,-0.1249,0.1335,-0.02283,-0.03394,...,0.08,-0.12115,-0.04892,-0.09045,0.03032,-0.0384,-0.0658,0.06305,0,1


In [34]:
# Wrap each split in a LightGBM Dataset. The categorical columns are declared here, once;
# the validation and test Datasets reference the training Dataset.
lgb_train = lgb.Dataset(train_x, train_y.reshape(-1), params=params, categorical_feature=cate_cols)
lgb_valid = lgb.Dataset(valid_x, valid_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)
# NOTE(review): `lgb_test` is not used by any later cell shown here (the test split is scored
# from `test_x` directly) — kept so existing references keep working.
lgb_test = lgb.Dataset(test_x, test_y.reshape(-1), reference=lgb_train, categorical_feature=cate_cols)

# Train with early stopping on the validation set.
# - `valid_sets` is given in its documented list form.
# - `categorical_feature` is no longer repeated here: it is already set on the Datasets above.
# NOTE(review): the training log of this cell reports only 6 positive rows among ~1M, so the
# validation/test metrics rest on very few positives — consider class weighting
# (e.g. `scale_pos_weight` / `is_unbalance`) or collecting more positives before trusting them.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=NUM_OF_TREES,
    valid_sets=[lgb_valid],
    callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)],
)

[LightGBM] [Info] Number of positive: 6, number of negative: 974062
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.438007 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65775
[LightGBM] [Info] Number of data points in the train set: 974068, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000006 -> initscore=-11.997471
[LightGBM] [Info] Start training from score -11.997471
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.932366


In [35]:
# Score the held-out test split with the trained booster.
test_preds = lgb_model.predict(test_x)
test_labels = np.asarray(test_y).reshape(-1)

auc = roc_auc_score(test_labels, test_preds)

# The `eps` argument of `log_loss` is deprecated (and removed in recent scikit-learn
# releases), so the same 1e-12 clipping is applied explicitly before the call; this keeps
# the value unchanged while working on both old and new versions.
clipped_preds = np.clip(np.asarray(test_preds), 1e-12, 1 - 1e-12)
logloss = log_loss(test_labels, clipped_preds)

# NOTE(review): with so few positive rows in the test split, both metrics are very noisy.
res_basic = {"auc": auc, "logloss": logloss}
print(res_basic)

{'auc': 0.4993721674774438, 'logloss': 0.000461882507551249}


In [36]:
# Round-trip the model through a LightGBM model file: save it, then load it back.
# NOTE(review): the file is written inside a TemporaryDirectory, which is deleted when the
# `with` block exits, so nothing is persisted on disk — only the in-memory `loaded_model`
# remains. Write to a permanent path if the trained model has to be kept.
with TemporaryDirectory() as tmp:
    save_file = os.path.join(tmp, "finished_LGBM.model")
    lgb_model.save_model(save_file)
    loaded_model = lgb.Booster(model_file=save_file)

### Generate recommendations

In [38]:
# Make predictions for all the repos in the merged dataset (this includes the rows the
# model was trained on). The identifier columns are dropped exactly as for training, and
# `encode_csv` in "transform" mode reuses the already-fitted encoder and strips the label.
# `all_preds` keeps the row order of `df_merged`, so position i scores `df_merged.iloc[i]`.
# `full_dataset_y` is not used by the cells shown after this one.
df_test = df_merged.drop(columns=['id', 'owner_user'])
full_dataset_x, full_dataset_y = encode_csv(df_test, ord_encoder, label_col, "transform")
all_preds = lgb_model.predict(full_dataset_x)

In [48]:
# Row positions sorted by predicted score, highest first.
top_indices = np.argsort(all_preds)[::-1]

recommendations = []

# Hash-based lookup of the repos the user already starred, and one-off array views of the
# two columns the loop reads. The original rebuilt a full row with `df_merged.iloc[index]`
# up to five times per iteration, which is costly on a wide dataframe.
already_starred = set(starred_repo_ids)
repo_ids = df_merged['id'].to_numpy()
owners = df_merged['owner_user'].to_numpy()

# Collect the 10 best-scored repos that the user has not starred yet.
counter = 0
for index in top_indices:
    if counter == 10:
        break
    repo_id = repo_ids[index]
    # disregard the repo if it is already starred by the user
    if repo_id in already_starred:
        continue
    counter += 1
    owner = owners[index]
    score = all_preds[index]
    recommendations.append((repo_id, owner, score))
    print("Repository ID:", repo_id, " | Owner:", owner, " | Prediction:", score)


Repository ID: 40250091.0  | Owner: danishkhan  | Prediction: 1.0
Repository ID: 21378463.0  | Owner: bblzjp  | Prediction: 1.0
Repository ID: 559467658.0  | Owner: lujiacn  | Prediction: 1.0
Repository ID: 134444559.0  | Owner: nkprince007  | Prediction: 1.0
Repository ID: 549422245.0  | Owner: mozmut  | Prediction: 1.0
Repository ID: 140065411.0  | Owner: soon14  | Prediction: 0.9999999999999774
Repository ID: 574946168.0  | Owner: chiyutianyi  | Prediction: 0.9999999999766165
Repository ID: 702245469.0  | Owner: nuzulfikrie  | Prediction: 0.9955527587034938
Repository ID: 387313950.0  | Owner: jhon-jader  | Prediction: 0.40993307170474874
Repository ID: 548651131.0  | Owner: mrgius3ppe  | Prediction: 0.3780267309085086
