In [149]:
import json
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from dateutil import parser

In [153]:
# Load raw json data
with open('repositories.json', 'r') as f:
    data = json.load(f)

# Convert json to dataframe
df = pd.json_normalize(data)

# Transform datetime stamps to unix
df['created_at'] = df['created_at'].apply(lambda x: parser.parse(x).timestamp())
df['updated_at'] = df['updated_at'].apply(lambda x: parser.parse(x).timestamp())
df['pushed_at'] = df['pushed_at'].apply(lambda x: parser.parse(x).timestamp())

# One-Hot-Encoding of categorical values
languages = pd.get_dummies(df['language'], prefix='language')
visibility = pd.get_dummies(df['visibility'], prefix='visibility')
license = pd.get_dummies(df['license.spdx_id'], prefix='license')
topics = df['topics'].apply(lambda x: pd.Series([1] * len(x), index=x)).fillna(0)
topics.rename(columns=lambda x: 'topic_' + x, inplace=True)

# Add one-hot-encoded features to dataframe
df = pd.concat([df, languages, topics, visibility, license], axis=1)

# Binary encode true/false values
df = df.replace({True: 1, False: 0})

# Feature and target selection
target = 'stargazers_count'
features = ['private', 'fork', 'size', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki',
            'has_pages', 'has_discussions', 'forks_count', 'archived', 'disabled',
            'open_issues_count', 'allow_forking', 'is_template', 'web_commit_signoff_required',
            'forks', 'score', 'created_at', 'updated_at', 'pushed_at'] + list(languages.columns) \
            + list(topics.columns)[0:250] + list(license.columns)

# Feature and target extraction
X = df[features]
y = df[target]

# Feature normalization
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

3370


In [None]:
print(f"X.shape: {X.shape}")

In [151]:
# Training and test set splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Random forest regression
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# XGboost
xgb = xgb.XGBRegressor()
xgb.fit(X_train, y_train)

In [152]:
# Making predictions on the testing set
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

# Evaluating the model
lr_mse = mean_squared_error(y_test, lr_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
xgb_mse = mean_squared_error(y_test, xgb_pred)

# Print results
print("[Mean square errors]")
print("Linear regression: ", lr_mse)
print("Random forest:", rf_mse)
print("XGBoost:", xgb_mse)

[Mean square errors]
Linear regression:  8.018735774798722e+32
Random forest: 328762560.6315215
XGBoost: 405968190.2820942


In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

pca = PCA(n_components=X.shape[1], svd_solver="auto")
pca.fit(X)


# Calculate the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Plot the explained variance ratio
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance_ratio) + 1), cumulative_explained_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA - Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.9, '95% cut-off threshold', color = 'red', fontsize=12)
plt.grid()
plt.show()

# Find the number of components that explain at least 95% of the variance
# ~ Take about 120 and preserve most of the variance.
num_components_95 = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f'Number of components that explain at least 95% of the variance: {num_components_95}')