In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("total_scraped_data_v3.csv")
df = df.drop(columns=['Unnamed: 0'])
df.shape

(8506, 16)

In [5]:
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count
0,1.0,5524,312.0,2020-05-19T02:37:13Z,882.0,0.0,MIT License,ManimCommunity/manim,351.0,33506.0,9951.0,4.0,1030.0,2022-05-09T17:53:54Z,https://github.com/ManimCommunity/manim,9951.0
1,1.0,224,16.0,2017-08-25T12:05:15Z,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,108395.0,9998.0,11.0,98.0,2022-05-09T09:30:18Z,https://github.com/zalandoresearch/fashion-mnist,9998.0
2,1.0,133,3.0,2018-01-09T09:48:49Z,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1477.0,9976.0,6.0,147.0,2022-05-09T14:13:08Z,https://github.com/Embedding/Chinese-Word-Vectors,9976.0
3,1.0,6023,18.0,2018-07-11T18:28:58Z,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,261676.0,9924.0,20.0,2750.0,2022-05-09T15:50:27Z,https://github.com/PySimpleGUI/PySimpleGUI,9924.0
4,1.0,541,9.0,2016-05-29T13:29:44Z,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,62652.0,9944.0,9.0,66.0,2022-05-09T11:56:21Z,https://github.com/MorvanZhou/tutorials,9944.0


In [6]:
df.dtypes

allow_forking        float64
commit_count           int64
contributor_count    float64
created_at            object
forks                float64
github_pages         float64
license               object
name                  object
open_issues          float64
size                 float64
star_count           float64
topics_count         float64
total_issue_count    float64
updated_at            object
url                   object
watchers_count       float64
dtype: object

## Filling Null Values

In [7]:
df.isnull().sum()

allow_forking          0
commit_count           0
contributor_count    108
created_at             0
forks                  0
github_pages           0
license                0
name                   0
open_issues            0
size                   0
star_count             0
topics_count           0
total_issue_count      0
updated_at             0
url                    0
watchers_count         0
dtype: int64

In [8]:
for i in range(df.shape[0]):
    if df.loc[i, "total_issue_count"] == 0:
        df.loc[i, "total_issue_count"] = df.loc[i, "open_issues"]
    elif df.loc[i, "total_issue_count"] < df.loc[i, "open_issues"]:
        df.loc[i, "total_issue_count"] = df.loc[i, "open_issues"]

In [9]:
df["commit_count"] = df["commit_count"].replace(0, df["commit_count"].mean())
df["contributor_count"] = df["contributor_count"].fillna(df["contributor_count"].mean())

In [10]:
df[df["contributor_count"] == 0]

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count


In [11]:
df.isnull().sum()

allow_forking        0
commit_count         0
contributor_count    0
created_at           0
forks                0
github_pages         0
license              0
name                 0
open_issues          0
size                 0
star_count           0
topics_count         0
total_issue_count    0
updated_at           0
url                  0
watchers_count       0
dtype: int64

In [12]:
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count
count,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0
mean,1.0,2933.376,90.902515,871.199036,0.214907,132.042558,81569.63,4617.587233,5.197625,650.399365,4617.587233
std,0.0,24305.93,507.283919,1126.250497,0.410782,318.062934,849439.1,2703.210825,5.349499,1770.912563,2703.210825
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0
25%,1.0,143.0,12.0,303.0,0.0,15.0,1195.25,1982.0,0.0,64.0,1982.0
50%,1.0,481.5,36.0,571.0,0.0,47.0,6698.0,3996.0,4.0,210.0,3996.0
75%,1.0,1583.5,90.902515,1090.0,0.0,131.0,35584.0,6946.0,8.0,610.0,6946.0
max,1.0,1052496.0,42941.0,33854.0,1.0,8591.0,57016100.0,10280.0,20.0,63929.0,10280.0


## Feature Engineering

### Add how many years the project has been developed

In [13]:
df["created_at"] = pd.to_datetime(df["created_at"])
df["updated_at"] = pd.to_datetime(df["updated_at"])
df["years"] = (df["updated_at"] - df["created_at"]) / np.timedelta64(1, 'Y')
df["years"] = df["years"].astype(np.int32)

### Find the closed issue count, add closed issue rate (closed / all)

In [14]:
df["closed_issue"] = df["total_issue_count"] - df["open_issues"]
df["issue_close_rate"] = df["closed_issue"] / df["total_issue_count"]
df["issue_close_rate"] = df["issue_close_rate"].fillna(0)
df[["closed_issue", "issue_close_rate", "total_issue_count"]]

Unnamed: 0,closed_issue,issue_close_rate,total_issue_count
0,679.0,0.659223,1030.0
1,74.0,0.755102,98.0
2,105.0,0.714286,147.0
3,2091.0,0.760364,2750.0
4,53.0,0.803030,66.0
...,...,...,...
8501,370.0,0.943878,392.0
8502,0.0,0.000000,204.0
8503,1317.0,0.961314,1370.0
8504,8.0,0.888889,9.0


In [15]:
df.describe()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate
count,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0,8506.0
mean,1.0,2933.376,90.902515,871.199036,0.214907,132.042558,81569.63,4617.587233,5.197625,650.399365,4617.587233,5.717729,518.356807,0.661832
std,0.0,24305.93,507.283919,1126.250497,0.410782,318.062934,849439.1,2703.210825,5.349499,1770.912563,2703.210825,2.959044,1541.221495,0.271235
min,1.0,1.0,2.0,3.0,0.0,0.0,0.0,914.0,0.0,0.0,914.0,0.0,0.0,0.0
25%,1.0,143.0,12.0,303.0,0.0,15.0,1195.25,1982.0,0.0,64.0,1982.0,3.0,32.0,0.496309
50%,1.0,481.5,36.0,571.0,0.0,47.0,6698.0,3996.0,4.0,210.0,3996.0,5.0,132.0,0.727273
75%,1.0,1583.5,90.902515,1090.0,0.0,131.0,35584.0,6946.0,8.0,610.0,6946.0,8.0,457.0,0.881055
max,1.0,1052496.0,42941.0,33854.0,1.0,8591.0,57016100.0,10280.0,20.0,63929.0,10280.0,14.0,58901.0,1.0


## Changing repo size to KB

In [16]:
df["size"] = df["size"] / 1024.0

### Mapping Licence to Commercial Use

In [17]:
df["license"].value_counts()

MIT License                                                   3101
Apache License 2.0                                            1375
Other                                                         1338
Not Found                                                     1050
GNU General Public License v3.0                                556
BSD 3-Clause "New" or "Revised" License                        355
GNU General Public License v2.0                                148
GNU Affero General Public License v3.0                         127
BSD 2-Clause "Simplified" License                              120
Creative Commons Zero v1.0 Universal                            72
GNU Lesser General Public License v3.0                          52
Mozilla Public License 2.0                                      52
The Unlicense                                                   30
Creative Commons Attribution Share Alike 4.0 International      24
GNU Lesser General Public License v2.1                        

All of the licences are free for commercial usage, therefore this is not useful. Will drop `license` column

In [18]:
df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count,years,closed_issue,issue_close_rate
0,1.0,5524.0,312.0,2020-05-19 02:37:13+00:00,882.0,0.0,MIT License,ManimCommunity/manim,351.0,32.720703,9951.0,4.0,1030.0,2022-05-09 17:53:54+00:00,https://github.com/ManimCommunity/manim,9951.0,1,679.0,0.659223
1,1.0,224.0,16.0,2017-08-25 12:05:15+00:00,2590.0,0.0,MIT License,zalandoresearch/fashion-mnist,24.0,105.854492,9998.0,11.0,98.0,2022-05-09 09:30:18+00:00,https://github.com/zalandoresearch/fashion-mnist,9998.0,4,74.0,0.755102
2,1.0,133.0,3.0,2018-01-09 09:48:49+00:00,2181.0,0.0,Apache License 2.0,Embedding/Chinese-Word-Vectors,42.0,1.442383,9976.0,6.0,147.0,2022-05-09 14:13:08+00:00,https://github.com/Embedding/Chinese-Word-Vectors,9976.0,4,105.0,0.714286
3,1.0,6023.0,18.0,2018-07-11 18:28:58+00:00,1494.0,1.0,GNU Lesser General Public License v3.0,PySimpleGUI/PySimpleGUI,659.0,255.542969,9924.0,20.0,2750.0,2022-05-09 15:50:27+00:00,https://github.com/PySimpleGUI/PySimpleGUI,9924.0,3,2091.0,0.760364
4,1.0,541.0,9.0,2016-05-29 13:29:44+00:00,5612.0,0.0,MIT License,MorvanZhou/tutorials,13.0,61.183594,9944.0,9.0,66.0,2022-05-09 11:56:21+00:00,https://github.com/MorvanZhou/tutorials,9944.0,5,53.0,0.80303


`Watcher Count` is directly related with the star count, therefore we drop that column

## Categorize the star counts

In [19]:
df["star_category"] = df["star_count"].map(lambda x: (x // 1000) + 1)

In [20]:
df["star_category"].value_counts()

2.0     1765
4.0     1192
3.0      909
8.0      852
5.0      827
7.0      698
6.0      670
9.0      615
10.0     553
1.0      408
11.0      17
Name: star_category, dtype: int64

In [21]:
df = df[df["star_category"] < 11] # we want star counts between 0- 10000. drop anything above 100000

In [22]:
df["star_category"].value_counts()

2.0     1765
4.0     1192
3.0      909
8.0      852
5.0      827
7.0      698
6.0      670
9.0      615
10.0     553
1.0      408
Name: star_category, dtype: int64

In [23]:
ones = df[df["star_category"] == 1.0] # get all of them
twos = df[df["star_category"] == 2.0].sample(650)
threes = df[df["star_category"] == 3.0].sample(650)
fours = df[df["star_category"] == 4.0].sample(650)
fives = df[df["star_category"] == 5.0].sample(650)
sixes = df[df["star_category"] == 6.0].sample(650)
sevens = df[df["star_category"] == 7.0].sample(650)
eights = df[df["star_category"] == 8.0].sample(650)
nines = df[df["star_category"] == 9.0] # get all of them
tens = df[df["star_category"] == 10.0] # get all of them

sampled_df = pd.concat([ones, twos, threes, fours, fives, sixes, sevens, eights, nines, tens])
sampled_df.head()

Unnamed: 0,allow_forking,commit_count,contributor_count,created_at,forks,github_pages,license,name,open_issues,size,star_count,topics_count,total_issue_count,updated_at,url,watchers_count,years,closed_issue,issue_close_rate,star_category
3611,1.0,47.0,4.0,2017-12-30 03:13:50+00:00,247.0,0.0,MIT License,ternaus/TernausNet,8.0,83.871094,998.0,2.0,15.0,2022-05-08 12:08:17+00:00,https://github.com/ternaus/TernausNet,998.0,4,7.0,0.466667,1.0
3612,1.0,22.0,3.0,2017-05-15 08:19:48+00:00,128.0,0.0,Eclipse Public License 1.0,merantix/picasso,19.0,6.993164,997.0,4.0,24.0,2022-05-06 04:20:39+00:00,https://github.com/merantix/picasso,997.0,4,5.0,0.208333,1.0
3613,1.0,50.0,4.0,2016-03-31 06:30:46+00:00,380.0,0.0,MIT License,SpiderClub/smart_login,13.0,0.058594,996.0,0.0,25.0,2022-05-02 13:12:55+00:00,https://github.com/SpiderClub/smart_login,996.0,6,12.0,0.48,1.0
3617,1.0,27.0,3.0,2021-08-16 14:24:00+00:00,325.0,0.0,GNU General Public License v3.0,ppogg/YOLOv5-Lite,25.0,61.771484,996.0,15.0,130.0,2022-05-09 08:04:57+00:00,https://github.com/ppogg/YOLOv5-Lite,996.0,0,105.0,0.807692,1.0
3619,1.0,1014.0,105.0,2014-07-18 16:46:14+00:00,274.0,0.0,"BSD 2-Clause ""Simplified"" License",django-json-api/django-rest-framework-json-api,24.0,1.630859,997.0,4.0,359.0,2022-04-28 15:39:52+00:00,https://github.com/django-json-api/django-rest...,997.0,7,335.0,0.933148,1.0


In [24]:
sampled_df.shape

(6126, 20)

In [25]:
df2 = df.groupby('star_category').apply(lambda x: x.sample(408))

In [26]:
df2.corr()

Unnamed: 0,allow_forking,commit_count,contributor_count,forks,github_pages,open_issues,size,star_count,topics_count,total_issue_count,watchers_count,years,closed_issue,issue_close_rate,star_category
allow_forking,,,,,,,,,,,,,,,
commit_count,,1.0,0.454475,0.139328,-0.017991,0.213239,0.227524,0.065266,-0.009784,0.247316,0.065266,0.075223,0.240448,0.036578,0.065055
contributor_count,,0.454475,1.0,0.171684,0.002941,0.170408,0.100576,0.064909,-0.005511,0.280128,0.064909,0.084007,0.287949,0.06945,0.064949
forks,,0.139328,0.171684,1.0,0.046941,0.192343,0.103772,0.365297,0.008719,0.214447,0.365297,0.133029,0.206859,-0.017975,0.362529
github_pages,,-0.017991,0.002941,0.046941,1.0,0.017093,0.007386,0.125487,0.01158,0.003601,0.125487,0.079258,0.000433,0.050863,0.123154
open_issues,,0.213239,0.170408,0.192343,0.017093,1.0,0.114556,0.181204,0.043674,0.778939,0.181204,0.106067,0.68528,-0.016621,0.17886
size,,0.227524,0.100576,0.103772,0.007386,0.114556,1.0,0.04925,0.064499,0.183058,0.04925,0.01538,0.187468,0.053975,0.049317
star_count,,0.065266,0.064909,0.365297,0.125487,0.181204,0.04925,1.0,0.089925,0.193975,1.0,0.15206,0.185528,0.108768,0.994223
topics_count,,-0.009784,-0.005511,0.008719,0.01158,0.043674,0.064499,0.089925,1.0,0.075218,0.089925,-0.204435,0.077775,0.210024,0.087531
total_issue_count,,0.247316,0.280128,0.214447,0.003601,0.778939,0.183058,0.193975,0.075218,1.0,0.193975,0.156008,0.990495,0.197088,0.190538


In [27]:
sampled_df["star_category"].value_counts()

2.0     650
3.0     650
4.0     650
5.0     650
6.0     650
7.0     650
8.0     650
9.0     615
10.0    553
1.0     408
Name: star_category, dtype: int64

In [28]:
X2 = sampled_df.drop(columns=["created_at", "license", "name", "updated_at", "url", "watchers_count", "star_count", "star_category"])
Y2 = sampled_df["star_category"].values

In [29]:
X = df2.drop(columns=["created_at", "license", "name", "updated_at", "url", "watchers_count", "star_count", "star_category"])
Y = df2["star_category"].values

In [58]:
## Normalize the data
"""
from sklearn.preprocessing import normalize, StandardScaler

scaler = StandardScaler()
X2 = normalize(X)
X3 = scaler.fit_transform(X)

X2
"""

array([[9.54400226e-04, 9.75397031e-01, 1.52704036e-02, ...,
        2.86320068e-03, 1.09756026e-01, 7.83971614e-04],
       [3.34601492e-03, 9.46922224e-01, 4.01521791e-02, ...,
        3.34601492e-03, 1.73992776e-01, 3.16350502e-03],
       [3.64519488e-03, 3.86390657e-01, 1.09355846e-02, ...,
        3.64519488e-03, 3.64519488e-03, 3.64519488e-03],
       ...,
       [1.63274716e-03, 6.57997106e-01, 3.91859319e-02, ...,
        1.63274716e-02, 1.84500429e-01, 9.08869110e-04],
       [3.87627939e-04, 1.86061411e-01, 1.55051176e-03, ...,
        2.32576764e-03, 3.79875381e-02, 9.19795111e-05],
       [8.09452778e-04, 6.93701031e-01, 4.04726389e-02, ...,
        3.23781111e-03, 2.75213944e-02, 5.39635185e-04]])

## Preparing Train and Test Data
We will split train to test as 80%, 20&

In [39]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X2, Y2, random_state=50, test_size=0.2)

In [70]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4900, 12), (4900,), (1226, 12), (1226,))

## Apply Baseline Machine Learning Model

In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [72]:
for i in range(1, 21):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    print(f"for neighbour: {i} : accuracy: {accuracy_score(y_test, preds)}")

for neighbour: 1 : accuracy: 0.400489396411093
for neighbour: 2 : accuracy: 0.3107667210440457
for neighbour: 3 : accuracy: 0.27161500815660683
for neighbour: 4 : accuracy: 0.2626427406199021
for neighbour: 5 : accuracy: 0.25774877650897227
for neighbour: 6 : accuracy: 0.24225122349102773
for neighbour: 7 : accuracy: 0.23164763458401305
for neighbour: 8 : accuracy: 0.233278955954323
for neighbour: 9 : accuracy: 0.21859706362153344
for neighbour: 10 : accuracy: 0.21941272430668843
for neighbour: 11 : accuracy: 0.22022838499184338
for neighbour: 12 : accuracy: 0.21615008156606852
for neighbour: 13 : accuracy: 0.21451876019575855
for neighbour: 14 : accuracy: 0.21207177814029363
for neighbour: 15 : accuracy: 0.21044045676998369
for neighbour: 16 : accuracy: 0.20636215334420882
for neighbour: 17 : accuracy: 0.20228384991843393
for neighbour: 18 : accuracy: 0.20799347471451876
for neighbour: 19 : accuracy: 0.2137030995106036
for neighbour: 20 : accuracy: 0.2137030995106036


In [76]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier


models = {
    "SVC": svm.SVC(decision_function_shape='ovo'),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "SGD": SGDClassifier(loss="hinge", penalty="l2", max_iter=100),
    "MNB": MultinomialNB(),
}

for name, model in models.items():

    model.fit(X_train, y_train)

    preds = model.predict(X_test)

    print(f"Accuracy score for {name} : {accuracy_score(y_test, preds)}")

Accuracy score for SVC : 0.11745513866231648
Accuracy score for Naive Bayes : 0.14763458401305057
Accuracy score for Decision Tree : 0.38580750407830344
Accuracy score for SGD : 0.09216965742251224
Accuracy score for MNB : 0.11256117455138662


In [79]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

for i in range(1, X2.shape[1]):

    print("Currently feature", i)
    X2_new = SelectKBest(chi2, k=i).fit_transform(X2, Y2)

    X_train, X_test, y_train, y_test = train_test_split(X2_new, Y2, random_state=50, test_size=0.2)

    from sklearn import svm
    from sklearn.naive_bayes import GaussianNB
    from sklearn import tree
    from sklearn.linear_model import SGDClassifier

    models = {
    "SVC": svm.SVC(decision_function_shape='ovo'),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "SGD": SGDClassifier(loss="hinge", penalty="l2", max_iter=100),
    "MNB": MultinomialNB(),
}

    for name, model in models.items():

        model.fit(X_train, y_train)

        preds = model.predict(X_test)

        print(f"Accuracy score for {name} : {accuracy_score(y_test, preds)}")

Currently feature 1
Accuracy score for SVC : 0.10603588907014681
Accuracy score for Naive Bayes : 0.11500815660685156
Accuracy score for Decision Tree : 0.23735725938009788
Accuracy score for SGD : 0.09461663947797716
Accuracy score for MNB : 0.09135399673735727
Currently feature 2
Accuracy score for SVC : 0.11908646003262642
Accuracy score for Naive Bayes : 0.16884176182707994
Accuracy score for Decision Tree : 0.3735725938009788
Accuracy score for SGD : 0.07830342577487764
Accuracy score for MNB : 0.1068515497553018
Currently feature 3
Accuracy score for SVC : 0.11582381729200653
Accuracy score for Naive Bayes : 0.1362153344208809
Accuracy score for Decision Tree : 0.37520391517128876
Accuracy score for SGD : 0.10766721044045677
Accuracy score for MNB : 0.11092985318107668
Currently feature 4
Accuracy score for SVC : 0.11827079934747145
Accuracy score for Naive Bayes : 0.12153344208809136
Accuracy score for Decision Tree : 0.3825448613376835
Accuracy score for SGD : 0.111745513866231

In [62]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, preds)

array([[37, 16,  6,  5,  5,  2,  0,  1,  2,  0],
       [25, 51, 14, 16, 11,  7,  5,  4,  2,  2],
       [ 8, 17, 51, 17,  5,  6,  4,  3,  4,  3],
       [13,  9,  9, 42,  9, 15, 16,  8,  9,  3],
       [ 2,  2, 14, 22, 41,  3,  3,  8, 12,  5],
       [ 5,  5,  8, 11,  9, 53, 10,  5, 18, 15],
       [ 1,  3,  8, 10, 11,  7, 59, 14,  8,  8],
       [ 4,  4,  7, 14,  7, 14, 13, 56, 12, 10],
       [ 1,  8,  7, 10, 16,  6,  9, 14, 49,  7],
       [ 1,  0,  1,  4,  7,  4,  9, 14,  9, 67]])

In [66]:
"""
import requests
import os
from tqdm import tqdm

def getTotalPRCount(token, reponame):
        try:

            headers = {
                'Authorization': f'token {token}',
                'accept': 'application/vnd.github.v3+json'
            }

            url = f'https://api.github.com/search/issues?q=repo:{reponame}+type:pr' 

            response = requests.get(url, headers=headers)

            return response.json()['total_count'], None
        except Exception as e:
            return 0, e
        
pr_counts = []

for name in tqdm(df2["name"].values):
    cur_pr_count = getTotalPRCount(token, name)
    pr_counts.append(cur_pr_count)
    
"""