In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import plotly.express as px
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [None]:
data=pd.read_csv('Paddle_2021_loc.csv')
data=data[data['actor_login']!='paddle-bot[bot]'].fillna(0)
def categorize(row):
    return 1 if row['pull_merged']!=0 else 0
data['contributer']=data.apply(lambda row:categorize(row),axis=1)

In [None]:
data1=pd.read_csv('Paddle_actors_17-21.csv')
data1.head()

In [None]:
data1=data1.drop(columns=['Unnamed: 0','activity'])

In [None]:
data1.head()

In [None]:
data1=data1.drop(columns=['month'])

In [None]:
data1.hist(bins=30, figsize=(20,15),color='#A50021')

In [None]:
sns.displot(data1['contributor_num'],kde=True,bins=30,rug=True,color='#A50021')

In [None]:
sns.displot(data1['actor_num'],kde=True,bins=30,rug=True,color='#A50021')

In [None]:
hist_data = [data1['contributor_num']]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)

fig.show()

In [None]:
fig=px.scatter(data1,y='contributor_num',x='open_issue',size='actor_num')
fig.update_layout(showlegend=True)
fig.show()

In [None]:
sns.jointplot(x="contributor_num",y="actor_num",data=data1,kind="hex",color='#A50021',ratio=4,space=0,height=8,marginal_kws={'bins':10,'kde':True})

In [None]:
data2=data1
sns.pairplot(data2)

In [None]:
data2.corr()

In [None]:
f,ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data2.corr(), annot=True, linecolor='white',linewidths=0.1,cmap="RdBu", fmt= '.1f',ax=ax)

In [None]:
data1['Centralization']=data1['contributor_num']/data1['actor_num']
data1.head(10)

In [None]:
X1=data1.drop(columns=['actor_num','contributor_num','Centralization'])
y1=data1['Centralization']

In [None]:
y1

In [None]:
x1_train,x1_test,y1_train,y1_test = train_test_split(X1,y1,test_size=0.2)

model1=xgb.XGBRegressor(max_depth=15,learning_rate= 0.1,subsample=0.5)
model1.fit(x1_train,y1_train)

In [None]:
y1_pred=model1.predict(x1_test)
y1_pred

In [None]:
y1_test

In [172]:
abs_error=metrics.mean_absolute_error(y1_pred,y1_test)
abs_error

0.005584051285284322

In [173]:
RMSE=np.sqrt(metrics.mean_squared_error(y1_test,y1_pred))
RMSE

0.010257772374569311

In [174]:
from sklearn.metrics import explained_variance_score,r2_score
explained_variance_score(y1_test,y1_pred)


0.9691887530554696

In [None]:
fig, ax1 = plt.subplots(figsize=(8,5))
xgb.plot_importance(model1, ax=ax1)

##### PaddlePaddle/Models

In [None]:
paddleModel=pd.read_csv('PaddleModels_17-21.csv')
antDesign=pd.read_csv('ant-design.csv')
echarts=pd.read_csv('apache_echarts.csv')
apollo=pd.read_csv('ApolloAuto.csv')
NervJS=pd.read_csv('NervJS_taro.csv')
tvm=pd.read_csv('apache_tvm.csv')

In [None]:
paddleModel.drop(columns=['Unnamed: 0','month','activity'],inplace=True)
antDesign.drop(columns=['Unnamed: 0','month','activity'],inplace=True)
echarts.drop(columns=['Unnamed: 0','month','activity'],inplace=True)
apollo.drop(columns=['Unnamed: 0','month','activity'],inplace=True)
NervJS.drop(columns=['Unnamed: 0','month','activity'],inplace=True)
tvm.drop(columns=['Unnamed: 0','month','activity'],inplace=True)

In [None]:
data2=data2.drop(columns=['Centralization'])
Data=pd.concat([data2,paddleModel,antDesign,echarts,apollo,NervJS,tvm])
Data

In [None]:
Data['Centralization']=Data['contributor_num']/Data['actor_num']
Data.head(10)

In [175]:
X2=Data.drop(columns=['actor_num','contributor_num'])
y2=Data['Centralization']

x2_train,x2_test,y2_train,y2_test = train_test_split(X2,y2,test_size=0.2)

model2=xgb.XGBRegressor(max_depth=15,learning_rate= 0.1,subsample=0.5)
model2.fit(x2_train,y2_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=15,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [176]:
y2_pred=model2.predict(x2_test)

In [177]:
abs_error=metrics.mean_absolute_error(y2_pred,y2_test)
abs_error

0.0015791919422017636

In [178]:
RMSE=np.sqrt(metrics.mean_absolute_error(y2_test,y2_pred))
RMSE

0.039739048078706714

In [179]:
explained_variance_score(y2_test,y2_pred)

0.9993514673762636

In [None]:
score=[]
for Max_depth in range(1,31):
    model2=xgb.XGBRegressor(max_depth=Max_depth, gamma=0.3, learning_rate= 0.2)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for i in range(1,11):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=i, gamma=0.3, learning_rate= 0.2)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for gamma in np.arange(0,1,0.1):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=gamma, learning_rate= 0.2)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score

In [None]:
score=[]
for learning_rate in np.arange(0,0.5,0.01):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=0, learning_rate=learning_rate)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for alpha in np.arange(0,0.1,0.01):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=0, learning_rate=0.06,reg_alpha=alpha)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for reg_lambda in np.arange(0,1,0.1):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=0, learning_rate=0.06,reg_alpha=0,reg_lambda=reg_lambda)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for subsample in np.arange(0.5,1,0.1):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=0, learning_rate=0.06,reg_alpha=0,reg_lambda=1,subsample=subsample)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)

In [None]:
score=[]
for n in np.arange(50,200,10):
    model2=xgb.XGBRegressor(max_depth=6,min_child_weight=1, gamma=0, learning_rate=0.06,reg_alpha=0,reg_lambda=1,subsample=0.9,n_estimators=n)
    model2.fit(x2_train,y2_train)
    y2_pred=model2.predict(x2_test)
    score.append(explained_variance_score(y2_test,y2_pred))

s=max(score)
j=score.index(s)
print(s)
print(j)